src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "process-util.h"
  35 //#include "special.h"
  36 #include "stdio-util.h"
  37 #include "string-table.h"
  38 #include "string-util.h"
  39
/* Fixed period (100 * USEC_PER_MSEC, as usec_t) that this file writes next to
 * CPU quotas, i.e. as the second field of cpu.max and as cpu.cfs_period_us. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  41
  42 #if 0 /// UNNEEDED by elogind
  43 static void cgroup_compat_warn(void) {
  44         static bool cgroup_compat_warned = false;
  45
  46         if (cgroup_compat_warned)
  47                 return;
  48
  49         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  50         cgroup_compat_warned = true;
  51 }
  52
/* Logs a "cgroup-compat: "-prefixed debug message for the given unit, after
 * calling cgroup_compat_warn() so that the one-time warning is triggered.
 * Wrapped in do { } while (false) so that it can be used as a single statement. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
  57
  58 void cgroup_context_init(CGroupContext *c) {
  59         assert(c);
  60
  61         /* Initialize everything to the kernel defaults, assuming the
  62          * structure is preinitialized to 0 */
  63
  64         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  65         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  66         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  67
  68         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  69         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  70
  71         c->memory_high = CGROUP_LIMIT_MAX;
  72         c->memory_max = CGROUP_LIMIT_MAX;
  73         c->memory_swap_max = CGROUP_LIMIT_MAX;
  74
  75         c->memory_limit = CGROUP_LIMIT_MAX;
  76
  77         c->io_weight = CGROUP_WEIGHT_INVALID;
  78         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  79
  80         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  81         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  82
  83         c->tasks_max = (uint64_t) -1;
  84 }
  85
  86 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  87         assert(c);
  88         assert(a);
  89
  90         LIST_REMOVE(device_allow, c->device_allow, a);
  91         free(a->path);
  92         free(a);
  93 }
  94
  95 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
  96         assert(c);
  97         assert(w);
  98
  99         LIST_REMOVE(device_weights, c->io_device_weights, w);
 100         free(w->path);
 101         free(w);
 102 }
 103
 104 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 105         assert(c);
 106         assert(l);
 107
 108         LIST_REMOVE(device_limits, c->io_device_limits, l);
 109         free(l->path);
 110         free(l);
 111 }
 112
 113 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 114         assert(c);
 115         assert(w);
 116
 117         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 118         free(w->path);
 119         free(w);
 120 }
 121
 122 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 123         assert(c);
 124         assert(b);
 125
 126         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 127         free(b->path);
 128         free(b);
 129 }
 130
 131 void cgroup_context_done(CGroupContext *c) {
 132         assert(c);
 133
 134         while (c->io_device_weights)
 135                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 136
 137         while (c->io_device_limits)
 138                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 139
 140         while (c->blockio_device_weights)
 141                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 142
 143         while (c->blockio_device_bandwidths)
 144                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 145
 146         while (c->device_allow)
 147                 cgroup_context_free_device_allow(c, c->device_allow);
 148
 149         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 150         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 151 }
 152
 153 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 154         CGroupIODeviceLimit *il;
 155         CGroupIODeviceWeight *iw;
 156         CGroupBlockIODeviceBandwidth *b;
 157         CGroupBlockIODeviceWeight *w;
 158         CGroupDeviceAllow *a;
 159         IPAddressAccessItem *iaai;
 160         char u[FORMAT_TIMESPAN_MAX];
 161
 162         assert(c);
 163         assert(f);
 164
 165         prefix = strempty(prefix);
 166
 167         fprintf(f,
 168                 "%sCPUAccounting=%s\n"
 169                 "%sIOAccounting=%s\n"
 170                 "%sBlockIOAccounting=%s\n"
 171                 "%sMemoryAccounting=%s\n"
 172                 "%sTasksAccounting=%s\n"
 173                 "%sIPAccounting=%s\n"
 174                 "%sCPUWeight=%" PRIu64 "\n"
 175                 "%sStartupCPUWeight=%" PRIu64 "\n"
 176                 "%sCPUShares=%" PRIu64 "\n"
 177                 "%sStartupCPUShares=%" PRIu64 "\n"
 178                 "%sCPUQuotaPerSecSec=%s\n"
 179                 "%sIOWeight=%" PRIu64 "\n"
 180                 "%sStartupIOWeight=%" PRIu64 "\n"
 181                 "%sBlockIOWeight=%" PRIu64 "\n"
 182                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 183                 "%sMemoryLow=%" PRIu64 "\n"
 184                 "%sMemoryHigh=%" PRIu64 "\n"
 185                 "%sMemoryMax=%" PRIu64 "\n"
 186                 "%sMemorySwapMax=%" PRIu64 "\n"
 187                 "%sMemoryLimit=%" PRIu64 "\n"
 188                 "%sTasksMax=%" PRIu64 "\n"
 189                 "%sDevicePolicy=%s\n"
 190                 "%sDelegate=%s\n",
 191                 prefix, yes_no(c->cpu_accounting),
 192                 prefix, yes_no(c->io_accounting),
 193                 prefix, yes_no(c->blockio_accounting),
 194                 prefix, yes_no(c->memory_accounting),
 195                 prefix, yes_no(c->tasks_accounting),
 196                 prefix, yes_no(c->ip_accounting),
 197                 prefix, c->cpu_weight,
 198                 prefix, c->startup_cpu_weight,
 199                 prefix, c->cpu_shares,
 200                 prefix, c->startup_cpu_shares,
 201                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 202                 prefix, c->io_weight,
 203                 prefix, c->startup_io_weight,
 204                 prefix, c->blockio_weight,
 205                 prefix, c->startup_blockio_weight,
 206                 prefix, c->memory_low,
 207                 prefix, c->memory_high,
 208                 prefix, c->memory_max,
 209                 prefix, c->memory_swap_max,
 210                 prefix, c->memory_limit,
 211                 prefix, c->tasks_max,
 212                 prefix, cgroup_device_policy_to_string(c->device_policy),
 213                 prefix, yes_no(c->delegate));
 214
 215         if (c->delegate) {
 216                 _cleanup_free_ char *t = NULL;
 217
 218                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 219
 220                 fprintf(f, "%sDelegateControllers=%s\n",
 221                         prefix,
 222                         strempty(t));
 223         }
 224
 225         LIST_FOREACH(device_allow, a, c->device_allow)
 226                 fprintf(f,
 227                         "%sDeviceAllow=%s %s%s%s\n",
 228                         prefix,
 229                         a->path,
 230                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 231
 232         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 233                 fprintf(f,
 234                         "%sIODeviceWeight=%s %" PRIu64,
 235                         prefix,
 236                         iw->path,
 237                         iw->weight);
 238
 239         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 240                 char buf[FORMAT_BYTES_MAX];
 241                 CGroupIOLimitType type;
 242
 243                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 244                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 245                                 fprintf(f,
 246                                         "%s%s=%s %s\n",
 247                                         prefix,
 248                                         cgroup_io_limit_type_to_string(type),
 249                                         il->path,
 250                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 251         }
 252
 253         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 254                 fprintf(f,
 255                         "%sBlockIODeviceWeight=%s %" PRIu64,
 256                         prefix,
 257                         w->path,
 258                         w->weight);
 259
 260         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 261                 char buf[FORMAT_BYTES_MAX];
 262
 263                 if (b->rbps != CGROUP_LIMIT_MAX)
 264                         fprintf(f,
 265                                 "%sBlockIOReadBandwidth=%s %s\n",
 266                                 prefix,
 267                                 b->path,
 268                                 format_bytes(buf, sizeof(buf), b->rbps));
 269                 if (b->wbps != CGROUP_LIMIT_MAX)
 270                         fprintf(f,
 271                                 "%sBlockIOWriteBandwidth=%s %s\n",
 272                                 prefix,
 273                                 b->path,
 274                                 format_bytes(buf, sizeof(buf), b->wbps));
 275         }
 276
 277         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 278                 _cleanup_free_ char *k = NULL;
 279
 280                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 281                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 282         }
 283
 284         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 285                 _cleanup_free_ char *k = NULL;
 286
 287                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 288                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 289         }
 290 }
 291
 292 static int lookup_block_device(const char *p, dev_t *dev) {
 293         struct stat st;
 294         int r;
 295
 296         assert(p);
 297         assert(dev);
 298
 299         r = stat(p, &st);
 300         if (r < 0)
 301                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 302
 303         if (S_ISBLK(st.st_mode))
 304                 *dev = st.st_rdev;
 305         else if (major(st.st_dev) != 0) {
 306                 /* If this is not a device node then find the block
 307                  * device this file is stored on */
 308                 *dev = st.st_dev;
 309
 310                 /* If this is a partition, try to get the originating
 311                  * block device */
 312                 (void) block_get_whole_disk(*dev, dev);
 313         } else {
 314                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 315                 return -ENODEV;
 316         }
 317
 318         return 0;
 319 }
 320
 321 static int whitelist_device(const char *path, const char *node, const char *acc) {
 322         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 323         struct stat st;
 324         bool ignore_notfound;
 325         int r;
 326
 327         assert(path);
 328         assert(acc);
 329
 330         if (node[0] == '-') {
 331                 /* Non-existent paths starting with "-" must be silently ignored */
 332                 node++;
 333                 ignore_notfound = true;
 334         } else
 335                 ignore_notfound = false;
 336
 337         if (stat(node, &st) < 0) {
 338                 if (errno == ENOENT && ignore_notfound)
 339                         return 0;
 340
 341                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 342         }
 343
 344         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 345                 log_warning("%s is not a device.", node);
 346                 return -ENODEV;
 347         }
 348
 349         sprintf(buf,
 350                 "%c %u:%u %s",
 351                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 352                 major(st.st_rdev), minor(st.st_rdev),
 353                 acc);
 354
 355         r = cg_set_attribute("devices", path, "devices.allow", buf);
 356         if (r < 0)
 357                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 358                                "Failed to set devices.allow on %s: %m", path);
 359
 360         return r;
 361 }
 362
 363 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 364         _cleanup_fclose_ FILE *f = NULL;
 365         char line[LINE_MAX];
 366         bool good = false;
 367         int r;
 368
 369         assert(path);
 370         assert(acc);
 371         assert(IN_SET(type, 'b', 'c'));
 372
 373         f = fopen("/proc/devices", "re");
 374         if (!f)
 375                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 376
 377         FOREACH_LINE(line, f, goto fail) {
 378                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 379                 unsigned maj;
 380
 381                 truncate_nl(line);
 382
 383                 if (type == 'c' && streq(line, "Character devices:")) {
 384                         good = true;
 385                         continue;
 386                 }
 387
 388                 if (type == 'b' && streq(line, "Block devices:")) {
 389                         good = true;
 390                         continue;
 391                 }
 392
 393                 if (isempty(line)) {
 394                         good = false;
 395                         continue;
 396                 }
 397
 398                 if (!good)
 399                         continue;
 400
 401                 p = strstrip(line);
 402
 403                 w = strpbrk(p, WHITESPACE);
 404                 if (!w)
 405                         continue;
 406                 *w = 0;
 407
 408                 r = safe_atou(p, &maj);
 409                 if (r < 0)
 410                         continue;
 411                 if (maj <= 0)
 412                         continue;
 413
 414                 w++;
 415                 w += strspn(w, WHITESPACE);
 416
 417                 if (fnmatch(name, w, 0) != 0)
 418                         continue;
 419
 420                 sprintf(buf,
 421                         "%c %u:* %s",
 422                         type,
 423                         maj,
 424                         acc);
 425
 426                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 427                 if (r < 0)
 428                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 429                                        "Failed to set devices.allow on %s: %m", path);
 430         }
 431
 432         return 0;
 433
 434 fail:
 435         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 436 }
 437
 438 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 439         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 440                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 441 }
 442
 443 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 444         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 445                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 446 }
 447
 448 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 449         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 450             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 451                 return c->startup_cpu_weight;
 452         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 453                 return c->cpu_weight;
 454         else
 455                 return CGROUP_WEIGHT_DEFAULT;
 456 }
 457
 458 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 459         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 460             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 461                 return c->startup_cpu_shares;
 462         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 463                 return c->cpu_shares;
 464         else
 465                 return CGROUP_CPU_SHARES_DEFAULT;
 466 }
 467
 468 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 469         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 470         int r;
 471
 472         xsprintf(buf, "%" PRIu64 "\n", weight);
 473         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 474         if (r < 0)
 475                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 476                               "Failed to set cpu.weight: %m");
 477
 478         if (quota != USEC_INFINITY)
 479                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 480                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 481         else
 482                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 483
 484         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 485
 486         if (r < 0)
 487                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 488                               "Failed to set cpu.max: %m");
 489 }
 490
 491 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 492         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 493         int r;
 494
 495         xsprintf(buf, "%" PRIu64 "\n", shares);
 496         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 497         if (r < 0)
 498                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 499                               "Failed to set cpu.shares: %m");
 500
 501         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 502         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 503         if (r < 0)
 504                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 505                               "Failed to set cpu.cfs_period_us: %m");
 506
 507         if (quota != USEC_INFINITY) {
 508                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 509                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 510         } else
 511                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 512         if (r < 0)
 513                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 514                               "Failed to set cpu.cfs_quota_us: %m");
 515 }
 516
 517 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 518         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 519                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 520 }
 521
 522 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 523         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 524                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 525 }
 526
 527 static bool cgroup_context_has_io_config(CGroupContext *c) {
 528         return c->io_accounting ||
 529                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 530                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 531                 c->io_device_weights ||
 532                 c->io_device_limits;
 533 }
 534
 535 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 536         return c->blockio_accounting ||
 537                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 538                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 539                 c->blockio_device_weights ||
 540                 c->blockio_device_bandwidths;
 541 }
 542
 543 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 544         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 545             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 546                 return c->startup_io_weight;
 547         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 548                 return c->io_weight;
 549         else
 550                 return CGROUP_WEIGHT_DEFAULT;
 551 }
 552
 553 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 554         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 555             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 556                 return c->startup_blockio_weight;
 557         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 558                 return c->blockio_weight;
 559         else
 560                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 561 }
 562
 563 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 564         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 565                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 566 }
 567
 568 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 569         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 570                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 571 }
 572
 573 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 574         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 575         dev_t dev;
 576         int r;
 577
 578         r = lookup_block_device(dev_path, &dev);
 579         if (r < 0)
 580                 return;
 581
 582         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 583         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 584         if (r < 0)
 585                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 586                               "Failed to set io.weight: %m");
 587 }
 588
 589 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 590         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 591         dev_t dev;
 592         int r;
 593
 594         r = lookup_block_device(dev_path, &dev);
 595         if (r < 0)
 596                 return;
 597
 598         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 599         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 600         if (r < 0)
 601                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 602                               "Failed to set blkio.weight_device: %m");
 603 }
 604
 605 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 606         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 607         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 608         CGroupIOLimitType type;
 609         dev_t dev;
 610         unsigned n = 0;
 611         int r;
 612
 613         r = lookup_block_device(dev_path, &dev);
 614         if (r < 0)
 615                 return 0;
 616
 617         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 618                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 619                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 620                         n++;
 621                 } else {
 622                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 623                 }
 624         }
 625
 626         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 627                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 628                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 629         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 630         if (r < 0)
 631                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 632                               "Failed to set io.max: %m");
 633         return n;
 634 }
 635
 636 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 637         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 638         dev_t dev;
 639         unsigned n = 0;
 640         int r;
 641
 642         r = lookup_block_device(dev_path, &dev);
 643         if (r < 0)
 644                 return 0;
 645
 646         if (rbps != CGROUP_LIMIT_MAX)
 647                 n++;
 648         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 649         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 650         if (r < 0)
 651                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 652                               "Failed to set blkio.throttle.read_bps_device: %m");
 653
 654         if (wbps != CGROUP_LIMIT_MAX)
 655                 n++;
 656         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 657         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 658         if (r < 0)
 659                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 660                               "Failed to set blkio.throttle.write_bps_device: %m");
 661
 662         return n;
 663 }
 664
 665 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 666         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 667 }
 668
 669 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 670         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 671         int r;
 672
 673         if (v != CGROUP_LIMIT_MAX)
 674                 xsprintf(buf, "%" PRIu64 "\n", v);
 675
 676         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 677         if (r < 0)
 678                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 679                               "Failed to set %s: %m", file);
 680 }
 681
 682 static void cgroup_apply_firewall(Unit *u) {
 683         int r;
 684
 685         assert(u);
 686
 687         if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
 688                                     * not recursive we don't ever touch the bpf on them */
 689                 return;
 690
 691         r = bpf_firewall_compile(u);
 692         if (r < 0)
 693                 return;
 694
 695         (void) bpf_firewall_install(u);
 696         return;
 697 }
 698
 699 static void cgroup_context_apply(
 700                 Unit *u,
 701                 CGroupMask apply_mask,
 702                 bool apply_bpf,
 703                 ManagerState state) {
 704
 705         const char *path;
 706         CGroupContext *c;
 707         bool is_root;
 708         int r;
 709
 710         assert(u);
 711
 712         c = unit_get_cgroup_context(u);
 713         path = u->cgroup_path;
 714
 715         assert(c);
 716         assert(path);
 717
 718         /* Nothing to do? Exit early! */
 719         if (apply_mask == 0 && !apply_bpf)
 720                 return;
 721
 722         /* Some cgroup attributes are not supported on the root cgroup,
 723          * hence silently ignore */
 724         is_root = isempty(path) || path_equal(path, "/");
 725         if (is_root)
 726                 /* Make sure we don't try to display messages with an empty path. */
 727                 path = "/";
 728
 729         /* We generally ignore errors caused by read-only mounted
 730          * cgroup trees (assuming we are running in a container then),
 731          * and missing cgroups, i.e. EROFS and ENOENT. */
 732
 733         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 734                 bool has_weight, has_shares;
 735
 736                 has_weight = cgroup_context_has_cpu_weight(c);
 737                 has_shares = cgroup_context_has_cpu_shares(c);
 738
 739                 if (cg_all_unified() > 0) {
 740                         uint64_t weight;
 741
 742                         if (has_weight)
 743                                 weight = cgroup_context_cpu_weight(c, state);
 744                         else if (has_shares) {
 745                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 746
 747                                 weight = cgroup_cpu_shares_to_weight(shares);
 748
 749                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 750                                                   shares, weight, path);
 751                         } else
 752                                 weight = CGROUP_WEIGHT_DEFAULT;
 753
 754                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 755                 } else {
 756                         uint64_t shares;
 757
 758                         if (has_weight) {
 759                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 760
 761                                 shares = cgroup_cpu_weight_to_shares(weight);
 762
 763                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 764                                                   weight, shares, path);
 765                         } else if (has_shares)
 766                                 shares = cgroup_context_cpu_shares(c, state);
 767                         else
 768                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 769
 770                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 771                 }
 772         }
 773
 774         if (apply_mask & CGROUP_MASK_IO) {
 775                 bool has_io = cgroup_context_has_io_config(c);
 776                 bool has_blockio = cgroup_context_has_blockio_config(c);
 777
 778                 if (!is_root) {
 779                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 780                         uint64_t weight;
 781
 782                         if (has_io)
 783                                 weight = cgroup_context_io_weight(c, state);
 784                         else if (has_blockio) {
 785                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 786
 787                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 788
 789                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 790                                                   blkio_weight, weight);
 791                         } else
 792                                 weight = CGROUP_WEIGHT_DEFAULT;
 793
 794                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 795                         r = cg_set_attribute("io", path, "io.weight", buf);
 796                         if (r < 0)
 797                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 798                                               "Failed to set io.weight: %m");
 799
 800                         if (has_io) {
 801                                 CGroupIODeviceWeight *w;
 802
 803                                 /* FIXME: no way to reset this list */
 804                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 805                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 806                         } else if (has_blockio) {
 807                                 CGroupBlockIODeviceWeight *w;
 808
 809                                 /* FIXME: no way to reset this list */
 810                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 811                                         weight = cgroup_weight_blkio_to_io(w->weight);
 812
 813                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 814                                                           w->weight, weight, w->path);
 815
 816                                         cgroup_apply_io_device_weight(u, w->path, weight);
 817                                 }
 818                         }
 819                 }
 820
 821                 /* Apply limits and free ones without config. */
 822                 if (has_io) {
 823                         CGroupIODeviceLimit *l, *next;
 824
 825                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 826                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 827                                         cgroup_context_free_io_device_limit(c, l);
 828                         }
 829                 } else if (has_blockio) {
 830                         CGroupBlockIODeviceBandwidth *b, *next;
 831
 832                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 833                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 834                                 CGroupIOLimitType type;
 835
 836                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 837                                         limits[type] = cgroup_io_limit_defaults[type];
 838
 839                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 840                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 841
 842                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 843                                                   b->rbps, b->wbps, b->path);
 844
 845                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 846                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 847                         }
 848                 }
 849         }
 850
 851         if (apply_mask & CGROUP_MASK_BLKIO) {
 852                 bool has_io = cgroup_context_has_io_config(c);
 853                 bool has_blockio = cgroup_context_has_blockio_config(c);
 854
 855                 if (!is_root) {
 856                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 857                         uint64_t weight;
 858
 859                         if (has_io) {
 860                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 861
 862                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 863
 864                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 865                                                   io_weight, weight);
 866                         } else if (has_blockio)
 867                                 weight = cgroup_context_blkio_weight(c, state);
 868                         else
 869                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 870
 871                         xsprintf(buf, "%" PRIu64 "\n", weight);
 872                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 873                         if (r < 0)
 874                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 875                                               "Failed to set blkio.weight: %m");
 876
 877                         if (has_io) {
 878                                 CGroupIODeviceWeight *w;
 879
 880                                 /* FIXME: no way to reset this list */
 881                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 882                                         weight = cgroup_weight_io_to_blkio(w->weight);
 883
 884                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 885                                                           w->weight, weight, w->path);
 886
 887                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 888                                 }
 889                         } else if (has_blockio) {
 890                                 CGroupBlockIODeviceWeight *w;
 891
 892                                 /* FIXME: no way to reset this list */
 893                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 894                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 895                         }
 896                 }
 897
 898                 /* Apply limits and free ones without config. */
 899                 if (has_io) {
 900                         CGroupIODeviceLimit *l, *next;
 901
 902                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 903                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 904                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 905
 906                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 907                                         cgroup_context_free_io_device_limit(c, l);
 908                         }
 909                 } else if (has_blockio) {
 910                         CGroupBlockIODeviceBandwidth *b, *next;
 911
 912                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 913                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 914                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 915                 }
 916         }
 917
 918         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 919                 if (cg_all_unified() > 0) {
 920                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 921
 922                         if (cgroup_context_has_unified_memory_config(c)) {
 923                                 max = c->memory_max;
 924                                 swap_max = c->memory_swap_max;
 925                         } else {
 926                                 max = c->memory_limit;
 927
 928                                 if (max != CGROUP_LIMIT_MAX)
 929                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 930                         }
 931
 932                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 933                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 934                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 935                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 936                 } else {
 937                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 938                         uint64_t val;
 939
 940                         if (cgroup_context_has_unified_memory_config(c)) {
 941                                 val = c->memory_max;
 942                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 943                         } else
 944                                 val = c->memory_limit;
 945
 946                         if (val == CGROUP_LIMIT_MAX)
 947                                 strncpy(buf, "-1\n", sizeof(buf));
 948                         else
 949                                 xsprintf(buf, "%" PRIu64 "\n", val);
 950
 951                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 952                         if (r < 0)
 953                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 954                                               "Failed to set memory.limit_in_bytes: %m");
 955                 }
 956         }
 957
 958         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 959                 CGroupDeviceAllow *a;
 960
 961                 /* Changing the devices list of a populated cgroup
 962                  * might result in EINVAL, hence ignore EINVAL
 963                  * here. */
 964
 965                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 966                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 967                 else
 968                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 969                 if (r < 0)
 970                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 971                                       "Failed to reset devices.list: %m");
 972
 973                 if (c->device_policy == CGROUP_CLOSED ||
 974                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 975                         static const char auto_devices[] =
 976                                 "/dev/null\0" "rwm\0"
 977                                 "/dev/zero\0" "rwm\0"
 978                                 "/dev/full\0" "rwm\0"
 979                                 "/dev/random\0" "rwm\0"
 980                                 "/dev/urandom\0" "rwm\0"
 981                                 "/dev/tty\0" "rwm\0"
 982                                 "/dev/ptmx\0" "rwm\0"
 983                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 984                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
 985                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
 986
 987                         const char *x, *y;
 988
 989                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 990                                 whitelist_device(path, x, y);
 991
 992                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
 993                         whitelist_major(path, "pts", 'c', "rw");
 994                 }
 995
 996                 LIST_FOREACH(device_allow, a, c->device_allow) {
 997                         char acc[4], *val;
 998                         unsigned k = 0;
 999
1000                         if (a->r)
1001                                 acc[k++] = 'r';
1002                         if (a->w)
1003                                 acc[k++] = 'w';
1004                         if (a->m)
1005                                 acc[k++] = 'm';
1006
1007                         if (k == 0)
1008                                 continue;
1009
1010                         acc[k++] = 0;
1011
1012                         if (path_startswith(a->path, "/dev/"))
1013                                 whitelist_device(path, a->path, acc);
1014                         else if ((val = startswith(a->path, "block-")))
1015                                 whitelist_major(path, val, 'b', acc);
1016                         else if ((val = startswith(a->path, "char-")))
1017                                 whitelist_major(path, val, 'c', acc);
1018                         else
1019                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1020                 }
1021         }
1022
1023         if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1024
1025                 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1026                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1027
1028                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1029                         r = cg_set_attribute("pids", path, "pids.max", buf);
1030                 } else
1031                         r = cg_set_attribute("pids", path, "pids.max", "max");
1032
1033                 if (r < 0)
1034                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1035                                       "Failed to set pids.max: %m");
1036         }
1037
1038         if (apply_bpf)
1039                 cgroup_apply_firewall(u);
1040 }
1041
1042 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1043         CGroupMask mask = 0;
1044
1045         /* Figure out which controllers we need */
1046
1047         if (c->cpu_accounting ||
1048             cgroup_context_has_cpu_weight(c) ||
1049             cgroup_context_has_cpu_shares(c) ||
1050             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1051                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1052
1053         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1054                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1055
1056         if (c->memory_accounting ||
1057             c->memory_limit != CGROUP_LIMIT_MAX ||
1058             cgroup_context_has_unified_memory_config(c))
1059                 mask |= CGROUP_MASK_MEMORY;
1060
1061         if (c->device_allow ||
1062             c->device_policy != CGROUP_AUTO)
1063                 mask |= CGROUP_MASK_DEVICES;
1064
1065         if (c->tasks_accounting ||
1066             c->tasks_max != (uint64_t) -1)
1067                 mask |= CGROUP_MASK_PIDS;
1068
1069         return mask;
1070 }
1071
1072 CGroupMask unit_get_own_mask(Unit *u) {
1073         CGroupContext *c;
1074
1075         /* Returns the mask of controllers the unit needs for itself */
1076
1077         c = unit_get_cgroup_context(u);
1078         if (!c)
1079                 return 0;
1080
1081         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1082 }
1083
1084 CGroupMask unit_get_delegate_mask(Unit *u) {
1085         CGroupContext *c;
1086
1087         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1088          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1089          *
1090          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1091
1092         if (u->type == UNIT_SLICE)
1093                 return 0;
1094
1095         c = unit_get_cgroup_context(u);
1096         if (!c)
1097                 return 0;
1098
1099         if (!c->delegate)
1100                 return 0;
1101
1102         if (cg_all_unified() <= 0) {
1103                 ExecContext *e;
1104
1105                 e = unit_get_exec_context(u);
1106                 if (e && !exec_context_maintains_privileges(e))
1107                         return 0;
1108         }
1109
1110         return c->delegate_controllers;
1111 }
1112
1113 CGroupMask unit_get_members_mask(Unit *u) {
1114         assert(u);
1115
1116         /* Returns the mask of controllers all of the unit's children require, merged */
1117
1118         if (u->cgroup_members_mask_valid)
1119                 return u->cgroup_members_mask;
1120
1121         u->cgroup_members_mask = 0;
1122
1123         if (u->type == UNIT_SLICE) {
1124                 void *v;
1125                 Unit *member;
1126                 Iterator i;
1127
1128                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1129
1130                         if (member == u)
1131                                 continue;
1132
1133                         if (UNIT_DEREF(member->slice) != u)
1134                                 continue;
1135
1136                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1137                 }
1138         }
1139
1140         u->cgroup_members_mask_valid = true;
1141         return u->cgroup_members_mask;
1142 }
1143
1144 CGroupMask unit_get_siblings_mask(Unit *u) {
1145         assert(u);
1146
1147         /* Returns the mask of controllers all of the unit's siblings
1148          * require, i.e. the members mask of the unit's parent slice
1149          * if there is one. */
1150
1151         if (UNIT_ISSET(u->slice))
1152                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1153
1154         return unit_get_subtree_mask(u); /* we are the top-level slice */
1155 }
1156
1157 CGroupMask unit_get_subtree_mask(Unit *u) {
1158
1159         /* Returns the mask of this subtree, meaning of the group
1160          * itself and its children. */
1161
1162         return unit_get_own_mask(u) | unit_get_members_mask(u);
1163 }
1164
1165 CGroupMask unit_get_target_mask(Unit *u) {
1166         CGroupMask mask;
1167
1168         /* This returns the cgroup mask of all controllers to enable
1169          * for a specific cgroup, i.e. everything it needs itself,
1170          * plus all that its children need, plus all that its siblings
1171          * need. This is primarily useful on the legacy cgroup
1172          * hierarchy, where we need to duplicate each cgroup in each
1173          * hierarchy that shall be enabled for it. */
1174
1175         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1176         mask &= u->manager->cgroup_supported;
1177
1178         return mask;
1179 }
1180
1181 CGroupMask unit_get_enable_mask(Unit *u) {
1182         CGroupMask mask;
1183
1184         /* This returns the cgroup mask of all controllers to enable
1185          * for the children of a specific cgroup. This is primarily
1186          * useful for the unified cgroup hierarchy, where each cgroup
1187          * controls which controllers are enabled for its children. */
1188
1189         mask = unit_get_members_mask(u);
1190         mask &= u->manager->cgroup_supported;
1191
1192         return mask;
1193 }
1194
1195 bool unit_get_needs_bpf(Unit *u) {
1196         CGroupContext *c;
1197         Unit *p;
1198         assert(u);
1199
1200         /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1201          * moment. */
1202         if (u->type == UNIT_SLICE)
1203                 return false;
1204
1205         c = unit_get_cgroup_context(u);
1206         if (!c)
1207                 return false;
1208
1209         if (c->ip_accounting ||
1210             c->ip_address_allow ||
1211             c->ip_address_deny)
1212                 return true;
1213
1214         /* If any parent slice has an IP access list defined, it applies too */
1215         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1216                 c = unit_get_cgroup_context(p);
1217                 if (!c)
1218                         return false;
1219
1220                 if (c->ip_address_allow ||
1221                     c->ip_address_deny)
1222                         return true;
1223         }
1224
1225         return false;
1226 }
1227
1228 /* Recurse from a unit up through its containing slices, propagating
1229  * mask bits upward. A unit is also member of itself. */
1230 void unit_update_cgroup_members_masks(Unit *u) {
1231         CGroupMask m;
1232         bool more;
1233
1234         assert(u);
1235
1236         /* Calculate subtree mask */
1237         m = unit_get_subtree_mask(u);
1238
1239         /* See if anything changed from the previous invocation. If
1240          * not, we're done. */
1241         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1242                 return;
1243
1244         more =
1245                 u->cgroup_subtree_mask_valid &&
1246                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1247                 ((~m & u->cgroup_subtree_mask) == 0);
1248
1249         u->cgroup_subtree_mask = m;
1250         u->cgroup_subtree_mask_valid = true;
1251
1252         if (UNIT_ISSET(u->slice)) {
1253                 Unit *s = UNIT_DEREF(u->slice);
1254
1255                 if (more)
1256                         /* There's more set now than before. We
1257                          * propagate the new mask to the parent's mask
1258                          * (not caring if it actually was valid or
1259                          * not). */
1260
1261                         s->cgroup_members_mask |= m;
1262
1263                 else
1264                         /* There's less set now than before (or we
1265                          * don't know), we need to recalculate
1266                          * everything, so let's invalidate the
1267                          * parent's members mask */
1268
1269                         s->cgroup_members_mask_valid = false;
1270
1271                 /* And now make sure that this change also hits our
1272                  * grandparents */
1273                 unit_update_cgroup_members_masks(s);
1274         }
1275 }
1276
1277 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1278         Unit *u = userdata;
1279
1280         assert(mask != 0);
1281         assert(u);
1282
1283         while (u) {
1284                 if (u->cgroup_path &&
1285                     u->cgroup_realized &&
1286                     (u->cgroup_realized_mask & mask) == mask)
1287                         return u->cgroup_path;
1288
1289                 u = UNIT_DEREF(u->slice);
1290         }
1291
1292         return NULL;
1293 }
1294
1295 char *unit_default_cgroup_path(Unit *u) {
1296         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1297         int r;
1298
1299         assert(u);
1300
1301         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1302                 return strdup(u->manager->cgroup_root);
1303
1304         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1305                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1306                 if (r < 0)
1307                         return NULL;
1308         }
1309
1310         escaped = cg_escape(u->id);
1311         if (!escaped)
1312                 return NULL;
1313
1314         if (slice)
1315                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1316                                escaped);
1317         else
1318                 return strjoin(u->manager->cgroup_root, "/", escaped);
1319 }
1320
1321 int unit_set_cgroup_path(Unit *u, const char *path) {
1322         _cleanup_free_ char *p = NULL;
1323         int r;
1324
1325         assert(u);
1326
1327         if (path) {
1328                 p = strdup(path);
1329                 if (!p)
1330                         return -ENOMEM;
1331         } else
1332                 p = NULL;
1333
1334         if (streq_ptr(u->cgroup_path, p))
1335                 return 0;
1336
1337         if (p) {
1338                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1339                 if (r < 0)
1340                         return r;
1341         }
1342
1343         unit_release_cgroup(u);
1344
1345         u->cgroup_path = p;
1346         p = NULL;
1347
1348         return 1;
1349 }
1350
1351 int unit_watch_cgroup(Unit *u) {
1352         _cleanup_free_ char *events = NULL;
1353         int r;
1354
1355         assert(u);
1356
1357         if (!u->cgroup_path)
1358                 return 0;
1359
1360         if (u->cgroup_inotify_wd >= 0)
1361                 return 0;
1362
1363         /* Only applies to the unified hierarchy */
1364         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1365         if (r < 0)
1366                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1367         if (r == 0)
1368                 return 0;
1369
1370         /* Don't watch the root slice, it's pointless. */
1371         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1372                 return 0;
1373
1374         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1375         if (r < 0)
1376                 return log_oom();
1377
1378         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1379         if (r < 0)
1380                 return log_oom();
1381
1382         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1383         if (u->cgroup_inotify_wd < 0) {
1384
1385                 if (errno == ENOENT) /* If the directory is already
1386                                       * gone we don't need to track
1387                                       * it, so this is not an error */
1388                         return 0;
1389
1390                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1391         }
1392
1393         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1394         if (r < 0)
1395                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1396
1397         return 0;
1398 }
1399
1400 int unit_pick_cgroup_path(Unit *u) {
1401         _cleanup_free_ char *path = NULL;
1402         int r;
1403
1404         assert(u);
1405
1406         if (u->cgroup_path)
1407                 return 0;
1408
1409         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1410                 return -EINVAL;
1411
1412         path = unit_default_cgroup_path(u);
1413         if (!path)
1414                 return log_oom();
1415
1416         r = unit_set_cgroup_path(u, path);
1417         if (r == -EEXIST)
1418                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1419         if (r < 0)
1420                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1421
1422         return 0;
1423 }
1424
1425 static int unit_create_cgroup(
1426                 Unit *u,
1427                 CGroupMask target_mask,
1428                 CGroupMask enable_mask,
1429                 bool needs_bpf) {
1430
1431         CGroupContext *c;
1432         int r;
1433
1434         assert(u);
1435
1436         c = unit_get_cgroup_context(u);
1437         if (!c)
1438                 return 0;
1439
1440         /* Figure out our cgroup path */
1441         r = unit_pick_cgroup_path(u);
1442         if (r < 0)
1443                 return r;
1444
1445         /* First, create our own group */
1446         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1447         if (r < 0)
1448                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1449
1450         /* Start watching it */
1451         (void) unit_watch_cgroup(u);
1452
1453         /* Enable all controllers we need */
1454         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1455         if (r < 0)
1456                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1457
1458         /* Keep track that this is now realized */
1459         u->cgroup_realized = true;
1460         u->cgroup_realized_mask = target_mask;
1461         u->cgroup_enabled_mask = enable_mask;
1462         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1463
1464         if (u->type != UNIT_SLICE && !c->delegate) {
1465
1466                 /* Then, possibly move things over, but not if
1467                  * subgroups may contain processes, which is the case
1468                  * for slice and delegation units. */
1469                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1470                 if (r < 0)
1471                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1472         }
1473
1474         return 0;
1475 }
1476
1477 int unit_attach_pids_to_cgroup(Unit *u) {
1478         int r;
1479         assert(u);
1480
1481         r = unit_realize_cgroup(u);
1482         if (r < 0)
1483                 return r;
1484
1485         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1486         if (r < 0)
1487                 return r;
1488
1489         return 0;
1490 }
1491
1492 static void cgroup_xattr_apply(Unit *u) {
1493         char ids[SD_ID128_STRING_MAX];
1494         int r;
1495
1496         assert(u);
1497
1498         if (!MANAGER_IS_SYSTEM(u->manager))
1499                 return;
1500
1501         if (sd_id128_is_null(u->invocation_id))
1502                 return;
1503
1504         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1505                          "trusted.invocation_id",
1506                          sd_id128_to_string(u->invocation_id, ids), 32,
1507                          0);
1508         if (r < 0)
1509                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1510 }
1511
1512 static bool unit_has_mask_realized(
1513                 Unit *u,
1514                 CGroupMask target_mask,
1515                 CGroupMask enable_mask,
1516                 bool needs_bpf) {
1517
1518         assert(u);
1519
1520         return u->cgroup_realized &&
1521                 u->cgroup_realized_mask == target_mask &&
1522                 u->cgroup_enabled_mask == enable_mask &&
1523                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1524                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1525 }
1526
1527 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1528         assert(u);
1529
1530         if (u->in_cgroup_realize_queue)
1531                 return;
1532
1533         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1534         u->in_cgroup_realize_queue = true;
1535 }
1536
1537 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1538         assert(u);
1539
1540         if (!u->in_cgroup_realize_queue)
1541                 return;
1542
1543         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1544         u->in_cgroup_realize_queue = false;
1545 }
1546
1547
1548 /* Check if necessary controllers and attributes for a unit are in place.
1549  *
1550  * If so, do nothing.
1551  * If not, create paths, move processes over, and set attributes.
1552  *
1553  * Returns 0 on success and < 0 on failure. */
1554 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1555         CGroupMask target_mask, enable_mask;
1556         bool needs_bpf, apply_bpf;
1557         int r;
1558
1559         assert(u);
1560
1561         unit_remove_from_cgroup_realize_queue(u);
1562
1563         target_mask = unit_get_target_mask(u);
1564         enable_mask = unit_get_enable_mask(u);
1565         needs_bpf = unit_get_needs_bpf(u);
1566
1567         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1568                 return 0;
1569
1570         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1571          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1572          * this will trickle down properly to cgroupfs. */
1573         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1574
1575         /* First, realize parents */
1576         if (UNIT_ISSET(u->slice)) {
1577                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1578                 if (r < 0)
1579                         return r;
1580         }
1581
1582         /* And then do the real work */
1583         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1584         if (r < 0)
1585                 return r;
1586
1587         /* Finally, apply the necessary attributes. */
1588         cgroup_context_apply(u, target_mask, apply_bpf, state);
1589         cgroup_xattr_apply(u);
1590
1591         return 0;
1592 }
1593
1594 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1595         ManagerState state;
1596         unsigned n = 0;
1597         Unit *i;
1598         int r;
1599
1600         assert(m);
1601
1602         state = manager_state(m);
1603
1604         while ((i = m->cgroup_realize_queue)) {
1605                 assert(i->in_cgroup_realize_queue);
1606
1607                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1608                         /* Maybe things changed, and the unit is not actually active anymore? */
1609                         unit_remove_from_cgroup_realize_queue(i);
1610                         continue;
1611                 }
1612
1613                 r = unit_realize_cgroup_now(i, state);
1614                 if (r < 0)
1615                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1616
1617                 n++;
1618         }
1619
1620         return n;
1621 }
1622
1623 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1624         Unit *slice;
1625
1626         /* This adds the siblings of the specified unit and the
1627          * siblings of all parent units to the cgroup queue. (But
1628          * neither the specified unit itself nor the parents.) */
1629
1630         while ((slice = UNIT_DEREF(u->slice))) {
1631                 Iterator i;
1632                 Unit *m;
1633                 void *v;
1634
1635                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1636                         if (m == u)
1637                                 continue;
1638
1639                         /* Skip units that have a dependency on the slice
1640                          * but aren't actually in it. */
1641                         if (UNIT_DEREF(m->slice) != slice)
1642                                 continue;
1643
1644                         /* No point in doing cgroup application for units
1645                          * without active processes. */
1646                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1647                                 continue;
1648
1649                         /* If the unit doesn't need any new controllers
1650                          * and has current ones realized, it doesn't need
1651                          * any changes. */
1652                         if (unit_has_mask_realized(m,
1653                                                    unit_get_target_mask(m),
1654                                                    unit_get_enable_mask(m),
1655                                                    unit_get_needs_bpf(m)))
1656                                 continue;
1657
1658                         unit_add_to_cgroup_realize_queue(m);
1659                 }
1660
1661                 u = slice;
1662         }
1663 }
1664
1665 int unit_realize_cgroup(Unit *u) {
1666         assert(u);
1667
1668         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1669                 return 0;
1670
1671         /* So, here's the deal: when realizing the cgroups for this
1672          * unit, we need to first create all parents, but there's more
1673          * actually: for the weight-based controllers we also need to
1674          * make sure that all our siblings (i.e. units that are in the
1675          * same slice as we are) have cgroups, too. Otherwise, things
1676          * would become very uneven as each of their processes would
1677          * get as much resources as all our group together. This call
1678          * will synchronously create the parent cgroups, but will
1679          * defer work on the siblings to the next event loop
1680          * iteration. */
1681
1682         /* Add all sibling slices to the cgroup queue. */
1683         unit_add_siblings_to_cgroup_realize_queue(u);
1684
1685         /* And realize this one now (and apply the values) */
1686         return unit_realize_cgroup_now(u, manager_state(u->manager));
1687 }
1688
1689 void unit_release_cgroup(Unit *u) {
1690         assert(u);
1691
1692         /* Forgets all cgroup details for this cgroup */
1693
1694         if (u->cgroup_path) {
1695                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1696                 u->cgroup_path = mfree(u->cgroup_path);
1697         }
1698
1699         if (u->cgroup_inotify_wd >= 0) {
1700                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1701                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1702
1703                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1704                 u->cgroup_inotify_wd = -1;
1705         }
1706 }
1707
1708 void unit_prune_cgroup(Unit *u) {
1709         int r;
1710         bool is_root_slice;
1711
1712         assert(u);
1713
1714         /* Removes the cgroup, if empty and possible, and stops watching it. */
1715
1716         if (!u->cgroup_path)
1717                 return;
1718
1719         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1720
1721         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1722
1723         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1724         if (r < 0) {
1725                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1726                 return;
1727         }
1728
1729         if (is_root_slice)
1730                 return;
1731
1732         unit_release_cgroup(u);
1733
1734         u->cgroup_realized = false;
1735         u->cgroup_realized_mask = 0;
1736         u->cgroup_enabled_mask = 0;
1737 }
1738
1739 int unit_search_main_pid(Unit *u, pid_t *ret) {
1740         _cleanup_fclose_ FILE *f = NULL;
1741         pid_t pid = 0, npid, mypid;
1742         int r;
1743
1744         assert(u);
1745         assert(ret);
1746
1747         if (!u->cgroup_path)
1748                 return -ENXIO;
1749
1750         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1751         if (r < 0)
1752                 return r;
1753
1754         mypid = getpid_cached();
1755         while (cg_read_pid(f, &npid) > 0)  {
1756                 pid_t ppid;
1757
1758                 if (npid == pid)
1759                         continue;
1760
1761                 /* Ignore processes that aren't our kids */
1762                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1763                         continue;
1764
1765                 if (pid != 0)
1766                         /* Dang, there's more than one daemonized PID
1767                         in this group, so we don't know what process
1768                         is the main process. */
1769
1770                         return -ENODATA;
1771
1772                 pid = npid;
1773         }
1774
1775         *ret = pid;
1776         return 0;
1777 }
1778
1779 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1780         _cleanup_closedir_ DIR *d = NULL;
1781         _cleanup_fclose_ FILE *f = NULL;
1782         int ret = 0, r;
1783
1784         assert(u);
1785         assert(path);
1786
1787         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1788         if (r < 0)
1789                 ret = r;
1790         else {
1791                 pid_t pid;
1792
1793                 while ((r = cg_read_pid(f, &pid)) > 0) {
1794                         r = unit_watch_pid(u, pid);
1795                         if (r < 0 && ret >= 0)
1796                                 ret = r;
1797                 }
1798
1799                 if (r < 0 && ret >= 0)
1800                         ret = r;
1801         }
1802
1803         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1804         if (r < 0) {
1805                 if (ret >= 0)
1806                         ret = r;
1807         } else {
1808                 char *fn;
1809
1810                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1811                         _cleanup_free_ char *p = NULL;
1812
1813                         p = strjoin(path, "/", fn);
1814                         free(fn);
1815
1816                         if (!p)
1817                                 return -ENOMEM;
1818
1819                         r = unit_watch_pids_in_path(u, p);
1820                         if (r < 0 && ret >= 0)
1821                                 ret = r;
1822                 }
1823
1824                 if (r < 0 && ret >= 0)
1825                         ret = r;
1826         }
1827
1828         return ret;
1829 }
1830
1831 int unit_synthesize_cgroup_empty_event(Unit *u) {
1832         int r;
1833
1834         assert(u);
1835
1836         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1837          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
1838          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
1839
1840         if (!u->cgroup_path)
1841                 return -ENOENT;
1842
1843         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1844         if (r < 0)
1845                 return r;
1846         if (r > 0) /* On unified we have reliable notifications, and don't need this */
1847                 return 0;
1848
1849         if (!set_isempty(u->pids))
1850                 return 0;
1851
1852         unit_add_to_cgroup_empty_queue(u);
1853         return 0;
1854 }
1855
1856 int unit_watch_all_pids(Unit *u) {
1857         int r;
1858
1859         assert(u);
1860
1861         /* Adds all PIDs from our cgroup to the set of PIDs we
1862          * watch. This is a fallback logic for cases where we do not
1863          * get reliable cgroup empty notifications: we try to use
1864          * SIGCHLD as replacement. */
1865
1866         if (!u->cgroup_path)
1867                 return -ENOENT;
1868
1869         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1870         if (r < 0)
1871                 return r;
1872         if (r > 0) /* On unified we can use proper notifications */
1873                 return 0;
1874
1875         return unit_watch_pids_in_path(u, u->cgroup_path);
1876 }
1877
1878 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1879         Manager *m = userdata;
1880         Unit *u;
1881         int r;
1882
1883         assert(s);
1884         assert(m);
1885
1886         u = m->cgroup_empty_queue;
1887         if (!u)
1888                 return 0;
1889
1890         assert(u->in_cgroup_empty_queue);
1891         u->in_cgroup_empty_queue = false;
1892         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1893
1894         if (m->cgroup_empty_queue) {
1895                 /* More stuff queued, let's make sure we remain enabled */
1896                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1897                 if (r < 0)
1898                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1899         }
1900
1901         unit_add_to_gc_queue(u);
1902
1903         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1904                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1905
1906         return 0;
1907 }
1908
1909 void unit_add_to_cgroup_empty_queue(Unit *u) {
1910         int r;
1911
1912         assert(u);
1913
1914         /* Note that there are four different ways how cgroup empty events reach us:
1915          *
1916          * 1. On the unified hierarchy we get an inotify event on the cgroup
1917          *
1918          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1919          *
1920          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1921          *
1922          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1923          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1924          *
1925          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1926          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1927          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1928          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1929          * case for scope units). */
1930
1931         if (u->in_cgroup_empty_queue)
1932                 return;
1933
1934         /* Let's verify that the cgroup is really empty */
1935         if (!u->cgroup_path)
1936                 return;
1937         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1938         if (r < 0) {
1939                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1940                 return;
1941         }
1942         if (r == 0)
1943                 return;
1944
1945         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1946         u->in_cgroup_empty_queue = true;
1947
1948         /* Trigger the defer event */
1949         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1950         if (r < 0)
1951                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1952 }
1953
1954 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1955         Manager *m = userdata;
1956
1957         assert(s);
1958         assert(fd >= 0);
1959         assert(m);
1960
1961         for (;;) {
1962                 union inotify_event_buffer buffer;
1963                 struct inotify_event *e;
1964                 ssize_t l;
1965
1966                 l = read(fd, &buffer, sizeof(buffer));
1967                 if (l < 0) {
1968                         if (IN_SET(errno, EINTR, EAGAIN))
1969                                 return 0;
1970
1971                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1972                 }
1973
1974                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1975                         Unit *u;
1976
1977                         if (e->wd < 0)
1978                                 /* Queue overflow has no watch descriptor */
1979                                 continue;
1980
1981                         if (e->mask & IN_IGNORED)
1982                                 /* The watch was just removed */
1983                                 continue;
1984
1985                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1986                         if (!u) /* Not that inotify might deliver
1987                                  * events for a watch even after it
1988                                  * was removed, because it was queued
1989                                  * before the removal. Let's ignore
1990                                  * this here safely. */
1991                                 continue;
1992
1993                         unit_add_to_cgroup_empty_queue(u);
1994                 }
1995         }
1996 }
1997 #endif // 0
1998
1999 int manager_setup_cgroup(Manager *m) {
2000         _cleanup_free_ char *path = NULL;
2001         const char *scope_path;
2002         CGroupController c;
2003         int r, all_unified;
2004 #if 0 /// UNNEEDED by elogind
2005         char *e;
2006 #endif // 0
2007
2008         assert(m);
2009
2010         /* 1. Determine hierarchy */
2011         m->cgroup_root = mfree(m->cgroup_root);
2012 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2013         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2014 #else
2015         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2016 #endif // 0
2017         if (r < 0)
2018                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2019
2020 #if 0 /// elogind does not support systemd scopes and slices
2021         /* Chop off the init scope, if we are already located in it */
2022         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2023
2024         /* LEGACY: Also chop off the system slice if we are in
2025          * it. This is to support live upgrades from older systemd
2026          * versions where PID 1 was moved there. Also see
2027          * cg_get_root_path(). */
2028         if (!e && MANAGER_IS_SYSTEM(m)) {
2029                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2030                 if (!e)
2031                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2032         }
2033         if (e)
2034                 *e = 0;
2035 #endif // 0
2036
2037         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2038                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2039         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2040          * easily prepend it everywhere. */
2041         delete_trailing_chars(m->cgroup_root, "/");
2042
2043         /* 2. Show data */
2044         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2045         if (r < 0)
2046                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2047
2048         r = cg_unified_flush();
2049         if (r < 0)
2050                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2051
2052         all_unified = cg_all_unified();
2053         if (all_unified < 0)
2054                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2055         if (all_unified > 0)
2056                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2057         else {
2058                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2059                 if (r < 0)
2060                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2061                 if (r > 0)
2062                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2063                 else
2064                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2065         }
2066
2067 #if 0 /// elogind is not init, and does not install the agent here.
2068         /* 3. Allocate cgroup empty defer event source */
2069         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2070         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2071         if (r < 0)
2072                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2073
2074         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2075         if (r < 0)
2076                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2077
2078         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2079         if (r < 0)
2080                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2081
2082         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2083
2084         /* 4. Install notifier inotify object, or agent */
2085         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2086
2087                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2088
2089                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2090                 safe_close(m->cgroup_inotify_fd);
2091
2092                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2093                 if (m->cgroup_inotify_fd < 0)
2094                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2095
2096                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2097                 if (r < 0)
2098                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2099
2100                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2101                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2102                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2103                 if (r < 0)
2104                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2105
2106                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2107
2108         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2109
2110                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2111                  * since it does not generate events when control groups with children run empty. */
2112
2113                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2114                 if (r < 0)
2115                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2116                 else if (r > 0)
2117                         log_debug("Installed release agent.");
2118                 else if (r == 0)
2119                         log_debug("Release agent already installed.");
2120         }
2121
2122         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2123         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2124         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2125 #else
2126         /* Note:
2127                 * This method is in core, and normally called by systemd
2128                 * being init. As elogind is never init, we can not install
2129                 * our agent here. We do so when mounting our cgroup file
2130                 * system, so only if elogind is its own tiny controller.
2131                 * Further, elogind is not meant to run in systemd init scope. */
2132         if (MANAGER_IS_SYSTEM(m))
2133                 // we are our own cgroup controller
2134                 scope_path = strjoina("");
2135         else if (streq(m->cgroup_root, "/elogind"))
2136                 // root already is our cgroup
2137                 scope_path = strjoina(m->cgroup_root);
2138         else
2139                 // we have to create our own group
2140                 scope_path = strjoina(m->cgroup_root, "/elogind");
2141         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2142 #endif // 0
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2145         log_debug_elogind("Created control group \"%s\"", scope_path);
2146
2147 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2148         /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2149         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2150         if (r < 0)
2151                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2152 #endif // 0
2153
2154         /* 6. And pin it, so that it cannot be unmounted */
2155         safe_close(m->pin_cgroupfs_fd);
2156         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2157         if (m->pin_cgroupfs_fd < 0)
2158                 return log_error_errno(errno, "Failed to open pin file: %m");
2159
2160         /* 7. Always enable hierarchical support if it exists... */
2161         if (!all_unified && m->test_run_flags == 0)
2162                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2163
2164         /* 8. Figure out which controllers are supported, and log about it */
2165         r = cg_mask_supported(&m->cgroup_supported);
2166         if (r < 0)
2167                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2168         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2169                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2170
2171         return 0;
2172 }
2173
2174 void manager_shutdown_cgroup(Manager *m, bool delete) {
2175         assert(m);
2176
2177         /* We can't really delete the group, since we are in it. But
2178          * let's trim it. */
2179         if (delete && m->cgroup_root)
2180                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2181
2182 #if 0 /// elogind is not init
2183         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2184
2185         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2186
2187         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2188         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2189 #endif // 0
2190
2191         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2192
2193         m->cgroup_root = mfree(m->cgroup_root);
2194 }
2195
2196 #if 0 /// UNNEEDED by elogind
2197 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2198         char *p;
2199         Unit *u;
2200
2201         assert(m);
2202         assert(cgroup);
2203
2204         u = hashmap_get(m->cgroup_unit, cgroup);
2205         if (u)
2206                 return u;
2207
2208         p = strdupa(cgroup);
2209         for (;;) {
2210                 char *e;
2211
2212                 e = strrchr(p, '/');
2213                 if (!e || e == p)
2214                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2215
2216                 *e = 0;
2217
2218                 u = hashmap_get(m->cgroup_unit, p);
2219                 if (u)
2220                         return u;
2221         }
2222 }
2223
2224 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2225         _cleanup_free_ char *cgroup = NULL;
2226         int r;
2227
2228         assert(m);
2229
2230         if (pid <= 0)
2231                 return NULL;
2232
2233         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2234         if (r < 0)
2235                 return NULL;
2236
2237         return manager_get_unit_by_cgroup(m, cgroup);
2238 }
2239
2240 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2241         Unit *u;
2242
2243         assert(m);
2244
2245         if (pid <= 0)
2246                 return NULL;
2247
2248         if (pid == getpid_cached())
2249                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2250
2251         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2252         if (u)
2253                 return u;
2254
2255         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2256         if (u)
2257                 return u;
2258
2259         return manager_get_unit_by_pid_cgroup(m, pid);
2260 }
2261 #endif // 0
2262
2263 #if 0 /// elogind must substitute this with its own variant
2264 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2265         Unit *u;
2266
2267         assert(m);
2268         assert(cgroup);
2269
2270         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2271          * or from the --system instance */
2272
2273         log_debug("Got cgroup empty notification for: %s", cgroup);
2274
2275         u = manager_get_unit_by_cgroup(m, cgroup);
2276         if (!u)
2277                 return 0;
2278
2279         unit_add_to_cgroup_empty_queue(u);
2280         return 1;
2281 }
2282 #else
2283 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2284         Session *s;
2285
2286         assert(m);
2287         assert(cgroup);
2288
2289         log_debug("Got cgroup empty notification for: %s", cgroup);
2290
2291         s = hashmap_get(m->sessions, cgroup);
2292
2293         if (s) {
2294                 session_finalize(s);
2295                 session_free(s);
2296         } else
2297                 log_warning("Session not found: %s", cgroup);
2298
2299         return 0;
2300 }
2301 #endif // 0
2302 #if 0 /// UNNEEDED by elogind
2303 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2304         _cleanup_free_ char *v = NULL;
2305         int r;
2306
2307         assert(u);
2308         assert(ret);
2309
2310         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2311                 return -ENODATA;
2312
2313         if (!u->cgroup_path)
2314                 return -ENODATA;
2315
2316         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2317                 return -ENODATA;
2318
2319         r = cg_all_unified();
2320         if (r < 0)
2321                 return r;
2322         if (r > 0)
2323                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2324         else
2325                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2326         if (r == -ENOENT)
2327                 return -ENODATA;
2328         if (r < 0)
2329                 return r;
2330
2331         return safe_atou64(v, ret);
2332 }
2333
2334 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2335         _cleanup_free_ char *v = NULL;
2336         int r;
2337
2338         assert(u);
2339         assert(ret);
2340
2341         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2342                 return -ENODATA;
2343
2344         if (!u->cgroup_path)
2345                 return -ENODATA;
2346
2347         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2348                 return -ENODATA;
2349
2350         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2351         if (r == -ENOENT)
2352                 return -ENODATA;
2353         if (r < 0)
2354                 return r;
2355
2356         return safe_atou64(v, ret);
2357 }
2358
2359 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2360         _cleanup_free_ char *v = NULL;
2361         uint64_t ns;
2362         int r;
2363
2364         assert(u);
2365         assert(ret);
2366
2367         if (!u->cgroup_path)
2368                 return -ENODATA;
2369
2370         r = cg_all_unified();
2371         if (r < 0)
2372                 return r;
2373         if (r > 0) {
2374                 const char *keys[] = { "usage_usec", NULL };
2375                 _cleanup_free_ char *val = NULL;
2376                 uint64_t us;
2377
2378                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2379                         return -ENODATA;
2380
2381                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2382                 if (r < 0)
2383                         return r;
2384
2385                 r = safe_atou64(val, &us);
2386                 if (r < 0)
2387                         return r;
2388
2389                 ns = us * NSEC_PER_USEC;
2390         } else {
2391                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2392                         return -ENODATA;
2393
2394                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2395                 if (r == -ENOENT)
2396                         return -ENODATA;
2397                 if (r < 0)
2398                         return r;
2399
2400                 r = safe_atou64(v, &ns);
2401                 if (r < 0)
2402                         return r;
2403         }
2404
2405         *ret = ns;
2406         return 0;
2407 }
2408
2409 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2410         nsec_t ns;
2411         int r;
2412
2413         assert(u);
2414
2415         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2416          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2417          * call this function with a NULL return value. */
2418
2419         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2420                 return -ENODATA;
2421
2422         r = unit_get_cpu_usage_raw(u, &ns);
2423         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2424                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2425                  * cached value. */
2426
2427                 if (ret)
2428                         *ret = u->cpu_usage_last;
2429                 return 0;
2430         }
2431         if (r < 0)
2432                 return r;
2433
2434         if (ns > u->cpu_usage_base)
2435                 ns -= u->cpu_usage_base;
2436         else
2437                 ns = 0;
2438
2439         u->cpu_usage_last = ns;
2440         if (ret)
2441                 *ret = ns;
2442
2443         return 0;
2444 }
2445
2446 int unit_get_ip_accounting(
2447                 Unit *u,
2448                 CGroupIPAccountingMetric metric,
2449                 uint64_t *ret) {
2450
2451         uint64_t value;
2452         int fd, r;
2453
2454         assert(u);
2455         assert(metric >= 0);
2456         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2457         assert(ret);
2458
2459         /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2460          * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2461          * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2462          * filters. */
2463         if (u->type == UNIT_SLICE)
2464                 return -ENODATA;
2465
2466         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2467                 return -ENODATA;
2468
2469         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2470                 u->ip_accounting_ingress_map_fd :
2471                 u->ip_accounting_egress_map_fd;
2472         if (fd < 0)
2473                 return -ENODATA;
2474
2475         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2476                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2477         else
2478                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2479         if (r < 0)
2480                 return r;
2481
2482         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2483          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2484          * ip_accounting_extra[] field, and add them in here transparently. */
2485
2486         *ret = value + u->ip_accounting_extra[metric];
2487
2488         return r;
2489 }
2490
2491 int unit_reset_cpu_accounting(Unit *u) {
2492         nsec_t ns;
2493         int r;
2494
2495         assert(u);
2496
2497         u->cpu_usage_last = NSEC_INFINITY;
2498
2499         r = unit_get_cpu_usage_raw(u, &ns);
2500         if (r < 0) {
2501                 u->cpu_usage_base = 0;
2502                 return r;
2503         }
2504
2505         u->cpu_usage_base = ns;
2506         return 0;
2507 }
2508
2509 int unit_reset_ip_accounting(Unit *u) {
2510         int r = 0, q = 0;
2511
2512         assert(u);
2513
2514         if (u->ip_accounting_ingress_map_fd >= 0)
2515                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2516
2517         if (u->ip_accounting_egress_map_fd >= 0)
2518                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2519
2520         zero(u->ip_accounting_extra);
2521
2522         return r < 0 ? r : q;
2523 }
2524
2525 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2526         assert(u);
2527
2528         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2529                 return;
2530
2531         if (m == 0)
2532                 return;
2533
2534         /* always invalidate compat pairs together */
2535         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2536                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2537
2538         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2539                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2540
2541         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2542                 return;
2543
2544         u->cgroup_realized_mask &= ~m;
2545         unit_add_to_cgroup_realize_queue(u);
2546 }
2547
2548 void unit_invalidate_cgroup_bpf(Unit *u) {
2549         assert(u);
2550
2551         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2552                 return;
2553
2554         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2555                 return;
2556
2557         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2558         unit_add_to_cgroup_realize_queue(u);
2559
2560         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2561          * list of our children includes our own. */
2562         if (u->type == UNIT_SLICE) {
2563                 Unit *member;
2564                 Iterator i;
2565                 void *v;
2566
2567                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2568                         if (member == u)
2569                                 continue;
2570
2571                         if (UNIT_DEREF(member->slice) != u)
2572                                 continue;
2573
2574                         unit_invalidate_cgroup_bpf(member);
2575                 }
2576         }
2577 }
2578
2579 void manager_invalidate_startup_units(Manager *m) {
2580         Iterator i;
2581         Unit *u;
2582
2583         assert(m);
2584
2585         SET_FOREACH(u, m->startup_units, i)
2586                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2587 }
2588
2589 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2590         [CGROUP_AUTO] = "auto",
2591         [CGROUP_CLOSED] = "closed",
2592         [CGROUP_STRICT] = "strict",
2593 };
2594
2595 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2596 #endif // 0