src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "process-util.h"
  35 //#include "special.h"
  36 #include "stdio-util.h"
  37 #include "string-table.h"
  38 #include "string-util.h"
  39
/* Fixed CPU bandwidth period (100 * USEC_PER_MSEC); written alongside the quota to "cpu.max" and to
 * "cpu.cfs_period_us", and used to scale the per-second quota to a per-period quota. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  41
  42 #if 0 /// UNNEEDED by elogind
  43 static void cgroup_compat_warn(void) {
  44         static bool cgroup_compat_warned = false;
  45
  46         if (cgroup_compat_warned)
  47                 return;
  48
  49         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  50         cgroup_compat_warned = true;
  51 }
  52
/* Logs a unit debug message prefixed with "cgroup-compat: ". cgroup_compat_warn() is invoked first on
 * every use. fmt must be a string literal, since it is concatenated with the prefix at compile time. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
  57
  58 void cgroup_context_init(CGroupContext *c) {
  59         assert(c);
  60
  61         /* Initialize everything to the kernel defaults, assuming the
  62          * structure is preinitialized to 0 */
  63
  64         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  65         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  66         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  67
  68         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  69         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  70
  71         c->memory_high = CGROUP_LIMIT_MAX;
  72         c->memory_max = CGROUP_LIMIT_MAX;
  73         c->memory_swap_max = CGROUP_LIMIT_MAX;
  74
  75         c->memory_limit = CGROUP_LIMIT_MAX;
  76
  77         c->io_weight = CGROUP_WEIGHT_INVALID;
  78         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  79
  80         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  81         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  82
  83         c->tasks_max = (uint64_t) -1;
  84 }
  85
  86 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  87         assert(c);
  88         assert(a);
  89
  90         LIST_REMOVE(device_allow, c->device_allow, a);
  91         free(a->path);
  92         free(a);
  93 }
  94
  95 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
  96         assert(c);
  97         assert(w);
  98
  99         LIST_REMOVE(device_weights, c->io_device_weights, w);
 100         free(w->path);
 101         free(w);
 102 }
 103
 104 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 105         assert(c);
 106         assert(l);
 107
 108         LIST_REMOVE(device_limits, c->io_device_limits, l);
 109         free(l->path);
 110         free(l);
 111 }
 112
 113 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 114         assert(c);
 115         assert(w);
 116
 117         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 118         free(w->path);
 119         free(w);
 120 }
 121
 122 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 123         assert(c);
 124         assert(b);
 125
 126         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 127         free(b->path);
 128         free(b);
 129 }
 130
 131 void cgroup_context_done(CGroupContext *c) {
 132         assert(c);
 133
 134         while (c->io_device_weights)
 135                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 136
 137         while (c->io_device_limits)
 138                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 139
 140         while (c->blockio_device_weights)
 141                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 142
 143         while (c->blockio_device_bandwidths)
 144                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 145
 146         while (c->device_allow)
 147                 cgroup_context_free_device_allow(c, c->device_allow);
 148
 149         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 150         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 151 }
 152
 153 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 154         CGroupIODeviceLimit *il;
 155         CGroupIODeviceWeight *iw;
 156         CGroupBlockIODeviceBandwidth *b;
 157         CGroupBlockIODeviceWeight *w;
 158         CGroupDeviceAllow *a;
 159         IPAddressAccessItem *iaai;
 160         char u[FORMAT_TIMESPAN_MAX];
 161
 162         assert(c);
 163         assert(f);
 164
 165         prefix = strempty(prefix);
 166
 167         fprintf(f,
 168                 "%sCPUAccounting=%s\n"
 169                 "%sIOAccounting=%s\n"
 170                 "%sBlockIOAccounting=%s\n"
 171                 "%sMemoryAccounting=%s\n"
 172                 "%sTasksAccounting=%s\n"
 173                 "%sIPAccounting=%s\n"
 174                 "%sCPUWeight=%" PRIu64 "\n"
 175                 "%sStartupCPUWeight=%" PRIu64 "\n"
 176                 "%sCPUShares=%" PRIu64 "\n"
 177                 "%sStartupCPUShares=%" PRIu64 "\n"
 178                 "%sCPUQuotaPerSecSec=%s\n"
 179                 "%sIOWeight=%" PRIu64 "\n"
 180                 "%sStartupIOWeight=%" PRIu64 "\n"
 181                 "%sBlockIOWeight=%" PRIu64 "\n"
 182                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 183                 "%sMemoryLow=%" PRIu64 "\n"
 184                 "%sMemoryHigh=%" PRIu64 "\n"
 185                 "%sMemoryMax=%" PRIu64 "\n"
 186                 "%sMemorySwapMax=%" PRIu64 "\n"
 187                 "%sMemoryLimit=%" PRIu64 "\n"
 188                 "%sTasksMax=%" PRIu64 "\n"
 189                 "%sDevicePolicy=%s\n"
 190                 "%sDelegate=%s\n",
 191                 prefix, yes_no(c->cpu_accounting),
 192                 prefix, yes_no(c->io_accounting),
 193                 prefix, yes_no(c->blockio_accounting),
 194                 prefix, yes_no(c->memory_accounting),
 195                 prefix, yes_no(c->tasks_accounting),
 196                 prefix, yes_no(c->ip_accounting),
 197                 prefix, c->cpu_weight,
 198                 prefix, c->startup_cpu_weight,
 199                 prefix, c->cpu_shares,
 200                 prefix, c->startup_cpu_shares,
 201                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 202                 prefix, c->io_weight,
 203                 prefix, c->startup_io_weight,
 204                 prefix, c->blockio_weight,
 205                 prefix, c->startup_blockio_weight,
 206                 prefix, c->memory_low,
 207                 prefix, c->memory_high,
 208                 prefix, c->memory_max,
 209                 prefix, c->memory_swap_max,
 210                 prefix, c->memory_limit,
 211                 prefix, c->tasks_max,
 212                 prefix, cgroup_device_policy_to_string(c->device_policy),
 213                 prefix, yes_no(c->delegate));
 214
 215         if (c->delegate) {
 216                 _cleanup_free_ char *t = NULL;
 217
 218                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 219
 220                 fprintf(f, "%sDelegateControllers=%s\n",
 221                         prefix,
 222                         strempty(t));
 223         }
 224
 225         LIST_FOREACH(device_allow, a, c->device_allow)
 226                 fprintf(f,
 227                         "%sDeviceAllow=%s %s%s%s\n",
 228                         prefix,
 229                         a->path,
 230                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 231
 232         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 233                 fprintf(f,
 234                         "%sIODeviceWeight=%s %" PRIu64,
 235                         prefix,
 236                         iw->path,
 237                         iw->weight);
 238
 239         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 240                 char buf[FORMAT_BYTES_MAX];
 241                 CGroupIOLimitType type;
 242
 243                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 244                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 245                                 fprintf(f,
 246                                         "%s%s=%s %s\n",
 247                                         prefix,
 248                                         cgroup_io_limit_type_to_string(type),
 249                                         il->path,
 250                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 251         }
 252
 253         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 254                 fprintf(f,
 255                         "%sBlockIODeviceWeight=%s %" PRIu64,
 256                         prefix,
 257                         w->path,
 258                         w->weight);
 259
 260         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 261                 char buf[FORMAT_BYTES_MAX];
 262
 263                 if (b->rbps != CGROUP_LIMIT_MAX)
 264                         fprintf(f,
 265                                 "%sBlockIOReadBandwidth=%s %s\n",
 266                                 prefix,
 267                                 b->path,
 268                                 format_bytes(buf, sizeof(buf), b->rbps));
 269                 if (b->wbps != CGROUP_LIMIT_MAX)
 270                         fprintf(f,
 271                                 "%sBlockIOWriteBandwidth=%s %s\n",
 272                                 prefix,
 273                                 b->path,
 274                                 format_bytes(buf, sizeof(buf), b->wbps));
 275         }
 276
 277         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 278                 _cleanup_free_ char *k = NULL;
 279
 280                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 281                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 282         }
 283
 284         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 285                 _cleanup_free_ char *k = NULL;
 286
 287                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 288                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 289         }
 290 }
 291
 292 static int lookup_block_device(const char *p, dev_t *dev) {
 293         struct stat st;
 294         int r;
 295
 296         assert(p);
 297         assert(dev);
 298
 299         r = stat(p, &st);
 300         if (r < 0)
 301                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 302
 303         if (S_ISBLK(st.st_mode))
 304                 *dev = st.st_rdev;
 305         else if (major(st.st_dev) != 0) {
 306                 /* If this is not a device node then find the block
 307                  * device this file is stored on */
 308                 *dev = st.st_dev;
 309
 310                 /* If this is a partition, try to get the originating
 311                  * block device */
 312                 (void) block_get_whole_disk(*dev, dev);
 313         } else {
 314                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 315                 return -ENODEV;
 316         }
 317
 318         return 0;
 319 }
 320
 321 static int whitelist_device(const char *path, const char *node, const char *acc) {
 322         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 323         struct stat st;
 324         bool ignore_notfound;
 325         int r;
 326
 327         assert(path);
 328         assert(acc);
 329
 330         if (node[0] == '-') {
 331                 /* Non-existent paths starting with "-" must be silently ignored */
 332                 node++;
 333                 ignore_notfound = true;
 334         } else
 335                 ignore_notfound = false;
 336
 337         if (stat(node, &st) < 0) {
 338                 if (errno == ENOENT && ignore_notfound)
 339                         return 0;
 340
 341                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 342         }
 343
 344         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 345                 log_warning("%s is not a device.", node);
 346                 return -ENODEV;
 347         }
 348
 349         sprintf(buf,
 350                 "%c %u:%u %s",
 351                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 352                 major(st.st_rdev), minor(st.st_rdev),
 353                 acc);
 354
 355         r = cg_set_attribute("devices", path, "devices.allow", buf);
 356         if (r < 0)
 357                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 358                                "Failed to set devices.allow on %s: %m", path);
 359
 360         return r;
 361 }
 362
 363 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 364         _cleanup_fclose_ FILE *f = NULL;
 365         char line[LINE_MAX];
 366         bool good = false;
 367         int r;
 368
 369         assert(path);
 370         assert(acc);
 371         assert(IN_SET(type, 'b', 'c'));
 372
 373         f = fopen("/proc/devices", "re");
 374         if (!f)
 375                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 376
 377         FOREACH_LINE(line, f, goto fail) {
 378                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 379                 unsigned maj;
 380
 381                 truncate_nl(line);
 382
 383                 if (type == 'c' && streq(line, "Character devices:")) {
 384                         good = true;
 385                         continue;
 386                 }
 387
 388                 if (type == 'b' && streq(line, "Block devices:")) {
 389                         good = true;
 390                         continue;
 391                 }
 392
 393                 if (isempty(line)) {
 394                         good = false;
 395                         continue;
 396                 }
 397
 398                 if (!good)
 399                         continue;
 400
 401                 p = strstrip(line);
 402
 403                 w = strpbrk(p, WHITESPACE);
 404                 if (!w)
 405                         continue;
 406                 *w = 0;
 407
 408                 r = safe_atou(p, &maj);
 409                 if (r < 0)
 410                         continue;
 411                 if (maj <= 0)
 412                         continue;
 413
 414                 w++;
 415                 w += strspn(w, WHITESPACE);
 416
 417                 if (fnmatch(name, w, 0) != 0)
 418                         continue;
 419
 420                 sprintf(buf,
 421                         "%c %u:* %s",
 422                         type,
 423                         maj,
 424                         acc);
 425
 426                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 427                 if (r < 0)
 428                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 429                                        "Failed to set devices.allow on %s: %m", path);
 430         }
 431
 432         return 0;
 433
 434 fail:
 435         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 436 }
 437
 438 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 439         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 440                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 441 }
 442
 443 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 444         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 445                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 446 }
 447
 448 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 449         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 450             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 451                 return c->startup_cpu_weight;
 452         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 453                 return c->cpu_weight;
 454         else
 455                 return CGROUP_WEIGHT_DEFAULT;
 456 }
 457
 458 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 459         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 460             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 461                 return c->startup_cpu_shares;
 462         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 463                 return c->cpu_shares;
 464         else
 465                 return CGROUP_CPU_SHARES_DEFAULT;
 466 }
 467
 468 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 469         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 470         int r;
 471
 472         xsprintf(buf, "%" PRIu64 "\n", weight);
 473         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 474         if (r < 0)
 475                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 476                               "Failed to set cpu.weight: %m");
 477
 478         if (quota != USEC_INFINITY)
 479                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 480                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 481         else
 482                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 483
 484         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 485
 486         if (r < 0)
 487                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 488                               "Failed to set cpu.max: %m");
 489 }
 490
 491 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 492         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 493         int r;
 494
 495         xsprintf(buf, "%" PRIu64 "\n", shares);
 496         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 497         if (r < 0)
 498                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 499                               "Failed to set cpu.shares: %m");
 500
 501         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 502         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 503         if (r < 0)
 504                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 505                               "Failed to set cpu.cfs_period_us: %m");
 506
 507         if (quota != USEC_INFINITY) {
 508                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 509                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 510         } else
 511                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 512         if (r < 0)
 513                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 514                               "Failed to set cpu.cfs_quota_us: %m");
 515 }
 516
 517 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 518         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 519                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 520 }
 521
 522 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 523         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 524                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 525 }
 526
 527 static bool cgroup_context_has_io_config(CGroupContext *c) {
 528         return c->io_accounting ||
 529                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 530                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 531                 c->io_device_weights ||
 532                 c->io_device_limits;
 533 }
 534
 535 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 536         return c->blockio_accounting ||
 537                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 538                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 539                 c->blockio_device_weights ||
 540                 c->blockio_device_bandwidths;
 541 }
 542
 543 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 544         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 545             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 546                 return c->startup_io_weight;
 547         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 548                 return c->io_weight;
 549         else
 550                 return CGROUP_WEIGHT_DEFAULT;
 551 }
 552
 553 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 554         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 555             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 556                 return c->startup_blockio_weight;
 557         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 558                 return c->blockio_weight;
 559         else
 560                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 561 }
 562
 563 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 564         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 565                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 566 }
 567
 568 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 569         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 570                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 571 }
 572
 573 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 574         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 575         dev_t dev;
 576         int r;
 577
 578         r = lookup_block_device(dev_path, &dev);
 579         if (r < 0)
 580                 return;
 581
 582         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 583         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 584         if (r < 0)
 585                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 586                               "Failed to set io.weight: %m");
 587 }
 588
 589 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 590         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 591         dev_t dev;
 592         int r;
 593
 594         r = lookup_block_device(dev_path, &dev);
 595         if (r < 0)
 596                 return;
 597
 598         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 599         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 600         if (r < 0)
 601                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 602                               "Failed to set blkio.weight_device: %m");
 603 }
 604
 605 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 606         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 607         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 608         CGroupIOLimitType type;
 609         dev_t dev;
 610         unsigned n = 0;
 611         int r;
 612
 613         r = lookup_block_device(dev_path, &dev);
 614         if (r < 0)
 615                 return 0;
 616
 617         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 618                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 619                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 620                         n++;
 621                 } else {
 622                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 623                 }
 624         }
 625
 626         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 627                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 628                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 629         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 630         if (r < 0)
 631                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 632                               "Failed to set io.max: %m");
 633         return n;
 634 }
 635
 636 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 637         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 638         dev_t dev;
 639         unsigned n = 0;
 640         int r;
 641
 642         r = lookup_block_device(dev_path, &dev);
 643         if (r < 0)
 644                 return 0;
 645
 646         if (rbps != CGROUP_LIMIT_MAX)
 647                 n++;
 648         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 649         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 650         if (r < 0)
 651                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 652                               "Failed to set blkio.throttle.read_bps_device: %m");
 653
 654         if (wbps != CGROUP_LIMIT_MAX)
 655                 n++;
 656         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 657         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 658         if (r < 0)
 659                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 660                               "Failed to set blkio.throttle.write_bps_device: %m");
 661
 662         return n;
 663 }
 664
 665 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 666         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 667 }
 668
 669 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 670         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 671         int r;
 672
 673         if (v != CGROUP_LIMIT_MAX)
 674                 xsprintf(buf, "%" PRIu64 "\n", v);
 675
 676         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 677         if (r < 0)
 678                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 679                               "Failed to set %s: %m", file);
 680 }
 681
 682 static void cgroup_apply_firewall(Unit *u) {
 683         int r;
 684
 685         assert(u);
 686
 687         if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
 688                                     * not recursive we don't ever touch the bpf on them */
 689                 return;
 690
 691         r = bpf_firewall_compile(u);
 692         if (r < 0)
 693                 return;
 694
 695         (void) bpf_firewall_install(u);
 696         return;
 697 }
 698
 699 static void cgroup_context_apply(
 700                 Unit *u,
 701                 CGroupMask apply_mask,
 702                 bool apply_bpf,
 703                 ManagerState state) {
 704
 705         const char *path;
 706         CGroupContext *c;
 707         bool is_root;
 708         int r;
 709
 710         assert(u);
 711
 712         c = unit_get_cgroup_context(u);
 713         path = u->cgroup_path;
 714
 715         assert(c);
 716         assert(path);
 717
 718         /* Nothing to do? Exit early! */
 719         if (apply_mask == 0 && !apply_bpf)
 720                 return;
 721
 722         /* Some cgroup attributes are not supported on the root cgroup,
 723          * hence silently ignore */
 724         is_root = isempty(path) || path_equal(path, "/");
 725         if (is_root)
 726                 /* Make sure we don't try to display messages with an empty path. */
 727                 path = "/";
 728
 729         /* We generally ignore errors caused by read-only mounted
 730          * cgroup trees (assuming we are running in a container then),
 731          * and missing cgroups, i.e. EROFS and ENOENT. */
 732
 733         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 734                 bool has_weight, has_shares;
 735
 736                 has_weight = cgroup_context_has_cpu_weight(c);
 737                 has_shares = cgroup_context_has_cpu_shares(c);
 738
 739                 if (cg_all_unified() > 0) {
 740                         uint64_t weight;
 741
 742                         if (has_weight)
 743                                 weight = cgroup_context_cpu_weight(c, state);
 744                         else if (has_shares) {
 745                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 746
 747                                 weight = cgroup_cpu_shares_to_weight(shares);
 748
 749                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 750                                                   shares, weight, path);
 751                         } else
 752                                 weight = CGROUP_WEIGHT_DEFAULT;
 753
 754                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 755                 } else {
 756                         uint64_t shares;
 757
 758                         if (has_weight) {
 759                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 760
 761                                 shares = cgroup_cpu_weight_to_shares(weight);
 762
 763                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 764                                                   weight, shares, path);
 765                         } else if (has_shares)
 766                                 shares = cgroup_context_cpu_shares(c, state);
 767                         else
 768                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 769
 770                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 771                 }
 772         }
 773
 774         if (apply_mask & CGROUP_MASK_IO) {
 775                 bool has_io = cgroup_context_has_io_config(c);
 776                 bool has_blockio = cgroup_context_has_blockio_config(c);
 777
 778                 if (!is_root) {
 779                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 780                         uint64_t weight;
 781
 782                         if (has_io)
 783                                 weight = cgroup_context_io_weight(c, state);
 784                         else if (has_blockio) {
 785                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 786
 787                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 788
 789                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 790                                                   blkio_weight, weight);
 791                         } else
 792                                 weight = CGROUP_WEIGHT_DEFAULT;
 793
 794                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 795                         r = cg_set_attribute("io", path, "io.weight", buf);
 796                         if (r < 0)
 797                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 798                                               "Failed to set io.weight: %m");
 799
 800                         if (has_io) {
 801                                 CGroupIODeviceWeight *w;
 802
 803                                 /* FIXME: no way to reset this list */
 804                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 805                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 806                         } else if (has_blockio) {
 807                                 CGroupBlockIODeviceWeight *w;
 808
 809                                 /* FIXME: no way to reset this list */
 810                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 811                                         weight = cgroup_weight_blkio_to_io(w->weight);
 812
 813                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 814                                                           w->weight, weight, w->path);
 815
 816                                         cgroup_apply_io_device_weight(u, w->path, weight);
 817                                 }
 818                         }
 819                 }
 820
 821                 /* Apply limits and free ones without config. */
 822                 if (has_io) {
 823                         CGroupIODeviceLimit *l, *next;
 824
 825                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 826                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 827                                         cgroup_context_free_io_device_limit(c, l);
 828                         }
 829                 } else if (has_blockio) {
 830                         CGroupBlockIODeviceBandwidth *b, *next;
 831
 832                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 833                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 834                                 CGroupIOLimitType type;
 835
 836                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 837                                         limits[type] = cgroup_io_limit_defaults[type];
 838
 839                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 840                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 841
 842                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 843                                                   b->rbps, b->wbps, b->path);
 844
 845                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 846                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 847                         }
 848                 }
 849         }
 850
 851         if (apply_mask & CGROUP_MASK_BLKIO) {
 852                 bool has_io = cgroup_context_has_io_config(c);
 853                 bool has_blockio = cgroup_context_has_blockio_config(c);
 854
 855                 if (!is_root) {
 856                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 857                         uint64_t weight;
 858
 859                         if (has_io) {
 860                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 861
 862                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 863
 864                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 865                                                   io_weight, weight);
 866                         } else if (has_blockio)
 867                                 weight = cgroup_context_blkio_weight(c, state);
 868                         else
 869                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 870
 871                         xsprintf(buf, "%" PRIu64 "\n", weight);
 872                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 873                         if (r < 0)
 874                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 875                                               "Failed to set blkio.weight: %m");
 876
 877                         if (has_io) {
 878                                 CGroupIODeviceWeight *w;
 879
 880                                 /* FIXME: no way to reset this list */
 881                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 882                                         weight = cgroup_weight_io_to_blkio(w->weight);
 883
 884                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 885                                                           w->weight, weight, w->path);
 886
 887                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 888                                 }
 889                         } else if (has_blockio) {
 890                                 CGroupBlockIODeviceWeight *w;
 891
 892                                 /* FIXME: no way to reset this list */
 893                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 894                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 895                         }
 896                 }
 897
 898                 /* Apply limits and free ones without config. */
 899                 if (has_io) {
 900                         CGroupIODeviceLimit *l, *next;
 901
 902                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 903                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 904                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 905
 906                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 907                                         cgroup_context_free_io_device_limit(c, l);
 908                         }
 909                 } else if (has_blockio) {
 910                         CGroupBlockIODeviceBandwidth *b, *next;
 911
 912                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 913                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 914                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 915                 }
 916         }
 917
 918         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 919                 if (cg_all_unified() > 0) {
 920                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 921
 922                         if (cgroup_context_has_unified_memory_config(c)) {
 923                                 max = c->memory_max;
 924                                 swap_max = c->memory_swap_max;
 925                         } else {
 926                                 max = c->memory_limit;
 927
 928                                 if (max != CGROUP_LIMIT_MAX)
 929                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 930                         }
 931
 932                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 933                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 934                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 935                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 936                 } else {
 937                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 938                         uint64_t val;
 939
 940                         if (cgroup_context_has_unified_memory_config(c)) {
 941                                 val = c->memory_max;
 942                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 943                         } else
 944                                 val = c->memory_limit;
 945
 946                         if (val == CGROUP_LIMIT_MAX)
 947                                 strncpy(buf, "-1\n", sizeof(buf));
 948                         else
 949                                 xsprintf(buf, "%" PRIu64 "\n", val);
 950
 951                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 952                         if (r < 0)
 953                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 954                                               "Failed to set memory.limit_in_bytes: %m");
 955                 }
 956         }
 957
 958         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 959                 CGroupDeviceAllow *a;
 960
 961                 /* Changing the devices list of a populated cgroup
 962                  * might result in EINVAL, hence ignore EINVAL
 963                  * here. */
 964
 965                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 966                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 967                 else
 968                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 969                 if (r < 0)
 970                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 971                                       "Failed to reset devices.list: %m");
 972
 973                 if (c->device_policy == CGROUP_CLOSED ||
 974                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 975                         static const char auto_devices[] =
 976                                 "/dev/null\0" "rwm\0"
 977                                 "/dev/zero\0" "rwm\0"
 978                                 "/dev/full\0" "rwm\0"
 979                                 "/dev/random\0" "rwm\0"
 980                                 "/dev/urandom\0" "rwm\0"
 981                                 "/dev/tty\0" "rwm\0"
 982                                 "/dev/ptmx\0" "rwm\0"
 983                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 984                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
 985                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
 986
 987                         const char *x, *y;
 988
 989                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 990                                 whitelist_device(path, x, y);
 991
 992                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
 993                         whitelist_major(path, "pts", 'c', "rw");
 994                 }
 995
 996                 LIST_FOREACH(device_allow, a, c->device_allow) {
 997                         char acc[4], *val;
 998                         unsigned k = 0;
 999
1000                         if (a->r)
1001                                 acc[k++] = 'r';
1002                         if (a->w)
1003                                 acc[k++] = 'w';
1004                         if (a->m)
1005                                 acc[k++] = 'm';
1006
1007                         if (k == 0)
1008                                 continue;
1009
1010                         acc[k++] = 0;
1011
1012                         if (path_startswith(a->path, "/dev/"))
1013                                 whitelist_device(path, a->path, acc);
1014                         else if ((val = startswith(a->path, "block-")))
1015                                 whitelist_major(path, val, 'b', acc);
1016                         else if ((val = startswith(a->path, "char-")))
1017                                 whitelist_major(path, val, 'c', acc);
1018                         else
1019                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1020                 }
1021         }
1022
1023         if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1024
1025                 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1026                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1027
1028                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1029                         r = cg_set_attribute("pids", path, "pids.max", buf);
1030                 } else
1031                         r = cg_set_attribute("pids", path, "pids.max", "max");
1032
1033                 if (r < 0)
1034                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1035                                       "Failed to set pids.max: %m");
1036         }
1037
1038         if (apply_bpf)
1039                 cgroup_apply_firewall(u);
1040 }
1041
1042 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1043         CGroupMask mask = 0;
1044
1045         /* Figure out which controllers we need */
1046
1047         if (c->cpu_accounting ||
1048             cgroup_context_has_cpu_weight(c) ||
1049             cgroup_context_has_cpu_shares(c) ||
1050             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1051                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1052
1053         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1054                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1055
1056         if (c->memory_accounting ||
1057             c->memory_limit != CGROUP_LIMIT_MAX ||
1058             cgroup_context_has_unified_memory_config(c))
1059                 mask |= CGROUP_MASK_MEMORY;
1060
1061         if (c->device_allow ||
1062             c->device_policy != CGROUP_AUTO)
1063                 mask |= CGROUP_MASK_DEVICES;
1064
1065         if (c->tasks_accounting ||
1066             c->tasks_max != CGROUP_LIMIT_MAX)
1067                 mask |= CGROUP_MASK_PIDS;
1068
1069         return mask;
1070 }
1071
1072 CGroupMask unit_get_own_mask(Unit *u) {
1073         CGroupContext *c;
1074
1075         /* Returns the mask of controllers the unit needs for itself */
1076
1077         c = unit_get_cgroup_context(u);
1078         if (!c)
1079                 return 0;
1080
1081         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1082 }
1083
1084 CGroupMask unit_get_delegate_mask(Unit *u) {
1085         CGroupContext *c;
1086
1087         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1088          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1089          *
1090          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1091
1092         if (u->type == UNIT_SLICE)
1093                 return 0;
1094
1095         c = unit_get_cgroup_context(u);
1096         if (!c)
1097                 return 0;
1098
1099         if (!c->delegate)
1100                 return 0;
1101
1102         if (cg_all_unified() <= 0) {
1103                 ExecContext *e;
1104
1105                 e = unit_get_exec_context(u);
1106                 if (e && !exec_context_maintains_privileges(e))
1107                         return 0;
1108         }
1109
1110         return c->delegate_controllers;
1111 }
1112
1113 CGroupMask unit_get_members_mask(Unit *u) {
1114         assert(u);
1115
1116         /* Returns the mask of controllers all of the unit's children require, merged */
1117
1118         if (u->cgroup_members_mask_valid)
1119                 return u->cgroup_members_mask;
1120
1121         u->cgroup_members_mask = 0;
1122
1123         if (u->type == UNIT_SLICE) {
1124                 void *v;
1125                 Unit *member;
1126                 Iterator i;
1127
1128                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1129
1130                         if (member == u)
1131                                 continue;
1132
1133                         if (UNIT_DEREF(member->slice) != u)
1134                                 continue;
1135
1136                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1137                 }
1138         }
1139
1140         u->cgroup_members_mask_valid = true;
1141         return u->cgroup_members_mask;
1142 }
1143
1144 CGroupMask unit_get_siblings_mask(Unit *u) {
1145         assert(u);
1146
1147         /* Returns the mask of controllers all of the unit's siblings
1148          * require, i.e. the members mask of the unit's parent slice
1149          * if there is one. */
1150
1151         if (UNIT_ISSET(u->slice))
1152                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1153
1154         return unit_get_subtree_mask(u); /* we are the top-level slice */
1155 }
1156
1157 CGroupMask unit_get_subtree_mask(Unit *u) {
1158
1159         /* Returns the mask of this subtree, meaning of the group
1160          * itself and its children. */
1161
1162         return unit_get_own_mask(u) | unit_get_members_mask(u);
1163 }
1164
1165 CGroupMask unit_get_target_mask(Unit *u) {
1166         CGroupMask mask;
1167
1168         /* This returns the cgroup mask of all controllers to enable
1169          * for a specific cgroup, i.e. everything it needs itself,
1170          * plus all that its children need, plus all that its siblings
1171          * need. This is primarily useful on the legacy cgroup
1172          * hierarchy, where we need to duplicate each cgroup in each
1173          * hierarchy that shall be enabled for it. */
1174
1175         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1176         mask &= u->manager->cgroup_supported;
1177
1178         return mask;
1179 }
1180
1181 CGroupMask unit_get_enable_mask(Unit *u) {
1182         CGroupMask mask;
1183
1184         /* This returns the cgroup mask of all controllers to enable
1185          * for the children of a specific cgroup. This is primarily
1186          * useful for the unified cgroup hierarchy, where each cgroup
1187          * controls which controllers are enabled for its children. */
1188
1189         mask = unit_get_members_mask(u);
1190         mask &= u->manager->cgroup_supported;
1191
1192         return mask;
1193 }
1194
1195 bool unit_get_needs_bpf(Unit *u) {
1196         CGroupContext *c;
1197         Unit *p;
1198         assert(u);
1199
1200         /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1201          * moment. */
1202         if (u->type == UNIT_SLICE)
1203                 return false;
1204
1205         c = unit_get_cgroup_context(u);
1206         if (!c)
1207                 return false;
1208
1209         if (c->ip_accounting ||
1210             c->ip_address_allow ||
1211             c->ip_address_deny)
1212                 return true;
1213
1214         /* If any parent slice has an IP access list defined, it applies too */
1215         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1216                 c = unit_get_cgroup_context(p);
1217                 if (!c)
1218                         return false;
1219
1220                 if (c->ip_address_allow ||
1221                     c->ip_address_deny)
1222                         return true;
1223         }
1224
1225         return false;
1226 }
1227
1228 /* Recurse from a unit up through its containing slices, propagating
1229  * mask bits upward. A unit is also member of itself. */
1230 void unit_update_cgroup_members_masks(Unit *u) {
1231         CGroupMask m;
1232         bool more;
1233
1234         assert(u);
1235
1236         /* Calculate subtree mask */
1237         m = unit_get_subtree_mask(u);
1238
1239         /* See if anything changed from the previous invocation. If
1240          * not, we're done. */
1241         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1242                 return;
1243
1244         more =
1245                 u->cgroup_subtree_mask_valid &&
1246                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1247                 ((~m & u->cgroup_subtree_mask) == 0);
1248
1249         u->cgroup_subtree_mask = m;
1250         u->cgroup_subtree_mask_valid = true;
1251
1252         if (UNIT_ISSET(u->slice)) {
1253                 Unit *s = UNIT_DEREF(u->slice);
1254
1255                 if (more)
1256                         /* There's more set now than before. We
1257                          * propagate the new mask to the parent's mask
1258                          * (not caring if it actually was valid or
1259                          * not). */
1260
1261                         s->cgroup_members_mask |= m;
1262
1263                 else
1264                         /* There's less set now than before (or we
1265                          * don't know), we need to recalculate
1266                          * everything, so let's invalidate the
1267                          * parent's members mask */
1268
1269                         s->cgroup_members_mask_valid = false;
1270
1271                 /* And now make sure that this change also hits our
1272                  * grandparents */
1273                 unit_update_cgroup_members_masks(s);
1274         }
1275 }
1276
1277 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1278         Unit *u = userdata;
1279
1280         assert(mask != 0);
1281         assert(u);
1282
1283         while (u) {
1284                 if (u->cgroup_path &&
1285                     u->cgroup_realized &&
1286                     (u->cgroup_realized_mask & mask) == mask)
1287                         return u->cgroup_path;
1288
1289                 u = UNIT_DEREF(u->slice);
1290         }
1291
1292         return NULL;
1293 }
1294
1295 char *unit_default_cgroup_path(Unit *u) {
1296         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1297         int r;
1298
1299         assert(u);
1300
1301         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1302                 return strdup(u->manager->cgroup_root);
1303
1304         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1305                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1306                 if (r < 0)
1307                         return NULL;
1308         }
1309
1310         escaped = cg_escape(u->id);
1311         if (!escaped)
1312                 return NULL;
1313
1314         if (slice)
1315                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1316                                escaped);
1317         else
1318                 return strjoin(u->manager->cgroup_root, "/", escaped);
1319 }
1320
1321 int unit_set_cgroup_path(Unit *u, const char *path) {
1322         _cleanup_free_ char *p = NULL;
1323         int r;
1324
1325         assert(u);
1326
1327         if (path) {
1328                 p = strdup(path);
1329                 if (!p)
1330                         return -ENOMEM;
1331         } else
1332                 p = NULL;
1333
1334         if (streq_ptr(u->cgroup_path, p))
1335                 return 0;
1336
1337         if (p) {
1338                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1339                 if (r < 0)
1340                         return r;
1341         }
1342
1343         unit_release_cgroup(u);
1344
1345         u->cgroup_path = p;
1346         p = NULL;
1347
1348         return 1;
1349 }
1350
1351 int unit_watch_cgroup(Unit *u) {
1352         _cleanup_free_ char *events = NULL;
1353         int r;
1354
1355         assert(u);
1356
1357         if (!u->cgroup_path)
1358                 return 0;
1359
1360         if (u->cgroup_inotify_wd >= 0)
1361                 return 0;
1362
1363         /* Only applies to the unified hierarchy */
1364         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1365         if (r < 0)
1366                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1367         if (r == 0)
1368                 return 0;
1369
1370         /* Don't watch the root slice, it's pointless. */
1371         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1372                 return 0;
1373
1374         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1375         if (r < 0)
1376                 return log_oom();
1377
1378         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1379         if (r < 0)
1380                 return log_oom();
1381
1382         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1383         if (u->cgroup_inotify_wd < 0) {
1384
1385                 if (errno == ENOENT) /* If the directory is already
1386                                       * gone we don't need to track
1387                                       * it, so this is not an error */
1388                         return 0;
1389
1390                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1391         }
1392
1393         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1394         if (r < 0)
1395                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1396
1397         return 0;
1398 }
1399
1400 int unit_pick_cgroup_path(Unit *u) {
1401         _cleanup_free_ char *path = NULL;
1402         int r;
1403
1404         assert(u);
1405
1406         if (u->cgroup_path)
1407                 return 0;
1408
1409         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1410                 return -EINVAL;
1411
1412         path = unit_default_cgroup_path(u);
1413         if (!path)
1414                 return log_oom();
1415
1416         r = unit_set_cgroup_path(u, path);
1417         if (r == -EEXIST)
1418                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1419         if (r < 0)
1420                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1421
1422         return 0;
1423 }
1424
1425 static int unit_create_cgroup(
1426                 Unit *u,
1427                 CGroupMask target_mask,
1428                 CGroupMask enable_mask,
1429                 bool needs_bpf) {
1430
1431         CGroupContext *c;
1432         int r;
1433
1434         assert(u);
1435
1436         c = unit_get_cgroup_context(u);
1437         if (!c)
1438                 return 0;
1439
1440         /* Figure out our cgroup path */
1441         r = unit_pick_cgroup_path(u);
1442         if (r < 0)
1443                 return r;
1444
1445         /* First, create our own group */
1446         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1447         if (r < 0)
1448                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1449
1450         /* Start watching it */
1451         (void) unit_watch_cgroup(u);
1452
1453         /* Enable all controllers we need */
1454         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1455         if (r < 0)
1456                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1457
1458         /* Keep track that this is now realized */
1459         u->cgroup_realized = true;
1460         u->cgroup_realized_mask = target_mask;
1461         u->cgroup_enabled_mask = enable_mask;
1462         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1463
1464         if (u->type != UNIT_SLICE && !c->delegate) {
1465
1466                 /* Then, possibly move things over, but not if
1467                  * subgroups may contain processes, which is the case
1468                  * for slice and delegation units. */
1469                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1470                 if (r < 0)
1471                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1472         }
1473
1474         return 0;
1475 }
1476
1477 int unit_attach_pids_to_cgroup(Unit *u) {
1478         int r;
1479         assert(u);
1480
1481         r = unit_realize_cgroup(u);
1482         if (r < 0)
1483                 return r;
1484
1485         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1486         if (r < 0)
1487                 return r;
1488
1489         return 0;
1490 }
1491
1492 static void cgroup_xattr_apply(Unit *u) {
1493         char ids[SD_ID128_STRING_MAX];
1494         int r;
1495
1496         assert(u);
1497
1498         if (!MANAGER_IS_SYSTEM(u->manager))
1499                 return;
1500
1501         if (sd_id128_is_null(u->invocation_id))
1502                 return;
1503
1504         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1505                          "trusted.invocation_id",
1506                          sd_id128_to_string(u->invocation_id, ids), 32,
1507                          0);
1508         if (r < 0)
1509                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1510 }
1511
1512 static bool unit_has_mask_realized(
1513                 Unit *u,
1514                 CGroupMask target_mask,
1515                 CGroupMask enable_mask,
1516                 bool needs_bpf) {
1517
1518         assert(u);
1519
1520         return u->cgroup_realized &&
1521                 u->cgroup_realized_mask == target_mask &&
1522                 u->cgroup_enabled_mask == enable_mask &&
1523                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1524                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1525 }
1526
1527 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1528         assert(u);
1529
1530         if (u->in_cgroup_realize_queue)
1531                 return;
1532
1533         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1534         u->in_cgroup_realize_queue = true;
1535 }
1536
1537 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1538         assert(u);
1539
1540         if (!u->in_cgroup_realize_queue)
1541                 return;
1542
1543         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1544         u->in_cgroup_realize_queue = false;
1545 }
1546
1547
1548 /* Check if necessary controllers and attributes for a unit are in place.
1549  *
1550  * If so, do nothing.
1551  * If not, create paths, move processes over, and set attributes.
1552  *
1553  * Returns 0 on success and < 0 on failure. */
1554 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1555         CGroupMask target_mask, enable_mask;
1556         bool needs_bpf, apply_bpf;
1557         int r;
1558
1559         assert(u);
1560
1561         unit_remove_from_cgroup_realize_queue(u);
1562
1563         target_mask = unit_get_target_mask(u);
1564         enable_mask = unit_get_enable_mask(u);
1565         needs_bpf = unit_get_needs_bpf(u);
1566
1567         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1568                 return 0;
1569
1570         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1571          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1572          * this will trickle down properly to cgroupfs. */
1573         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1574
1575         /* First, realize parents */
1576         if (UNIT_ISSET(u->slice)) {
1577                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1578                 if (r < 0)
1579                         return r;
1580         }
1581
1582         /* And then do the real work */
1583         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1584         if (r < 0)
1585                 return r;
1586
1587         /* Finally, apply the necessary attributes. */
1588         cgroup_context_apply(u, target_mask, apply_bpf, state);
1589         cgroup_xattr_apply(u);
1590
1591         return 0;
1592 }
1593
1594 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1595         ManagerState state;
1596         unsigned n = 0;
1597         Unit *i;
1598         int r;
1599
1600         assert(m);
1601
1602         state = manager_state(m);
1603
1604         while ((i = m->cgroup_realize_queue)) {
1605                 assert(i->in_cgroup_realize_queue);
1606
1607                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1608                         /* Maybe things changed, and the unit is not actually active anymore? */
1609                         unit_remove_from_cgroup_realize_queue(i);
1610                         continue;
1611                 }
1612
1613                 r = unit_realize_cgroup_now(i, state);
1614                 if (r < 0)
1615                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1616
1617                 n++;
1618         }
1619
1620         return n;
1621 }
1622
1623 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1624         Unit *slice;
1625
1626         /* This adds the siblings of the specified unit and the
1627          * siblings of all parent units to the cgroup queue. (But
1628          * neither the specified unit itself nor the parents.) */
1629
1630         while ((slice = UNIT_DEREF(u->slice))) {
1631                 Iterator i;
1632                 Unit *m;
1633                 void *v;
1634
1635                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1636                         if (m == u)
1637                                 continue;
1638
1639                         /* Skip units that have a dependency on the slice
1640                          * but aren't actually in it. */
1641                         if (UNIT_DEREF(m->slice) != slice)
1642                                 continue;
1643
1644                         /* No point in doing cgroup application for units
1645                          * without active processes. */
1646                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1647                                 continue;
1648
1649                         /* If the unit doesn't need any new controllers
1650                          * and has current ones realized, it doesn't need
1651                          * any changes. */
1652                         if (unit_has_mask_realized(m,
1653                                                    unit_get_target_mask(m),
1654                                                    unit_get_enable_mask(m),
1655                                                    unit_get_needs_bpf(m)))
1656                                 continue;
1657
1658                         unit_add_to_cgroup_realize_queue(m);
1659                 }
1660
1661                 u = slice;
1662         }
1663 }
1664
1665 int unit_realize_cgroup(Unit *u) {
1666         assert(u);
1667
1668         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1669                 return 0;
1670
1671         /* So, here's the deal: when realizing the cgroups for this
1672          * unit, we need to first create all parents, but there's more
1673          * actually: for the weight-based controllers we also need to
1674          * make sure that all our siblings (i.e. units that are in the
1675          * same slice as we are) have cgroups, too. Otherwise, things
1676          * would become very uneven as each of their processes would
1677          * get as much resources as all our group together. This call
1678          * will synchronously create the parent cgroups, but will
1679          * defer work on the siblings to the next event loop
1680          * iteration. */
1681
1682         /* Add all sibling slices to the cgroup queue. */
1683         unit_add_siblings_to_cgroup_realize_queue(u);
1684
1685         /* And realize this one now (and apply the values) */
1686         return unit_realize_cgroup_now(u, manager_state(u->manager));
1687 }
1688
1689 void unit_release_cgroup(Unit *u) {
1690         assert(u);
1691
1692         /* Forgets all cgroup details for this cgroup */
1693
1694         if (u->cgroup_path) {
1695                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1696                 u->cgroup_path = mfree(u->cgroup_path);
1697         }
1698
1699         if (u->cgroup_inotify_wd >= 0) {
1700                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1701                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1702
1703                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1704                 u->cgroup_inotify_wd = -1;
1705         }
1706 }
1707
1708 void unit_prune_cgroup(Unit *u) {
1709         int r;
1710         bool is_root_slice;
1711
1712         assert(u);
1713
1714         /* Removes the cgroup, if empty and possible, and stops watching it. */
1715
1716         if (!u->cgroup_path)
1717                 return;
1718
1719         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1720
1721         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1722
1723         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1724         if (r < 0) {
1725                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1726                 return;
1727         }
1728
1729         if (is_root_slice)
1730                 return;
1731
1732         unit_release_cgroup(u);
1733
1734         u->cgroup_realized = false;
1735         u->cgroup_realized_mask = 0;
1736         u->cgroup_enabled_mask = 0;
1737 }
1738
1739 int unit_search_main_pid(Unit *u, pid_t *ret) {
1740         _cleanup_fclose_ FILE *f = NULL;
1741         pid_t pid = 0, npid, mypid;
1742         int r;
1743
1744         assert(u);
1745         assert(ret);
1746
1747         if (!u->cgroup_path)
1748                 return -ENXIO;
1749
1750         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1751         if (r < 0)
1752                 return r;
1753
1754         mypid = getpid_cached();
1755         while (cg_read_pid(f, &npid) > 0)  {
1756                 pid_t ppid;
1757
1758                 if (npid == pid)
1759                         continue;
1760
1761                 /* Ignore processes that aren't our kids */
1762                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1763                         continue;
1764
1765                 if (pid != 0)
1766                         /* Dang, there's more than one daemonized PID
1767                         in this group, so we don't know what process
1768                         is the main process. */
1769
1770                         return -ENODATA;
1771
1772                 pid = npid;
1773         }
1774
1775         *ret = pid;
1776         return 0;
1777 }
1778
1779 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1780         _cleanup_closedir_ DIR *d = NULL;
1781         _cleanup_fclose_ FILE *f = NULL;
1782         int ret = 0, r;
1783
1784         assert(u);
1785         assert(path);
1786
1787         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1788         if (r < 0)
1789                 ret = r;
1790         else {
1791                 pid_t pid;
1792
1793                 while ((r = cg_read_pid(f, &pid)) > 0) {
1794                         r = unit_watch_pid(u, pid);
1795                         if (r < 0 && ret >= 0)
1796                                 ret = r;
1797                 }
1798
1799                 if (r < 0 && ret >= 0)
1800                         ret = r;
1801         }
1802
1803         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1804         if (r < 0) {
1805                 if (ret >= 0)
1806                         ret = r;
1807         } else {
1808                 char *fn;
1809
1810                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1811                         _cleanup_free_ char *p = NULL;
1812
1813                         p = strjoin(path, "/", fn);
1814                         free(fn);
1815
1816                         if (!p)
1817                                 return -ENOMEM;
1818
1819                         r = unit_watch_pids_in_path(u, p);
1820                         if (r < 0 && ret >= 0)
1821                                 ret = r;
1822                 }
1823
1824                 if (r < 0 && ret >= 0)
1825                         ret = r;
1826         }
1827
1828         return ret;
1829 }
1830
1831 int unit_watch_all_pids(Unit *u) {
1832         int r;
1833
1834         assert(u);
1835
1836         /* Adds all PIDs from our cgroup to the set of PIDs we
1837          * watch. This is a fallback logic for cases where we do not
1838          * get reliable cgroup empty notifications: we try to use
1839          * SIGCHLD as replacement. */
1840
1841         if (!u->cgroup_path)
1842                 return -ENOENT;
1843
1844         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1845         if (r < 0)
1846                 return r;
1847         if (r > 0) /* On unified we can use proper notifications */
1848                 return 0;
1849
1850         return unit_watch_pids_in_path(u, u->cgroup_path);
1851 }
1852
1853 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1854         Manager *m = userdata;
1855         Unit *u;
1856         int r;
1857
1858         assert(s);
1859         assert(m);
1860
1861         u = m->cgroup_empty_queue;
1862         if (!u)
1863                 return 0;
1864
1865         assert(u->in_cgroup_empty_queue);
1866         u->in_cgroup_empty_queue = false;
1867         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1868
1869         if (m->cgroup_empty_queue) {
1870                 /* More stuff queued, let's make sure we remain enabled */
1871                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1872                 if (r < 0)
1873                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1874         }
1875
1876         unit_add_to_gc_queue(u);
1877
1878         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1879                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1880
1881         return 0;
1882 }
1883
1884 void unit_add_to_cgroup_empty_queue(Unit *u) {
1885         int r;
1886
1887         assert(u);
1888
1889         /* Note that there are four different ways how cgroup empty events reach us:
1890          *
1891          * 1. On the unified hierarchy we get an inotify event on the cgroup
1892          *
1893          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1894          *
1895          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1896          *
1897          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1898          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1899          *
1900          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1901          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1902          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1903          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1904          * case for scope units). */
1905
1906         if (u->in_cgroup_empty_queue)
1907                 return;
1908
1909         /* Let's verify that the cgroup is really empty */
1910         if (!u->cgroup_path)
1911                 return;
1912         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1913         if (r < 0) {
1914                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1915                 return;
1916         }
1917         if (r == 0)
1918                 return;
1919
1920         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1921         u->in_cgroup_empty_queue = true;
1922
1923         /* Trigger the defer event */
1924         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1925         if (r < 0)
1926                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1927 }
1928
1929 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1930         Manager *m = userdata;
1931
1932         assert(s);
1933         assert(fd >= 0);
1934         assert(m);
1935
1936         for (;;) {
1937                 union inotify_event_buffer buffer;
1938                 struct inotify_event *e;
1939                 ssize_t l;
1940
1941                 l = read(fd, &buffer, sizeof(buffer));
1942                 if (l < 0) {
1943                         if (IN_SET(errno, EINTR, EAGAIN))
1944                                 return 0;
1945
1946                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1947                 }
1948
1949                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1950                         Unit *u;
1951
1952                         if (e->wd < 0)
1953                                 /* Queue overflow has no watch descriptor */
1954                                 continue;
1955
1956                         if (e->mask & IN_IGNORED)
1957                                 /* The watch was just removed */
1958                                 continue;
1959
1960                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1961                         if (!u) /* Not that inotify might deliver
1962                                  * events for a watch even after it
1963                                  * was removed, because it was queued
1964                                  * before the removal. Let's ignore
1965                                  * this here safely. */
1966                                 continue;
1967
1968                         unit_add_to_cgroup_empty_queue(u);
1969                 }
1970         }
1971 }
1972 #endif // 0
1973
1974 int manager_setup_cgroup(Manager *m) {
1975         _cleanup_free_ char *path = NULL;
1976         const char *scope_path;
1977         CGroupController c;
1978         int r, all_unified;
1979 #if 0 /// UNNEEDED by elogind
1980         char *e;
1981 #endif // 0
1982
1983         assert(m);
1984
1985         /* 1. Determine hierarchy */
1986         m->cgroup_root = mfree(m->cgroup_root);
1987 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
1988         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1989 #else
1990         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
1991 #endif // 0
1992         if (r < 0)
1993                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1994
1995 #if 0 /// elogind does not support systemd scopes and slices
1996         /* Chop off the init scope, if we are already located in it */
1997         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1998
1999         /* LEGACY: Also chop off the system slice if we are in
2000          * it. This is to support live upgrades from older systemd
2001          * versions where PID 1 was moved there. Also see
2002          * cg_get_root_path(). */
2003         if (!e && MANAGER_IS_SYSTEM(m)) {
2004                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2005                 if (!e)
2006                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2007         }
2008         if (e)
2009                 *e = 0;
2010 #endif // 0
2011
2012         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2013                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2014         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2015          * easily prepend it everywhere. */
2016         delete_trailing_chars(m->cgroup_root, "/");
2017
2018         /* 2. Show data */
2019         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2020         if (r < 0)
2021                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2022
2023         r = cg_unified_flush();
2024         if (r < 0)
2025                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2026
2027         all_unified = cg_all_unified();
2028         if (all_unified < 0)
2029                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2030         if (all_unified > 0)
2031                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2032         else {
2033                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2034                 if (r < 0)
2035                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2036                 if (r > 0)
2037                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2038                 else
2039                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2040         }
2041
2042 #if 0 /// elogind is not init, and does not install the agent here.
2043         /* 3. Allocate cgroup empty defer event source */
2044         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2045         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2046         if (r < 0)
2047                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2048
2049         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2050         if (r < 0)
2051                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2052
2053         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2054         if (r < 0)
2055                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2056
2057         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2058
2059         /* 4. Install notifier inotify object, or agent */
2060         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2061
2062                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2063
2064                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2065                 safe_close(m->cgroup_inotify_fd);
2066
2067                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2068                 if (m->cgroup_inotify_fd < 0)
2069                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2070
2071                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2072                 if (r < 0)
2073                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2074
2075                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2076                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2077                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2078                 if (r < 0)
2079                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2080
2081                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2082
2083         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2084
2085                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2086                  * since it does not generate events when control groups with children run empty. */
2087
2088                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2089                 if (r < 0)
2090                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2091                 else if (r > 0)
2092                         log_debug("Installed release agent.");
2093                 else if (r == 0)
2094                         log_debug("Release agent already installed.");
2095         }
2096
2097         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2098         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2099         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2100 #else
2101         /* Note:
2102                 * This method is in core, and normally called by systemd
2103                 * being init. As elogind is never init, we can not install
2104                 * our agent here. We do so when mounting our cgroup file
2105                 * system, so only if elogind is its own tiny controller.
2106                 * Further, elogind is not meant to run in systemd init scope. */
2107         if (MANAGER_IS_SYSTEM(m))
2108                 // we are our own cgroup controller
2109                 scope_path = strjoina("");
2110         else if (streq(m->cgroup_root, "/elogind"))
2111                 // root already is our cgroup
2112                 scope_path = strjoina(m->cgroup_root);
2113         else
2114                 // we have to create our own group
2115                 scope_path = strjoina(m->cgroup_root, "/elogind");
2116         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2117 #endif // 0
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2120         log_debug_elogind("Created control group \"%s\"", scope_path);
2121
2122 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2123         /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2124         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2125         if (r < 0)
2126                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2127 #endif // 0
2128
2129         /* 6. And pin it, so that it cannot be unmounted */
2130         safe_close(m->pin_cgroupfs_fd);
2131         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2132         if (m->pin_cgroupfs_fd < 0)
2133                 return log_error_errno(errno, "Failed to open pin file: %m");
2134
2135         /* 7. Always enable hierarchical support if it exists... */
2136         if (!all_unified && m->test_run_flags == 0)
2137                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2138
2139         /* 8. Figure out which controllers are supported, and log about it */
2140         r = cg_mask_supported(&m->cgroup_supported);
2141         if (r < 0)
2142                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2143         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2144                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2145
2146         return 0;
2147 }
2148
2149 void manager_shutdown_cgroup(Manager *m, bool delete) {
2150         assert(m);
2151
2152         /* We can't really delete the group, since we are in it. But
2153          * let's trim it. */
2154         if (delete && m->cgroup_root)
2155                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2156
2157 #if 0 /// elogind is not init
2158         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2159
2160         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2161
2162         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2163         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2164 #endif // 0
2165
2166         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2167
2168         m->cgroup_root = mfree(m->cgroup_root);
2169 }
2170
2171 #if 0 /// UNNEEDED by elogind
2172 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2173         char *p;
2174         Unit *u;
2175
2176         assert(m);
2177         assert(cgroup);
2178
2179         u = hashmap_get(m->cgroup_unit, cgroup);
2180         if (u)
2181                 return u;
2182
2183         p = strdupa(cgroup);
2184         for (;;) {
2185                 char *e;
2186
2187                 e = strrchr(p, '/');
2188                 if (!e || e == p)
2189                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2190
2191                 *e = 0;
2192
2193                 u = hashmap_get(m->cgroup_unit, p);
2194                 if (u)
2195                         return u;
2196         }
2197 }
2198
2199 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2200         _cleanup_free_ char *cgroup = NULL;
2201         int r;
2202
2203         assert(m);
2204
2205         if (pid <= 0)
2206                 return NULL;
2207
2208         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2209         if (r < 0)
2210                 return NULL;
2211
2212         return manager_get_unit_by_cgroup(m, cgroup);
2213 }
2214
2215 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2216         Unit *u;
2217
2218         assert(m);
2219
2220         if (pid <= 0)
2221                 return NULL;
2222
2223         if (pid == 1)
2224                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2225
2226         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2227         if (u)
2228                 return u;
2229
2230         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2231         if (u)
2232                 return u;
2233
2234         return manager_get_unit_by_pid_cgroup(m, pid);
2235 }
2236 #endif // 0
2237
2238 #if 0 /// elogind must substitute this with its own variant
2239 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2240         Unit *u;
2241
2242         assert(m);
2243         assert(cgroup);
2244
2245         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2246          * or from the --system instance */
2247
2248         log_debug("Got cgroup empty notification for: %s", cgroup);
2249
2250         u = manager_get_unit_by_cgroup(m, cgroup);
2251         if (!u)
2252                 return 0;
2253
2254         unit_add_to_cgroup_empty_queue(u);
2255         return 1;
2256 }
2257 #else
2258 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2259         Session *s;
2260
2261         assert(m);
2262         assert(cgroup);
2263
2264         log_debug("Got cgroup empty notification for: %s", cgroup);
2265
2266         s = hashmap_get(m->sessions, cgroup);
2267
2268         if (s) {
2269                 session_finalize(s);
2270                 session_free(s);
2271         } else
2272                 log_warning("Session not found: %s", cgroup);
2273
2274         return 0;
2275 }
2276 #endif // 0
2277 #if 0 /// UNNEEDED by elogind
2278 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2279         _cleanup_free_ char *v = NULL;
2280         int r;
2281
2282         assert(u);
2283         assert(ret);
2284
2285         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2286                 return -ENODATA;
2287
2288         if (!u->cgroup_path)
2289                 return -ENODATA;
2290
2291         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2292                 return -ENODATA;
2293
2294         r = cg_all_unified();
2295         if (r < 0)
2296                 return r;
2297         if (r > 0)
2298                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2299         else
2300                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2301         if (r == -ENOENT)
2302                 return -ENODATA;
2303         if (r < 0)
2304                 return r;
2305
2306         return safe_atou64(v, ret);
2307 }
2308
2309 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2310         _cleanup_free_ char *v = NULL;
2311         int r;
2312
2313         assert(u);
2314         assert(ret);
2315
2316         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2317                 return -ENODATA;
2318
2319         if (!u->cgroup_path)
2320                 return -ENODATA;
2321
2322         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2323                 return -ENODATA;
2324
2325         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2326         if (r == -ENOENT)
2327                 return -ENODATA;
2328         if (r < 0)
2329                 return r;
2330
2331         return safe_atou64(v, ret);
2332 }
2333
2334 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2335         _cleanup_free_ char *v = NULL;
2336         uint64_t ns;
2337         int r;
2338
2339         assert(u);
2340         assert(ret);
2341
2342         if (!u->cgroup_path)
2343                 return -ENODATA;
2344
2345         r = cg_all_unified();
2346         if (r < 0)
2347                 return r;
2348         if (r > 0) {
2349                 const char *keys[] = { "usage_usec", NULL };
2350                 _cleanup_free_ char *val = NULL;
2351                 uint64_t us;
2352
2353                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2354                         return -ENODATA;
2355
2356                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2357                 if (r < 0)
2358                         return r;
2359
2360                 r = safe_atou64(val, &us);
2361                 if (r < 0)
2362                         return r;
2363
2364                 ns = us * NSEC_PER_USEC;
2365         } else {
2366                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2367                         return -ENODATA;
2368
2369                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2370                 if (r == -ENOENT)
2371                         return -ENODATA;
2372                 if (r < 0)
2373                         return r;
2374
2375                 r = safe_atou64(v, &ns);
2376                 if (r < 0)
2377                         return r;
2378         }
2379
2380         *ret = ns;
2381         return 0;
2382 }
2383
2384 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2385         nsec_t ns;
2386         int r;
2387
2388         assert(u);
2389
2390         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2391          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2392          * call this function with a NULL return value. */
2393
2394         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2395                 return -ENODATA;
2396
2397         r = unit_get_cpu_usage_raw(u, &ns);
2398         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2399                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2400                  * cached value. */
2401
2402                 if (ret)
2403                         *ret = u->cpu_usage_last;
2404                 return 0;
2405         }
2406         if (r < 0)
2407                 return r;
2408
2409         if (ns > u->cpu_usage_base)
2410                 ns -= u->cpu_usage_base;
2411         else
2412                 ns = 0;
2413
2414         u->cpu_usage_last = ns;
2415         if (ret)
2416                 *ret = ns;
2417
2418         return 0;
2419 }
2420
2421 int unit_get_ip_accounting(
2422                 Unit *u,
2423                 CGroupIPAccountingMetric metric,
2424                 uint64_t *ret) {
2425
2426         uint64_t value;
2427         int fd, r;
2428
2429         assert(u);
2430         assert(metric >= 0);
2431         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2432         assert(ret);
2433
2434         /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2435          * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2436          * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2437          * filters. */
2438         if (u->type == UNIT_SLICE)
2439                 return -ENODATA;
2440
2441         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2442                 return -ENODATA;
2443
2444         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2445                 u->ip_accounting_ingress_map_fd :
2446                 u->ip_accounting_egress_map_fd;
2447         if (fd < 0)
2448                 return -ENODATA;
2449
2450         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2451                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2452         else
2453                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2454         if (r < 0)
2455                 return r;
2456
2457         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2458          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2459          * ip_accounting_extra[] field, and add them in here transparently. */
2460
2461         *ret = value + u->ip_accounting_extra[metric];
2462
2463         return r;
2464 }
2465
2466 int unit_reset_cpu_accounting(Unit *u) {
2467         nsec_t ns;
2468         int r;
2469
2470         assert(u);
2471
2472         u->cpu_usage_last = NSEC_INFINITY;
2473
2474         r = unit_get_cpu_usage_raw(u, &ns);
2475         if (r < 0) {
2476                 u->cpu_usage_base = 0;
2477                 return r;
2478         }
2479
2480         u->cpu_usage_base = ns;
2481         return 0;
2482 }
2483
2484 int unit_reset_ip_accounting(Unit *u) {
2485         int r = 0, q = 0;
2486
2487         assert(u);
2488
2489         if (u->ip_accounting_ingress_map_fd >= 0)
2490                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2491
2492         if (u->ip_accounting_egress_map_fd >= 0)
2493                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2494
2495         zero(u->ip_accounting_extra);
2496
2497         return r < 0 ? r : q;
2498 }
2499
2500 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2501         assert(u);
2502
2503         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2504                 return;
2505
2506         if (m == 0)
2507                 return;
2508
2509         /* always invalidate compat pairs together */
2510         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2511                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2512
2513         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2514                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2515
2516         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2517                 return;
2518
2519         u->cgroup_realized_mask &= ~m;
2520         unit_add_to_cgroup_realize_queue(u);
2521 }
2522
2523 void unit_invalidate_cgroup_bpf(Unit *u) {
2524         assert(u);
2525
2526         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2527                 return;
2528
2529         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2530                 return;
2531
2532         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2533         unit_add_to_cgroup_realize_queue(u);
2534
2535         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2536          * list of our children includes our own. */
2537         if (u->type == UNIT_SLICE) {
2538                 Unit *member;
2539                 Iterator i;
2540                 void *v;
2541
2542                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2543                         if (member == u)
2544                                 continue;
2545
2546                         if (UNIT_DEREF(member->slice) != u)
2547                                 continue;
2548
2549                         unit_invalidate_cgroup_bpf(member);
2550                 }
2551         }
2552 }
2553
2554 void manager_invalidate_startup_units(Manager *m) {
2555         Iterator i;
2556         Unit *u;
2557
2558         assert(m);
2559
2560         SET_FOREACH(u, m->startup_units, i)
2561                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2562 }
2563
/* String names of the CGroupDevicePolicy values, indexed by the enum value itself. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

/* NOTE(review): presumably expands to the string <-> CGroupDevicePolicy lookup helpers backed by the table
 * above (macro comes from string-table.h) — confirm against that header. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2571 #endif // 0