src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "process-util.h"
  35 //#include "procfs-util.h"
  36 //#include "special.h"
  37 #include "stdio-util.h"
  38 #include "string-table.h"
  39 #include "string-util.h"
  40
  41 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  42
  43 bool unit_has_root_cgroup(Unit *u) {
  44         assert(u);
  45
  46         /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
  47          * as inside of containers the root slice won't be identical to the root cgroup. */
  48
  49         if (!u->cgroup_path)
  50                 return false;
  51
  52         return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
  53 }
  54
  55 #if 0 /// UNNEEDED by elogind
  56 static void cgroup_compat_warn(void) {
  57         static bool cgroup_compat_warned = false;
  58
  59         if (cgroup_compat_warned)
  60                 return;
  61
  62         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  63         cgroup_compat_warned = true;
  64 }
  65
  66 #define log_cgroup_compat(unit, fmt, ...) do {                                  \
  67                 cgroup_compat_warn();                                           \
  68                 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
  69         } while (false)
  70
  71 void cgroup_context_init(CGroupContext *c) {
  72         assert(c);
  73
  74         /* Initialize everything to the kernel defaults, assuming the
  75          * structure is preinitialized to 0 */
  76
  77         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  78         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  79         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  80
  81         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  82         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  83
  84         c->memory_high = CGROUP_LIMIT_MAX;
  85         c->memory_max = CGROUP_LIMIT_MAX;
  86         c->memory_swap_max = CGROUP_LIMIT_MAX;
  87
  88         c->memory_limit = CGROUP_LIMIT_MAX;
  89
  90         c->io_weight = CGROUP_WEIGHT_INVALID;
  91         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  92
  93         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  94         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  95
  96         c->tasks_max = (uint64_t) -1;
  97 }
  98
  99 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 100         assert(c);
 101         assert(a);
 102
 103         LIST_REMOVE(device_allow, c->device_allow, a);
 104         free(a->path);
 105         free(a);
 106 }
 107
 108 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 109         assert(c);
 110         assert(w);
 111
 112         LIST_REMOVE(device_weights, c->io_device_weights, w);
 113         free(w->path);
 114         free(w);
 115 }
 116
 117 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 118         assert(c);
 119         assert(l);
 120
 121         LIST_REMOVE(device_limits, c->io_device_limits, l);
 122         free(l->path);
 123         free(l);
 124 }
 125
 126 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 127         assert(c);
 128         assert(w);
 129
 130         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 131         free(w->path);
 132         free(w);
 133 }
 134
 135 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 136         assert(c);
 137         assert(b);
 138
 139         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 140         free(b->path);
 141         free(b);
 142 }
 143
 144 void cgroup_context_done(CGroupContext *c) {
 145         assert(c);
 146
 147         while (c->io_device_weights)
 148                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 149
 150         while (c->io_device_limits)
 151                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 152
 153         while (c->blockio_device_weights)
 154                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 155
 156         while (c->blockio_device_bandwidths)
 157                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 158
 159         while (c->device_allow)
 160                 cgroup_context_free_device_allow(c, c->device_allow);
 161
 162         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 163         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 164 }
 165
 166 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 167         CGroupIODeviceLimit *il;
 168         CGroupIODeviceWeight *iw;
 169         CGroupBlockIODeviceBandwidth *b;
 170         CGroupBlockIODeviceWeight *w;
 171         CGroupDeviceAllow *a;
 172         IPAddressAccessItem *iaai;
 173         char u[FORMAT_TIMESPAN_MAX];
 174
 175         assert(c);
 176         assert(f);
 177
 178         prefix = strempty(prefix);
 179
 180         fprintf(f,
 181                 "%sCPUAccounting=%s\n"
 182                 "%sIOAccounting=%s\n"
 183                 "%sBlockIOAccounting=%s\n"
 184                 "%sMemoryAccounting=%s\n"
 185                 "%sTasksAccounting=%s\n"
 186                 "%sIPAccounting=%s\n"
 187                 "%sCPUWeight=%" PRIu64 "\n"
 188                 "%sStartupCPUWeight=%" PRIu64 "\n"
 189                 "%sCPUShares=%" PRIu64 "\n"
 190                 "%sStartupCPUShares=%" PRIu64 "\n"
 191                 "%sCPUQuotaPerSecSec=%s\n"
 192                 "%sIOWeight=%" PRIu64 "\n"
 193                 "%sStartupIOWeight=%" PRIu64 "\n"
 194                 "%sBlockIOWeight=%" PRIu64 "\n"
 195                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 196                 "%sMemoryLow=%" PRIu64 "\n"
 197                 "%sMemoryHigh=%" PRIu64 "\n"
 198                 "%sMemoryMax=%" PRIu64 "\n"
 199                 "%sMemorySwapMax=%" PRIu64 "\n"
 200                 "%sMemoryLimit=%" PRIu64 "\n"
 201                 "%sTasksMax=%" PRIu64 "\n"
 202                 "%sDevicePolicy=%s\n"
 203                 "%sDelegate=%s\n",
 204                 prefix, yes_no(c->cpu_accounting),
 205                 prefix, yes_no(c->io_accounting),
 206                 prefix, yes_no(c->blockio_accounting),
 207                 prefix, yes_no(c->memory_accounting),
 208                 prefix, yes_no(c->tasks_accounting),
 209                 prefix, yes_no(c->ip_accounting),
 210                 prefix, c->cpu_weight,
 211                 prefix, c->startup_cpu_weight,
 212                 prefix, c->cpu_shares,
 213                 prefix, c->startup_cpu_shares,
 214                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 215                 prefix, c->io_weight,
 216                 prefix, c->startup_io_weight,
 217                 prefix, c->blockio_weight,
 218                 prefix, c->startup_blockio_weight,
 219                 prefix, c->memory_low,
 220                 prefix, c->memory_high,
 221                 prefix, c->memory_max,
 222                 prefix, c->memory_swap_max,
 223                 prefix, c->memory_limit,
 224                 prefix, c->tasks_max,
 225                 prefix, cgroup_device_policy_to_string(c->device_policy),
 226                 prefix, yes_no(c->delegate));
 227
 228         if (c->delegate) {
 229                 _cleanup_free_ char *t = NULL;
 230
 231                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 232
 233                 fprintf(f, "%sDelegateControllers=%s\n",
 234                         prefix,
 235                         strempty(t));
 236         }
 237
 238         LIST_FOREACH(device_allow, a, c->device_allow)
 239                 fprintf(f,
 240                         "%sDeviceAllow=%s %s%s%s\n",
 241                         prefix,
 242                         a->path,
 243                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 244
 245         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 246                 fprintf(f,
 247                         "%sIODeviceWeight=%s %" PRIu64,
 248                         prefix,
 249                         iw->path,
 250                         iw->weight);
 251
 252         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 253                 char buf[FORMAT_BYTES_MAX];
 254                 CGroupIOLimitType type;
 255
 256                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 257                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 258                                 fprintf(f,
 259                                         "%s%s=%s %s\n",
 260                                         prefix,
 261                                         cgroup_io_limit_type_to_string(type),
 262                                         il->path,
 263                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 264         }
 265
 266         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 267                 fprintf(f,
 268                         "%sBlockIODeviceWeight=%s %" PRIu64,
 269                         prefix,
 270                         w->path,
 271                         w->weight);
 272
 273         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 274                 char buf[FORMAT_BYTES_MAX];
 275
 276                 if (b->rbps != CGROUP_LIMIT_MAX)
 277                         fprintf(f,
 278                                 "%sBlockIOReadBandwidth=%s %s\n",
 279                                 prefix,
 280                                 b->path,
 281                                 format_bytes(buf, sizeof(buf), b->rbps));
 282                 if (b->wbps != CGROUP_LIMIT_MAX)
 283                         fprintf(f,
 284                                 "%sBlockIOWriteBandwidth=%s %s\n",
 285                                 prefix,
 286                                 b->path,
 287                                 format_bytes(buf, sizeof(buf), b->wbps));
 288         }
 289
 290         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 291                 _cleanup_free_ char *k = NULL;
 292
 293                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 294                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 295         }
 296
 297         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 298                 _cleanup_free_ char *k = NULL;
 299
 300                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 301                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 302         }
 303 }
 304
 305 static int lookup_block_device(const char *p, dev_t *dev) {
 306         struct stat st;
 307         int r;
 308
 309         assert(p);
 310         assert(dev);
 311
 312         r = stat(p, &st);
 313         if (r < 0)
 314                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 315
 316         if (S_ISBLK(st.st_mode))
 317                 *dev = st.st_rdev;
 318         else if (major(st.st_dev) != 0) {
 319                 /* If this is not a device node then find the block
 320                  * device this file is stored on */
 321                 *dev = st.st_dev;
 322
 323                 /* If this is a partition, try to get the originating
 324                  * block device */
 325                 (void) block_get_whole_disk(*dev, dev);
 326         } else {
 327                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 328                 return -ENODEV;
 329         }
 330
 331         return 0;
 332 }
 333
 334 static int whitelist_device(const char *path, const char *node, const char *acc) {
 335         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 336         struct stat st;
 337         bool ignore_notfound;
 338         int r;
 339
 340         assert(path);
 341         assert(acc);
 342
 343         if (node[0] == '-') {
 344                 /* Non-existent paths starting with "-" must be silently ignored */
 345                 node++;
 346                 ignore_notfound = true;
 347         } else
 348                 ignore_notfound = false;
 349
 350         if (stat(node, &st) < 0) {
 351                 if (errno == ENOENT && ignore_notfound)
 352                         return 0;
 353
 354                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 355         }
 356
 357         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 358                 log_warning("%s is not a device.", node);
 359                 return -ENODEV;
 360         }
 361
 362         sprintf(buf,
 363                 "%c %u:%u %s",
 364                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 365                 major(st.st_rdev), minor(st.st_rdev),
 366                 acc);
 367
 368         r = cg_set_attribute("devices", path, "devices.allow", buf);
 369         if (r < 0)
 370                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 371                                "Failed to set devices.allow on %s: %m", path);
 372
 373         return r;
 374 }
 375
 376 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 377         _cleanup_fclose_ FILE *f = NULL;
 378         char line[LINE_MAX];
 379         bool good = false;
 380         int r;
 381
 382         assert(path);
 383         assert(acc);
 384         assert(IN_SET(type, 'b', 'c'));
 385
 386         f = fopen("/proc/devices", "re");
 387         if (!f)
 388                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 389
 390         FOREACH_LINE(line, f, goto fail) {
 391                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 392                 unsigned maj;
 393
 394                 truncate_nl(line);
 395
 396                 if (type == 'c' && streq(line, "Character devices:")) {
 397                         good = true;
 398                         continue;
 399                 }
 400
 401                 if (type == 'b' && streq(line, "Block devices:")) {
 402                         good = true;
 403                         continue;
 404                 }
 405
 406                 if (isempty(line)) {
 407                         good = false;
 408                         continue;
 409                 }
 410
 411                 if (!good)
 412                         continue;
 413
 414                 p = strstrip(line);
 415
 416                 w = strpbrk(p, WHITESPACE);
 417                 if (!w)
 418                         continue;
 419                 *w = 0;
 420
 421                 r = safe_atou(p, &maj);
 422                 if (r < 0)
 423                         continue;
 424                 if (maj <= 0)
 425                         continue;
 426
 427                 w++;
 428                 w += strspn(w, WHITESPACE);
 429
 430                 if (fnmatch(name, w, 0) != 0)
 431                         continue;
 432
 433                 sprintf(buf,
 434                         "%c %u:* %s",
 435                         type,
 436                         maj,
 437                         acc);
 438
 439                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 440                 if (r < 0)
 441                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 442                                        "Failed to set devices.allow on %s: %m", path);
 443         }
 444
 445         return 0;
 446
 447 fail:
 448         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 449 }
 450
 451 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 452         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 453                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 454 }
 455
 456 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 457         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 458                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 459 }
 460
 461 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 462         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 463             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 464                 return c->startup_cpu_weight;
 465         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 466                 return c->cpu_weight;
 467         else
 468                 return CGROUP_WEIGHT_DEFAULT;
 469 }
 470
 471 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 472         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 473             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 474                 return c->startup_cpu_shares;
 475         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 476                 return c->cpu_shares;
 477         else
 478                 return CGROUP_CPU_SHARES_DEFAULT;
 479 }
 480
 481 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 482         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 483         int r;
 484
 485         xsprintf(buf, "%" PRIu64 "\n", weight);
 486         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 487         if (r < 0)
 488                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 489                               "Failed to set cpu.weight: %m");
 490
 491         if (quota != USEC_INFINITY)
 492                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 493                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 494         else
 495                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 496
 497         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 498
 499         if (r < 0)
 500                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 501                               "Failed to set cpu.max: %m");
 502 }
 503
 504 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 505         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 506         int r;
 507
 508         xsprintf(buf, "%" PRIu64 "\n", shares);
 509         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 510         if (r < 0)
 511                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 512                               "Failed to set cpu.shares: %m");
 513
 514         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 515         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 516         if (r < 0)
 517                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 518                               "Failed to set cpu.cfs_period_us: %m");
 519
 520         if (quota != USEC_INFINITY) {
 521                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 522                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 523         } else
 524                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 525         if (r < 0)
 526                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 527                               "Failed to set cpu.cfs_quota_us: %m");
 528 }
 529
 530 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 531         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 532                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 533 }
 534
 535 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 536         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 537                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 538 }
 539
 540 static bool cgroup_context_has_io_config(CGroupContext *c) {
 541         return c->io_accounting ||
 542                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 543                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 544                 c->io_device_weights ||
 545                 c->io_device_limits;
 546 }
 547
 548 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 549         return c->blockio_accounting ||
 550                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 551                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 552                 c->blockio_device_weights ||
 553                 c->blockio_device_bandwidths;
 554 }
 555
 556 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 557         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 558             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 559                 return c->startup_io_weight;
 560         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 561                 return c->io_weight;
 562         else
 563                 return CGROUP_WEIGHT_DEFAULT;
 564 }
 565
 566 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 567         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 568             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 569                 return c->startup_blockio_weight;
 570         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 571                 return c->blockio_weight;
 572         else
 573                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 574 }
 575
 576 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 577         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 578                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 579 }
 580
 581 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 582         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 583                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 584 }
 585
 586 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 587         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 588         dev_t dev;
 589         int r;
 590
 591         r = lookup_block_device(dev_path, &dev);
 592         if (r < 0)
 593                 return;
 594
 595         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 596         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 597         if (r < 0)
 598                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 599                               "Failed to set io.weight: %m");
 600 }
 601
 602 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 603         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 604         dev_t dev;
 605         int r;
 606
 607         r = lookup_block_device(dev_path, &dev);
 608         if (r < 0)
 609                 return;
 610
 611         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 612         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 613         if (r < 0)
 614                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 615                               "Failed to set blkio.weight_device: %m");
 616 }
 617
 618 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 619         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 620         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 621         CGroupIOLimitType type;
 622         dev_t dev;
 623         unsigned n = 0;
 624         int r;
 625
 626         r = lookup_block_device(dev_path, &dev);
 627         if (r < 0)
 628                 return 0;
 629
 630         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 631                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 632                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 633                         n++;
 634                 } else {
 635                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 636                 }
 637         }
 638
 639         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 640                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 641                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 642         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 643         if (r < 0)
 644                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 645                               "Failed to set io.max: %m");
 646         return n;
 647 }
 648
 649 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 650         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 651         dev_t dev;
 652         unsigned n = 0;
 653         int r;
 654
 655         r = lookup_block_device(dev_path, &dev);
 656         if (r < 0)
 657                 return 0;
 658
 659         if (rbps != CGROUP_LIMIT_MAX)
 660                 n++;
 661         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 662         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 663         if (r < 0)
 664                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 665                               "Failed to set blkio.throttle.read_bps_device: %m");
 666
 667         if (wbps != CGROUP_LIMIT_MAX)
 668                 n++;
 669         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 670         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 671         if (r < 0)
 672                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 673                               "Failed to set blkio.throttle.write_bps_device: %m");
 674
 675         return n;
 676 }
 677
 678 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 679         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 680 }
 681
 682 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 683         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 684         int r;
 685
 686         if (v != CGROUP_LIMIT_MAX)
 687                 xsprintf(buf, "%" PRIu64 "\n", v);
 688
 689         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 690         if (r < 0)
 691                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 692                               "Failed to set %s: %m", file);
 693 }
 694
 695 static void cgroup_apply_firewall(Unit *u) {
 696         int r;
 697
 698         assert(u);
 699
 700         if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
 701                                     * not recursive we don't ever touch the bpf on them */
 702                 return;
 703
 704         r = bpf_firewall_compile(u);
 705         if (r < 0)
 706                 return;
 707
 708         (void) bpf_firewall_install(u);
 709         return;
 710 }
 711
 712 static void cgroup_context_apply(
 713                 Unit *u,
 714                 CGroupMask apply_mask,
 715                 bool apply_bpf,
 716                 ManagerState state) {
 717
 718         const char *path;
 719         CGroupContext *c;
 720         bool is_root;
 721         int r;
 722
 723         assert(u);
 724
 725         /* Nothing to do? Exit early! */
 726         if (apply_mask == 0 && !apply_bpf)
 727                 return;
 728
 729         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
 730         is_root = unit_has_root_cgroup(u);
 731
 732         assert_se(c = unit_get_cgroup_context(u));
 733         assert_se(path = u->cgroup_path);
 734
 735         if (is_root) /* Make sure we don't try to display messages with an empty path. */
 736                 path = "/";
 737
 738         /* We generally ignore errors caused by read-only mounted
 739          * cgroup trees (assuming we are running in a container then),
 740          * and missing cgroups, i.e. EROFS and ENOENT. */
 741
 742         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 743                 bool has_weight, has_shares;
 744
 745                 has_weight = cgroup_context_has_cpu_weight(c);
 746                 has_shares = cgroup_context_has_cpu_shares(c);
 747
 748                 if (cg_all_unified() > 0) {
 749                         uint64_t weight;
 750
 751                         if (has_weight)
 752                                 weight = cgroup_context_cpu_weight(c, state);
 753                         else if (has_shares) {
 754                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 755
 756                                 weight = cgroup_cpu_shares_to_weight(shares);
 757
 758                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 759                                                   shares, weight, path);
 760                         } else
 761                                 weight = CGROUP_WEIGHT_DEFAULT;
 762
 763                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 764                 } else {
 765                         uint64_t shares;
 766
 767                         if (has_weight) {
 768                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 769
 770                                 shares = cgroup_cpu_weight_to_shares(weight);
 771
 772                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 773                                                   weight, shares, path);
 774                         } else if (has_shares)
 775                                 shares = cgroup_context_cpu_shares(c, state);
 776                         else
 777                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 778
 779                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 780                 }
 781         }
 782
 783         if (apply_mask & CGROUP_MASK_IO) {
 784                 bool has_io = cgroup_context_has_io_config(c);
 785                 bool has_blockio = cgroup_context_has_blockio_config(c);
 786
 787                 if (!is_root) {
 788                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 789                         uint64_t weight;
 790
 791                         if (has_io)
 792                                 weight = cgroup_context_io_weight(c, state);
 793                         else if (has_blockio) {
 794                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 795
 796                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 797
 798                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 799                                                   blkio_weight, weight);
 800                         } else
 801                                 weight = CGROUP_WEIGHT_DEFAULT;
 802
 803                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 804                         r = cg_set_attribute("io", path, "io.weight", buf);
 805                         if (r < 0)
 806                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 807                                               "Failed to set io.weight: %m");
 808
 809                         if (has_io) {
 810                                 CGroupIODeviceWeight *w;
 811
 812                                 /* FIXME: no way to reset this list */
 813                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 814                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 815                         } else if (has_blockio) {
 816                                 CGroupBlockIODeviceWeight *w;
 817
 818                                 /* FIXME: no way to reset this list */
 819                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 820                                         weight = cgroup_weight_blkio_to_io(w->weight);
 821
 822                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 823                                                           w->weight, weight, w->path);
 824
 825                                         cgroup_apply_io_device_weight(u, w->path, weight);
 826                                 }
 827                         }
 828                 }
 829
 830                 /* Apply limits and free ones without config. */
 831                 if (has_io) {
 832                         CGroupIODeviceLimit *l, *next;
 833
 834                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 835                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 836                                         cgroup_context_free_io_device_limit(c, l);
 837                         }
 838                 } else if (has_blockio) {
 839                         CGroupBlockIODeviceBandwidth *b, *next;
 840
 841                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 842                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 843                                 CGroupIOLimitType type;
 844
 845                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 846                                         limits[type] = cgroup_io_limit_defaults[type];
 847
 848                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 849                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 850
 851                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 852                                                   b->rbps, b->wbps, b->path);
 853
 854                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 855                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 856                         }
 857                 }
 858         }
 859
 860         if (apply_mask & CGROUP_MASK_BLKIO) {
 861                 bool has_io = cgroup_context_has_io_config(c);
 862                 bool has_blockio = cgroup_context_has_blockio_config(c);
 863
 864                 if (!is_root) {
 865                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 866                         uint64_t weight;
 867
 868                         if (has_io) {
 869                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 870
 871                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 872
 873                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 874                                                   io_weight, weight);
 875                         } else if (has_blockio)
 876                                 weight = cgroup_context_blkio_weight(c, state);
 877                         else
 878                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 879
 880                         xsprintf(buf, "%" PRIu64 "\n", weight);
 881                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 882                         if (r < 0)
 883                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 884                                               "Failed to set blkio.weight: %m");
 885
 886                         if (has_io) {
 887                                 CGroupIODeviceWeight *w;
 888
 889                                 /* FIXME: no way to reset this list */
 890                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 891                                         weight = cgroup_weight_io_to_blkio(w->weight);
 892
 893                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 894                                                           w->weight, weight, w->path);
 895
 896                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 897                                 }
 898                         } else if (has_blockio) {
 899                                 CGroupBlockIODeviceWeight *w;
 900
 901                                 /* FIXME: no way to reset this list */
 902                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 903                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 904                         }
 905                 }
 906
 907                 /* Apply limits and free ones without config. */
 908                 if (has_io) {
 909                         CGroupIODeviceLimit *l, *next;
 910
 911                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 912                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 913                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 914
 915                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 916                                         cgroup_context_free_io_device_limit(c, l);
 917                         }
 918                 } else if (has_blockio) {
 919                         CGroupBlockIODeviceBandwidth *b, *next;
 920
 921                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 922                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 923                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 924                 }
 925         }
 926
 927         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 928                 if (cg_all_unified() > 0) {
 929                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 930
 931                         if (cgroup_context_has_unified_memory_config(c)) {
 932                                 max = c->memory_max;
 933                                 swap_max = c->memory_swap_max;
 934                         } else {
 935                                 max = c->memory_limit;
 936
 937                                 if (max != CGROUP_LIMIT_MAX)
 938                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 939                         }
 940
 941                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 942                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 943                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 944                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 945                 } else {
 946                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 947                         uint64_t val;
 948
 949                         if (cgroup_context_has_unified_memory_config(c)) {
 950                                 val = c->memory_max;
 951                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 952                         } else
 953                                 val = c->memory_limit;
 954
 955                         if (val == CGROUP_LIMIT_MAX)
 956                                 strncpy(buf, "-1\n", sizeof(buf));
 957                         else
 958                                 xsprintf(buf, "%" PRIu64 "\n", val);
 959
 960                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 961                         if (r < 0)
 962                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 963                                               "Failed to set memory.limit_in_bytes: %m");
 964                 }
 965         }
 966
 967         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 968                 CGroupDeviceAllow *a;
 969
 970                 /* Changing the devices list of a populated cgroup
 971                  * might result in EINVAL, hence ignore EINVAL
 972                  * here. */
 973
 974                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 975                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 976                 else
 977                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 978                 if (r < 0)
 979                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 980                                       "Failed to reset devices.list: %m");
 981
 982                 if (c->device_policy == CGROUP_CLOSED ||
 983                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 984                         static const char auto_devices[] =
 985                                 "/dev/null\0" "rwm\0"
 986                                 "/dev/zero\0" "rwm\0"
 987                                 "/dev/full\0" "rwm\0"
 988                                 "/dev/random\0" "rwm\0"
 989                                 "/dev/urandom\0" "rwm\0"
 990                                 "/dev/tty\0" "rwm\0"
 991                                 "/dev/ptmx\0" "rwm\0"
 992                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 993                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
 994                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
 995
 996                         const char *x, *y;
 997
 998                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 999                                 whitelist_device(path, x, y);
1000
1001                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1002                         whitelist_major(path, "pts", 'c', "rw");
1003                 }
1004
1005                 LIST_FOREACH(device_allow, a, c->device_allow) {
1006                         char acc[4], *val;
1007                         unsigned k = 0;
1008
1009                         if (a->r)
1010                                 acc[k++] = 'r';
1011                         if (a->w)
1012                                 acc[k++] = 'w';
1013                         if (a->m)
1014                                 acc[k++] = 'm';
1015
1016                         if (k == 0)
1017                                 continue;
1018
1019                         acc[k++] = 0;
1020
1021                         if (path_startswith(a->path, "/dev/"))
1022                                 whitelist_device(path, a->path, acc);
1023                         else if ((val = startswith(a->path, "block-")))
1024                                 whitelist_major(path, val, 'b', acc);
1025                         else if ((val = startswith(a->path, "char-")))
1026                                 whitelist_major(path, val, 'c', acc);
1027                         else
1028                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1029                 }
1030         }
1031
1032         if (apply_mask & CGROUP_MASK_PIDS) {
1033
1034                 if (is_root) {
1035                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1036                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1037                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1038                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1039                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1040                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1041                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1042                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1043                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1044                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1045                          * which is desirable so that there's an offical way to release control of the sysctl from
1046                          * systemd: set the limit to unbounded and reload. */
1047
1048                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1049                                 u->manager->sysctl_pid_max_changed = true;
1050                                 r = procfs_tasks_set_limit(c->tasks_max);
1051                         } else if (u->manager->sysctl_pid_max_changed)
1052                                 r = procfs_tasks_set_limit(TASKS_MAX);
1053                         else
1054                                 r = 0;
1055
1056                         if (r < 0)
1057                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1058                                               "Failed to write to tasks limit sysctls: %m");
1059
1060                 } else {
1061                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1062                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1063
1064                                 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1065                                 r = cg_set_attribute("pids", path, "pids.max", buf);
1066                         } else
1067                                 r = cg_set_attribute("pids", path, "pids.max", "max");
1068                         if (r < 0)
1069                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1070                                               "Failed to set pids.max: %m");
1071                 }
1072         }
1073
1074         if (apply_bpf)
1075                 cgroup_apply_firewall(u);
1076 }
1077
1078 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1079         CGroupMask mask = 0;
1080
1081         /* Figure out which controllers we need */
1082
1083         if (c->cpu_accounting ||
1084             cgroup_context_has_cpu_weight(c) ||
1085             cgroup_context_has_cpu_shares(c) ||
1086             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1087                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1088
1089         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1090                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1091
1092         if (c->memory_accounting ||
1093             c->memory_limit != CGROUP_LIMIT_MAX ||
1094             cgroup_context_has_unified_memory_config(c))
1095                 mask |= CGROUP_MASK_MEMORY;
1096
1097         if (c->device_allow ||
1098             c->device_policy != CGROUP_AUTO)
1099                 mask |= CGROUP_MASK_DEVICES;
1100
1101         if (c->tasks_accounting ||
1102             c->tasks_max != CGROUP_LIMIT_MAX)
1103                 mask |= CGROUP_MASK_PIDS;
1104
1105         return mask;
1106 }
1107
1108 CGroupMask unit_get_own_mask(Unit *u) {
1109         CGroupContext *c;
1110
1111         /* Returns the mask of controllers the unit needs for itself */
1112
1113         c = unit_get_cgroup_context(u);
1114         if (!c)
1115                 return 0;
1116
1117         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1118 }
1119
1120 CGroupMask unit_get_delegate_mask(Unit *u) {
1121         CGroupContext *c;
1122
1123         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1124          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1125          *
1126          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1127
1128         if (u->type == UNIT_SLICE)
1129                 return 0;
1130
1131         c = unit_get_cgroup_context(u);
1132         if (!c)
1133                 return 0;
1134
1135         if (!c->delegate)
1136                 return 0;
1137
1138         if (cg_all_unified() <= 0) {
1139                 ExecContext *e;
1140
1141                 e = unit_get_exec_context(u);
1142                 if (e && !exec_context_maintains_privileges(e))
1143                         return 0;
1144         }
1145
1146         return c->delegate_controllers;
1147 }
1148
1149 CGroupMask unit_get_members_mask(Unit *u) {
1150         assert(u);
1151
1152         /* Returns the mask of controllers all of the unit's children require, merged */
1153
1154         if (u->cgroup_members_mask_valid)
1155                 return u->cgroup_members_mask;
1156
1157         u->cgroup_members_mask = 0;
1158
1159         if (u->type == UNIT_SLICE) {
1160                 void *v;
1161                 Unit *member;
1162                 Iterator i;
1163
1164                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1165
1166                         if (member == u)
1167                                 continue;
1168
1169                         if (UNIT_DEREF(member->slice) != u)
1170                                 continue;
1171
1172                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1173                 }
1174         }
1175
1176         u->cgroup_members_mask_valid = true;
1177         return u->cgroup_members_mask;
1178 }
1179
1180 CGroupMask unit_get_siblings_mask(Unit *u) {
1181         assert(u);
1182
1183         /* Returns the mask of controllers all of the unit's siblings
1184          * require, i.e. the members mask of the unit's parent slice
1185          * if there is one. */
1186
1187         if (UNIT_ISSET(u->slice))
1188                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1189
1190         return unit_get_subtree_mask(u); /* we are the top-level slice */
1191 }
1192
1193 CGroupMask unit_get_subtree_mask(Unit *u) {
1194
1195         /* Returns the mask of this subtree, meaning of the group
1196          * itself and its children. */
1197
1198         return unit_get_own_mask(u) | unit_get_members_mask(u);
1199 }
1200
1201 CGroupMask unit_get_target_mask(Unit *u) {
1202         CGroupMask mask;
1203
1204         /* This returns the cgroup mask of all controllers to enable
1205          * for a specific cgroup, i.e. everything it needs itself,
1206          * plus all that its children need, plus all that its siblings
1207          * need. This is primarily useful on the legacy cgroup
1208          * hierarchy, where we need to duplicate each cgroup in each
1209          * hierarchy that shall be enabled for it. */
1210
1211         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1212         mask &= u->manager->cgroup_supported;
1213
1214         return mask;
1215 }
1216
1217 CGroupMask unit_get_enable_mask(Unit *u) {
1218         CGroupMask mask;
1219
1220         /* This returns the cgroup mask of all controllers to enable
1221          * for the children of a specific cgroup. This is primarily
1222          * useful for the unified cgroup hierarchy, where each cgroup
1223          * controls which controllers are enabled for its children. */
1224
1225         mask = unit_get_members_mask(u);
1226         mask &= u->manager->cgroup_supported;
1227
1228         return mask;
1229 }
1230
1231 bool unit_get_needs_bpf(Unit *u) {
1232         CGroupContext *c;
1233         Unit *p;
1234         assert(u);
1235
1236         /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1237          * moment. */
1238         if (u->type == UNIT_SLICE)
1239                 return false;
1240
1241         c = unit_get_cgroup_context(u);
1242         if (!c)
1243                 return false;
1244
1245         if (c->ip_accounting ||
1246             c->ip_address_allow ||
1247             c->ip_address_deny)
1248                 return true;
1249
1250         /* If any parent slice has an IP access list defined, it applies too */
1251         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1252                 c = unit_get_cgroup_context(p);
1253                 if (!c)
1254                         return false;
1255
1256                 if (c->ip_address_allow ||
1257                     c->ip_address_deny)
1258                         return true;
1259         }
1260
1261         return false;
1262 }
1263
1264 /* Recurse from a unit up through its containing slices, propagating
1265  * mask bits upward. A unit is also member of itself. */
1266 void unit_update_cgroup_members_masks(Unit *u) {
1267         CGroupMask m;
1268         bool more;
1269
1270         assert(u);
1271
1272         /* Calculate subtree mask */
1273         m = unit_get_subtree_mask(u);
1274
1275         /* See if anything changed from the previous invocation. If
1276          * not, we're done. */
1277         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1278                 return;
1279
1280         more =
1281                 u->cgroup_subtree_mask_valid &&
1282                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1283                 ((~m & u->cgroup_subtree_mask) == 0);
1284
1285         u->cgroup_subtree_mask = m;
1286         u->cgroup_subtree_mask_valid = true;
1287
1288         if (UNIT_ISSET(u->slice)) {
1289                 Unit *s = UNIT_DEREF(u->slice);
1290
1291                 if (more)
1292                         /* There's more set now than before. We
1293                          * propagate the new mask to the parent's mask
1294                          * (not caring if it actually was valid or
1295                          * not). */
1296
1297                         s->cgroup_members_mask |= m;
1298
1299                 else
1300                         /* There's less set now than before (or we
1301                          * don't know), we need to recalculate
1302                          * everything, so let's invalidate the
1303                          * parent's members mask */
1304
1305                         s->cgroup_members_mask_valid = false;
1306
1307                 /* And now make sure that this change also hits our
1308                  * grandparents */
1309                 unit_update_cgroup_members_masks(s);
1310         }
1311 }
1312
1313 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1314         Unit *u = userdata;
1315
1316         assert(mask != 0);
1317         assert(u);
1318
1319         while (u) {
1320                 if (u->cgroup_path &&
1321                     u->cgroup_realized &&
1322                     (u->cgroup_realized_mask & mask) == mask)
1323                         return u->cgroup_path;
1324
1325                 u = UNIT_DEREF(u->slice);
1326         }
1327
1328         return NULL;
1329 }
1330
1331 char *unit_default_cgroup_path(Unit *u) {
1332         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1333         int r;
1334
1335         assert(u);
1336
1337         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1338                 return strdup(u->manager->cgroup_root);
1339
1340         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1341                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1342                 if (r < 0)
1343                         return NULL;
1344         }
1345
1346         escaped = cg_escape(u->id);
1347         if (!escaped)
1348                 return NULL;
1349
1350         if (slice)
1351                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1352                                escaped);
1353         else
1354                 return strjoin(u->manager->cgroup_root, "/", escaped);
1355 }
1356
1357 int unit_set_cgroup_path(Unit *u, const char *path) {
1358         _cleanup_free_ char *p = NULL;
1359         int r;
1360
1361         assert(u);
1362
1363         if (path) {
1364                 p = strdup(path);
1365                 if (!p)
1366                         return -ENOMEM;
1367         } else
1368                 p = NULL;
1369
1370         if (streq_ptr(u->cgroup_path, p))
1371                 return 0;
1372
1373         if (p) {
1374                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1375                 if (r < 0)
1376                         return r;
1377         }
1378
1379         unit_release_cgroup(u);
1380
1381         u->cgroup_path = p;
1382         p = NULL;
1383
1384         return 1;
1385 }
1386
1387 int unit_watch_cgroup(Unit *u) {
1388         _cleanup_free_ char *events = NULL;
1389         int r;
1390
1391         assert(u);
1392
1393         if (!u->cgroup_path)
1394                 return 0;
1395
1396         if (u->cgroup_inotify_wd >= 0)
1397                 return 0;
1398
1399         /* Only applies to the unified hierarchy */
1400         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1401         if (r < 0)
1402                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1403         if (r == 0)
1404                 return 0;
1405
1406         /* Don't watch the root slice, it's pointless. */
1407         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1408                 return 0;
1409
1410         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1411         if (r < 0)
1412                 return log_oom();
1413
1414         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1415         if (r < 0)
1416                 return log_oom();
1417
1418         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1419         if (u->cgroup_inotify_wd < 0) {
1420
1421                 if (errno == ENOENT) /* If the directory is already
1422                                       * gone we don't need to track
1423                                       * it, so this is not an error */
1424                         return 0;
1425
1426                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1427         }
1428
1429         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1430         if (r < 0)
1431                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1432
1433         return 0;
1434 }
1435
1436 int unit_pick_cgroup_path(Unit *u) {
1437         _cleanup_free_ char *path = NULL;
1438         int r;
1439
1440         assert(u);
1441
1442         if (u->cgroup_path)
1443                 return 0;
1444
1445         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1446                 return -EINVAL;
1447
1448         path = unit_default_cgroup_path(u);
1449         if (!path)
1450                 return log_oom();
1451
1452         r = unit_set_cgroup_path(u, path);
1453         if (r == -EEXIST)
1454                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1455         if (r < 0)
1456                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1457
1458         return 0;
1459 }
1460
1461 static int unit_create_cgroup(
1462                 Unit *u,
1463                 CGroupMask target_mask,
1464                 CGroupMask enable_mask,
1465                 bool needs_bpf) {
1466
1467         CGroupContext *c;
1468         int r;
1469
1470         assert(u);
1471
1472         c = unit_get_cgroup_context(u);
1473         if (!c)
1474                 return 0;
1475
1476         /* Figure out our cgroup path */
1477         r = unit_pick_cgroup_path(u);
1478         if (r < 0)
1479                 return r;
1480
1481         /* First, create our own group */
1482         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1483         if (r < 0)
1484                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1485
1486         /* Start watching it */
1487         (void) unit_watch_cgroup(u);
1488
1489         /* Enable all controllers we need */
1490         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1491         if (r < 0)
1492                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1493
1494         /* Keep track that this is now realized */
1495         u->cgroup_realized = true;
1496         u->cgroup_realized_mask = target_mask;
1497         u->cgroup_enabled_mask = enable_mask;
1498         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1499
1500         if (u->type != UNIT_SLICE && !c->delegate) {
1501
1502                 /* Then, possibly move things over, but not if
1503                  * subgroups may contain processes, which is the case
1504                  * for slice and delegation units. */
1505                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1506                 if (r < 0)
1507                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1508         }
1509
1510         return 0;
1511 }
1512
1513 int unit_attach_pids_to_cgroup(Unit *u) {
1514         int r;
1515         assert(u);
1516
1517         r = unit_realize_cgroup(u);
1518         if (r < 0)
1519                 return r;
1520
1521         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1522         if (r < 0)
1523                 return r;
1524
1525         return 0;
1526 }
1527
1528 static void cgroup_xattr_apply(Unit *u) {
1529         char ids[SD_ID128_STRING_MAX];
1530         int r;
1531
1532         assert(u);
1533
1534         if (!MANAGER_IS_SYSTEM(u->manager))
1535                 return;
1536
1537         if (sd_id128_is_null(u->invocation_id))
1538                 return;
1539
1540         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1541                          "trusted.invocation_id",
1542                          sd_id128_to_string(u->invocation_id, ids), 32,
1543                          0);
1544         if (r < 0)
1545                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1546 }
1547
1548 static bool unit_has_mask_realized(
1549                 Unit *u,
1550                 CGroupMask target_mask,
1551                 CGroupMask enable_mask,
1552                 bool needs_bpf) {
1553
1554         assert(u);
1555
1556         return u->cgroup_realized &&
1557                 u->cgroup_realized_mask == target_mask &&
1558                 u->cgroup_enabled_mask == enable_mask &&
1559                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1560                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1561 }
1562
1563 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1564         assert(u);
1565
1566         if (u->in_cgroup_realize_queue)
1567                 return;
1568
1569         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1570         u->in_cgroup_realize_queue = true;
1571 }
1572
1573 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1574         assert(u);
1575
1576         if (!u->in_cgroup_realize_queue)
1577                 return;
1578
1579         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1580         u->in_cgroup_realize_queue = false;
1581 }
1582
1583
1584 /* Check if necessary controllers and attributes for a unit are in place.
1585  *
1586  * If so, do nothing.
1587  * If not, create paths, move processes over, and set attributes.
1588  *
1589  * Returns 0 on success and < 0 on failure. */
1590 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1591         CGroupMask target_mask, enable_mask;
1592         bool needs_bpf, apply_bpf;
1593         int r;
1594
1595         assert(u);
1596
1597         unit_remove_from_cgroup_realize_queue(u);
1598
1599         target_mask = unit_get_target_mask(u);
1600         enable_mask = unit_get_enable_mask(u);
1601         needs_bpf = unit_get_needs_bpf(u);
1602
1603         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1604                 return 0;
1605
1606         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1607          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1608          * this will trickle down properly to cgroupfs. */
1609         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1610
1611         /* First, realize parents */
1612         if (UNIT_ISSET(u->slice)) {
1613                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1614                 if (r < 0)
1615                         return r;
1616         }
1617
1618         /* And then do the real work */
1619         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1620         if (r < 0)
1621                 return r;
1622
1623         /* Finally, apply the necessary attributes. */
1624         cgroup_context_apply(u, target_mask, apply_bpf, state);
1625         cgroup_xattr_apply(u);
1626
1627         return 0;
1628 }
1629
1630 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1631         ManagerState state;
1632         unsigned n = 0;
1633         Unit *i;
1634         int r;
1635
1636         assert(m);
1637
1638         state = manager_state(m);
1639
1640         while ((i = m->cgroup_realize_queue)) {
1641                 assert(i->in_cgroup_realize_queue);
1642
1643                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1644                         /* Maybe things changed, and the unit is not actually active anymore? */
1645                         unit_remove_from_cgroup_realize_queue(i);
1646                         continue;
1647                 }
1648
1649                 r = unit_realize_cgroup_now(i, state);
1650                 if (r < 0)
1651                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1652
1653                 n++;
1654         }
1655
1656         return n;
1657 }
1658
1659 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1660         Unit *slice;
1661
1662         /* This adds the siblings of the specified unit and the
1663          * siblings of all parent units to the cgroup queue. (But
1664          * neither the specified unit itself nor the parents.) */
1665
1666         while ((slice = UNIT_DEREF(u->slice))) {
1667                 Iterator i;
1668                 Unit *m;
1669                 void *v;
1670
1671                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1672                         if (m == u)
1673                                 continue;
1674
1675                         /* Skip units that have a dependency on the slice
1676                          * but aren't actually in it. */
1677                         if (UNIT_DEREF(m->slice) != slice)
1678                                 continue;
1679
1680                         /* No point in doing cgroup application for units
1681                          * without active processes. */
1682                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1683                                 continue;
1684
1685                         /* If the unit doesn't need any new controllers
1686                          * and has current ones realized, it doesn't need
1687                          * any changes. */
1688                         if (unit_has_mask_realized(m,
1689                                                    unit_get_target_mask(m),
1690                                                    unit_get_enable_mask(m),
1691                                                    unit_get_needs_bpf(m)))
1692                                 continue;
1693
1694                         unit_add_to_cgroup_realize_queue(m);
1695                 }
1696
1697                 u = slice;
1698         }
1699 }
1700
1701 int unit_realize_cgroup(Unit *u) {
1702         assert(u);
1703
1704         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1705                 return 0;
1706
1707         /* So, here's the deal: when realizing the cgroups for this
1708          * unit, we need to first create all parents, but there's more
1709          * actually: for the weight-based controllers we also need to
1710          * make sure that all our siblings (i.e. units that are in the
1711          * same slice as we are) have cgroups, too. Otherwise, things
1712          * would become very uneven as each of their processes would
1713          * get as much resources as all our group together. This call
1714          * will synchronously create the parent cgroups, but will
1715          * defer work on the siblings to the next event loop
1716          * iteration. */
1717
1718         /* Add all sibling slices to the cgroup queue. */
1719         unit_add_siblings_to_cgroup_realize_queue(u);
1720
1721         /* And realize this one now (and apply the values) */
1722         return unit_realize_cgroup_now(u, manager_state(u->manager));
1723 }
1724
1725 void unit_release_cgroup(Unit *u) {
1726         assert(u);
1727
1728         /* Forgets all cgroup details for this cgroup */
1729
1730         if (u->cgroup_path) {
1731                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1732                 u->cgroup_path = mfree(u->cgroup_path);
1733         }
1734
1735         if (u->cgroup_inotify_wd >= 0) {
1736                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1737                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1738
1739                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1740                 u->cgroup_inotify_wd = -1;
1741         }
1742 }
1743
1744 void unit_prune_cgroup(Unit *u) {
1745         int r;
1746         bool is_root_slice;
1747
1748         assert(u);
1749
1750         /* Removes the cgroup, if empty and possible, and stops watching it. */
1751
1752         if (!u->cgroup_path)
1753                 return;
1754
1755         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1756
1757         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1758
1759         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1760         if (r < 0) {
1761                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1762                 return;
1763         }
1764
1765         if (is_root_slice)
1766                 return;
1767
1768         unit_release_cgroup(u);
1769
1770         u->cgroup_realized = false;
1771         u->cgroup_realized_mask = 0;
1772         u->cgroup_enabled_mask = 0;
1773 }
1774
1775 int unit_search_main_pid(Unit *u, pid_t *ret) {
1776         _cleanup_fclose_ FILE *f = NULL;
1777         pid_t pid = 0, npid, mypid;
1778         int r;
1779
1780         assert(u);
1781         assert(ret);
1782
1783         if (!u->cgroup_path)
1784                 return -ENXIO;
1785
1786         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1787         if (r < 0)
1788                 return r;
1789
1790         mypid = getpid_cached();
1791         while (cg_read_pid(f, &npid) > 0)  {
1792                 pid_t ppid;
1793
1794                 if (npid == pid)
1795                         continue;
1796
1797                 /* Ignore processes that aren't our kids */
1798                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1799                         continue;
1800
1801                 if (pid != 0)
1802                         /* Dang, there's more than one daemonized PID
1803                         in this group, so we don't know what process
1804                         is the main process. */
1805
1806                         return -ENODATA;
1807
1808                 pid = npid;
1809         }
1810
1811         *ret = pid;
1812         return 0;
1813 }
1814
1815 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1816         _cleanup_closedir_ DIR *d = NULL;
1817         _cleanup_fclose_ FILE *f = NULL;
1818         int ret = 0, r;
1819
1820         assert(u);
1821         assert(path);
1822
1823         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1824         if (r < 0)
1825                 ret = r;
1826         else {
1827                 pid_t pid;
1828
1829                 while ((r = cg_read_pid(f, &pid)) > 0) {
1830                         r = unit_watch_pid(u, pid);
1831                         if (r < 0 && ret >= 0)
1832                                 ret = r;
1833                 }
1834
1835                 if (r < 0 && ret >= 0)
1836                         ret = r;
1837         }
1838
1839         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1840         if (r < 0) {
1841                 if (ret >= 0)
1842                         ret = r;
1843         } else {
1844                 char *fn;
1845
1846                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1847                         _cleanup_free_ char *p = NULL;
1848
1849                         p = strjoin(path, "/", fn);
1850                         free(fn);
1851
1852                         if (!p)
1853                                 return -ENOMEM;
1854
1855                         r = unit_watch_pids_in_path(u, p);
1856                         if (r < 0 && ret >= 0)
1857                                 ret = r;
1858                 }
1859
1860                 if (r < 0 && ret >= 0)
1861                         ret = r;
1862         }
1863
1864         return ret;
1865 }
1866
1867 int unit_synthesize_cgroup_empty_event(Unit *u) {
1868         int r;
1869
1870         assert(u);
1871
1872         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1873          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
1874          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
1875
1876         if (!u->cgroup_path)
1877                 return -ENOENT;
1878
1879         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1880         if (r < 0)
1881                 return r;
1882         if (r > 0) /* On unified we have reliable notifications, and don't need this */
1883                 return 0;
1884
1885         if (!set_isempty(u->pids))
1886                 return 0;
1887
1888         unit_add_to_cgroup_empty_queue(u);
1889         return 0;
1890 }
1891
1892 int unit_watch_all_pids(Unit *u) {
1893         int r;
1894
1895         assert(u);
1896
1897         /* Adds all PIDs from our cgroup to the set of PIDs we
1898          * watch. This is a fallback logic for cases where we do not
1899          * get reliable cgroup empty notifications: we try to use
1900          * SIGCHLD as replacement. */
1901
1902         if (!u->cgroup_path)
1903                 return -ENOENT;
1904
1905         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1906         if (r < 0)
1907                 return r;
1908         if (r > 0) /* On unified we can use proper notifications */
1909                 return 0;
1910
1911         return unit_watch_pids_in_path(u, u->cgroup_path);
1912 }
1913
1914 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1915         Manager *m = userdata;
1916         Unit *u;
1917         int r;
1918
1919         assert(s);
1920         assert(m);
1921
1922         u = m->cgroup_empty_queue;
1923         if (!u)
1924                 return 0;
1925
1926         assert(u->in_cgroup_empty_queue);
1927         u->in_cgroup_empty_queue = false;
1928         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1929
1930         if (m->cgroup_empty_queue) {
1931                 /* More stuff queued, let's make sure we remain enabled */
1932                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1933                 if (r < 0)
1934                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1935         }
1936
1937         unit_add_to_gc_queue(u);
1938
1939         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1940                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1941
1942         return 0;
1943 }
1944
1945 void unit_add_to_cgroup_empty_queue(Unit *u) {
1946         int r;
1947
1948         assert(u);
1949
1950         /* Note that there are four different ways how cgroup empty events reach us:
1951          *
1952          * 1. On the unified hierarchy we get an inotify event on the cgroup
1953          *
1954          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1955          *
1956          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1957          *
1958          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1959          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1960          *
1961          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1962          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1963          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1964          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1965          * case for scope units). */
1966
1967         if (u->in_cgroup_empty_queue)
1968                 return;
1969
1970         /* Let's verify that the cgroup is really empty */
1971         if (!u->cgroup_path)
1972                 return;
1973         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1974         if (r < 0) {
1975                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1976                 return;
1977         }
1978         if (r == 0)
1979                 return;
1980
1981         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1982         u->in_cgroup_empty_queue = true;
1983
1984         /* Trigger the defer event */
1985         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1986         if (r < 0)
1987                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1988 }
1989
1990 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1991         Manager *m = userdata;
1992
1993         assert(s);
1994         assert(fd >= 0);
1995         assert(m);
1996
1997         for (;;) {
1998                 union inotify_event_buffer buffer;
1999                 struct inotify_event *e;
2000                 ssize_t l;
2001
2002                 l = read(fd, &buffer, sizeof(buffer));
2003                 if (l < 0) {
2004                         if (IN_SET(errno, EINTR, EAGAIN))
2005                                 return 0;
2006
2007                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
2008                 }
2009
2010                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2011                         Unit *u;
2012
2013                         if (e->wd < 0)
2014                                 /* Queue overflow has no watch descriptor */
2015                                 continue;
2016
2017                         if (e->mask & IN_IGNORED)
2018                                 /* The watch was just removed */
2019                                 continue;
2020
2021                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2022                         if (!u) /* Not that inotify might deliver
2023                                  * events for a watch even after it
2024                                  * was removed, because it was queued
2025                                  * before the removal. Let's ignore
2026                                  * this here safely. */
2027                                 continue;
2028
2029                         unit_add_to_cgroup_empty_queue(u);
2030                 }
2031         }
2032 }
2033 #endif // 0
2034
/* Sets up the manager's view of the control group hierarchy.
 *
 * In the branches that are compiled in (the "#else" sides of the "#if 0" blocks) this:
 *   - stores the cgroup path of PID 1 in m->cgroup_root, with trailing slashes removed,
 *   - looks up the file system path of that cgroup and logs which hierarchy layout is in use,
 *   - creates the scope cgroup and attaches to it via cg_create_and_attach(),
 *   - opens the cgroup file system path and keeps the fd in m->pin_cgroupfs_fd,
 *   - sets memory.use_hierarchy to "1" unless all unified or in a test run,
 *   - stores the supported controllers in m->cgroup_supported and logs them.
 *
 * Returns 0 on success. On failure the result of log_error_errno() for the failing step is
 * returned. */
2035 int manager_setup_cgroup(Manager *m) {
2036         _cleanup_free_ char *path = NULL;
2037         const char *scope_path;
2038         CGroupController c;
2039         int r, all_unified;
2040 #if 0 /// UNNEEDED by elogind
2041         char *e;
2042 #endif // 0
2043
2044         assert(m);
2045
2046         /* 1. Determine hierarchy */
        /* Drop a root remembered from an earlier call before querying it anew */
2047         m->cgroup_root = mfree(m->cgroup_root);
2048 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2049         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2050 #else
2051         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2052 #endif // 0
2053         if (r < 0)
2054                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2055
2056 #if 0 /// elogind does not support systemd scopes and slices
2057         /* Chop off the init scope, if we are already located in it */
2058         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2059
2060         /* LEGACY: Also chop off the system slice if we are in
2061          * it. This is to support live upgrades from older systemd
2062          * versions where PID 1 was moved there. Also see
2063          * cg_get_root_path(). */
2064         if (!e && MANAGER_IS_SYSTEM(m)) {
2065                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2066                 if (!e)
2067                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2068         }
2069         if (e)
2070                 *e = 0;
2071 #endif // 0
2072
2073         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2074                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2075         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2076          * easily prepend it everywhere. */
2077         delete_trailing_chars(m->cgroup_root, "/");
2078
2079         /* 2. Show data */
2080         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2081         if (r < 0)
2082                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2083
2084         r = cg_unified_flush();
2085         if (r < 0)
2086                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2087
2088         all_unified = cg_all_unified();
2089         if (all_unified < 0)
2090                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2091         if (all_unified > 0)
2092                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2093         else {
2094                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2095                 if (r < 0)
2096                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2097                 if (r > 0)
2098                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2099                 else
2100                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2101         }
2102
2103 #if 0 /// elogind is not init, and does not install the agent here.
2104         /* 3. Allocate cgroup empty defer event source */
2105         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2106         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2107         if (r < 0)
2108                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2109
2110         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2111         if (r < 0)
2112                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2113
2114         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2115         if (r < 0)
2116                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2117
2118         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2119
2120         /* 4. Install notifier inotify object, or agent */
2121         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2122
2123                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2124
2125                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2126                 safe_close(m->cgroup_inotify_fd);
2127
2128                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2129                 if (m->cgroup_inotify_fd < 0)
2130                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2131
2132                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2133                 if (r < 0)
2134                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2135
2136                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2137                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2138                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2139                 if (r < 0)
2140                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2141
2142                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2143
2144         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2145
2146                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2147                  * since it does not generate events when control groups with children run empty. */
2148
2149                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2150                 if (r < 0)
2151                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2152                 else if (r > 0)
2153                         log_debug("Installed release agent.");
2154                 else if (r == 0)
2155                         log_debug("Release agent already installed.");
2156         }
2157
2158         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2159         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2160         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2161 #else
2162         /* Note:
2163                 * This method is in core, and normally called by systemd
2164                 * being init. As elogind is never init, we can not install
2165                 * our agent here. We do so when mounting our cgroup file
2166                 * system, so only if elogind is its own tiny controller.
2167                 * Further, elogind is not meant to run in systemd init scope. */
        /* Three cases: an empty scope path for a system manager, the root itself if it already is
         * "/elogind", or an "/elogind" group below the root otherwise. */
2168         if (MANAGER_IS_SYSTEM(m))
2169                 // we are our own cgroup controller
2170                 scope_path = strjoina("");
2171         else if (streq(m->cgroup_root, "/elogind"))
2172                 // root already is our cgroup
2173                 scope_path = strjoina(m->cgroup_root);
2174         else
2175                 // we have to create our own group
2176                 scope_path = strjoina(m->cgroup_root, "/elogind");
2177         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2178 #endif // 0
        /* Both preprocessor branches end with cg_create_and_attach(), so r and scope_path are set
         * either way. */
2179         if (r < 0)
2180                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2181         log_debug_elogind("Created control group \"%s\"", scope_path);
2182
2183 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2184         /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2185         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2186         if (r < 0)
2187                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2188 #endif // 0
2189
2190         /* 6. And pin it, so that it cannot be unmounted */
2191         safe_close(m->pin_cgroupfs_fd);
2192         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2193         if (m->pin_cgroupfs_fd < 0)
2194                 return log_error_errno(errno, "Failed to open pin file: %m");
2195
2196         /* 7. Always enable hierarchical support if it exists... */
        /* Best effort: the result of the attribute write is ignored */
2197         if (!all_unified && m->test_run_flags == 0)
2198                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2199
2200         /* 8. Figure out which controllers are supported, and log about it */
2201         r = cg_mask_supported(&m->cgroup_supported);
2202         if (r < 0)
2203                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2204         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2205                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2206
2207         return 0;
2208 }
2209
2210 void manager_shutdown_cgroup(Manager *m, bool delete) {
2211         assert(m);
2212
2213         /* We can't really delete the group, since we are in it. But
2214          * let's trim it. */
2215         if (delete && m->cgroup_root)
2216                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2217
2218 #if 0 /// elogind is not init
2219         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2220
2221         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2222
2223         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2224         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2225 #endif // 0
2226
2227         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2228
2229         m->cgroup_root = mfree(m->cgroup_root);
2230 }
2231
2232 #if 0 /// UNNEEDED by elogind
2233 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2234         char *p;
2235         Unit *u;
2236
2237         assert(m);
2238         assert(cgroup);
2239
2240         u = hashmap_get(m->cgroup_unit, cgroup);
2241         if (u)
2242                 return u;
2243
2244         p = strdupa(cgroup);
2245         for (;;) {
2246                 char *e;
2247
2248                 e = strrchr(p, '/');
2249                 if (!e || e == p)
2250                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2251
2252                 *e = 0;
2253
2254                 u = hashmap_get(m->cgroup_unit, p);
2255                 if (u)
2256                         return u;
2257         }
2258 }
2259
2260 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2261         _cleanup_free_ char *cgroup = NULL;
2262
2263         assert(m);
2264
2265         if (!pid_is_valid(pid))
2266                 return NULL;
2267
2268         if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2269                 return NULL;
2270
2271         return manager_get_unit_by_cgroup(m, cgroup);
2272 }
2273
2274 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2275         Unit *u, **array;
2276
2277         assert(m);
2278
2279         /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
2280          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2281          * relevant one as children of the process will be assigned to that one, too, before all else. */
2282
2283         if (!pid_is_valid(pid))
2284                 return NULL;
2285
2286         if (pid == getpid_cached())
2287                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2288
2289         u = manager_get_unit_by_pid_cgroup(m, pid);
2290         if (u)
2291                 return u;
2292
2293         u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2294         if (u)
2295                 return u;
2296
2297         array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2298         if (array)
2299                 return array[0];
2300
2301         return NULL;
2302 }
2303 #endif // 0
2304
2305 #if 0 /// elogind must substitute this with its own variant
2306 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2307         Unit *u;
2308
2309         assert(m);
2310         assert(cgroup);
2311
2312         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2313          * or from the --system instance */
2314
2315         log_debug("Got cgroup empty notification for: %s", cgroup);
2316
2317         u = manager_get_unit_by_cgroup(m, cgroup);
2318         if (!u)
2319                 return 0;
2320
2321         unit_add_to_cgroup_empty_queue(u);
2322         return 1;
2323 }
2324 #else
2325 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2326         Session *s;
2327
2328         assert(m);
2329         assert(cgroup);
2330
2331         log_debug("Got cgroup empty notification for: %s", cgroup);
2332
2333         s = hashmap_get(m->sessions, cgroup);
2334
2335         if (s) {
2336                 session_finalize(s);
2337                 session_free(s);
2338         } else
2339                 log_warning("Session not found: %s", cgroup);
2340
2341         return 0;
2342 }
2343 #endif // 0
2344 #if 0 /// UNNEEDED by elogind
2345 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2346         _cleanup_free_ char *v = NULL;
2347         int r;
2348
2349         assert(u);
2350         assert(ret);
2351
2352         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2353                 return -ENODATA;
2354
2355         if (!u->cgroup_path)
2356                 return -ENODATA;
2357
2358         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2359         if (unit_has_root_cgroup(u))
2360                 return procfs_memory_get_current(ret);
2361
2362         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2363                 return -ENODATA;
2364
2365         r = cg_all_unified();
2366         if (r < 0)
2367                 return r;
2368         if (r > 0)
2369                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2370         else
2371                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2372         if (r == -ENOENT)
2373                 return -ENODATA;
2374         if (r < 0)
2375                 return r;
2376
2377         return safe_atou64(v, ret);
2378 }
2379
2380 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2381         _cleanup_free_ char *v = NULL;
2382         int r;
2383
2384         assert(u);
2385         assert(ret);
2386
2387         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2388                 return -ENODATA;
2389
2390         if (!u->cgroup_path)
2391                 return -ENODATA;
2392
2393         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2394         if (unit_has_root_cgroup(u))
2395                 return procfs_tasks_get_current(ret);
2396
2397         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2398                 return -ENODATA;
2399
2400         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2401         if (r == -ENOENT)
2402                 return -ENODATA;
2403         if (r < 0)
2404                 return r;
2405
2406         return safe_atou64(v, ret);
2407 }
2408
2409 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2410         _cleanup_free_ char *v = NULL;
2411         uint64_t ns;
2412         int r;
2413
2414         assert(u);
2415         assert(ret);
2416
2417         if (!u->cgroup_path)
2418                 return -ENODATA;
2419
2420         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2421         if (unit_has_root_cgroup(u))
2422                 return procfs_cpu_get_usage(ret);
2423
2424         r = cg_all_unified();
2425         if (r < 0)
2426                 return r;
2427         if (r > 0) {
2428                 _cleanup_free_ char *val = NULL;
2429                 uint64_t us;
2430
2431                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2432                         return -ENODATA;
2433
2434                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2435                 if (r < 0)
2436                         return r;
2437                 if (IN_SET(r, -ENOENT, -ENXIO))
2438                         return -ENODATA;
2439
2440                 r = safe_atou64(val, &us);
2441                 if (r < 0)
2442                         return r;
2443
2444                 ns = us * NSEC_PER_USEC;
2445         } else {
2446                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2447                         return -ENODATA;
2448
2449                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2450                 if (r == -ENOENT)
2451                         return -ENODATA;
2452                 if (r < 0)
2453                         return r;
2454
2455                 r = safe_atou64(v, &ns);
2456                 if (r < 0)
2457                         return r;
2458         }
2459
2460         *ret = ns;
2461         return 0;
2462 }
2463
2464 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2465         nsec_t ns;
2466         int r;
2467
2468         assert(u);
2469
2470         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2471          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2472          * call this function with a NULL return value. */
2473
2474         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2475                 return -ENODATA;
2476
2477         r = unit_get_cpu_usage_raw(u, &ns);
2478         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2479                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2480                  * cached value. */
2481
2482                 if (ret)
2483                         *ret = u->cpu_usage_last;
2484                 return 0;
2485         }
2486         if (r < 0)
2487                 return r;
2488
2489         if (ns > u->cpu_usage_base)
2490                 ns -= u->cpu_usage_base;
2491         else
2492                 ns = 0;
2493
2494         u->cpu_usage_last = ns;
2495         if (ret)
2496                 *ret = ns;
2497
2498         return 0;
2499 }
2500
2501 int unit_get_ip_accounting(
2502                 Unit *u,
2503                 CGroupIPAccountingMetric metric,
2504                 uint64_t *ret) {
2505
2506         uint64_t value;
2507         int fd, r;
2508
2509         assert(u);
2510         assert(metric >= 0);
2511         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2512         assert(ret);
2513
2514         /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2515          * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2516          * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2517          * filters. */
2518         if (u->type == UNIT_SLICE)
2519                 return -ENODATA;
2520
2521         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2522                 return -ENODATA;
2523
2524         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2525                 u->ip_accounting_ingress_map_fd :
2526                 u->ip_accounting_egress_map_fd;
2527         if (fd < 0)
2528                 return -ENODATA;
2529
2530         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2531                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2532         else
2533                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2534         if (r < 0)
2535                 return r;
2536
2537         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2538          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2539          * ip_accounting_extra[] field, and add them in here transparently. */
2540
2541         *ret = value + u->ip_accounting_extra[metric];
2542
2543         return r;
2544 }
2545
2546 int unit_reset_cpu_accounting(Unit *u) {
2547         nsec_t ns;
2548         int r;
2549
2550         assert(u);
2551
2552         u->cpu_usage_last = NSEC_INFINITY;
2553
2554         r = unit_get_cpu_usage_raw(u, &ns);
2555         if (r < 0) {
2556                 u->cpu_usage_base = 0;
2557                 return r;
2558         }
2559
2560         u->cpu_usage_base = ns;
2561         return 0;
2562 }
2563
2564 int unit_reset_ip_accounting(Unit *u) {
2565         int r = 0, q = 0;
2566
2567         assert(u);
2568
2569         if (u->ip_accounting_ingress_map_fd >= 0)
2570                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2571
2572         if (u->ip_accounting_egress_map_fd >= 0)
2573                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2574
2575         zero(u->ip_accounting_extra);
2576
2577         return r < 0 ? r : q;
2578 }
2579
2580 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2581         assert(u);
2582
2583         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2584                 return;
2585
2586         if (m == 0)
2587                 return;
2588
2589         /* always invalidate compat pairs together */
2590         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2591                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2592
2593         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2594                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2595
2596         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2597                 return;
2598
2599         u->cgroup_realized_mask &= ~m;
2600         unit_add_to_cgroup_realize_queue(u);
2601 }
2602
2603 void unit_invalidate_cgroup_bpf(Unit *u) {
2604         assert(u);
2605
2606         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2607                 return;
2608
2609         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2610                 return;
2611
2612         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2613         unit_add_to_cgroup_realize_queue(u);
2614
2615         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2616          * list of our children includes our own. */
2617         if (u->type == UNIT_SLICE) {
2618                 Unit *member;
2619                 Iterator i;
2620                 void *v;
2621
2622                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2623                         if (member == u)
2624                                 continue;
2625
2626                         if (UNIT_DEREF(member->slice) != u)
2627                                 continue;
2628
2629                         unit_invalidate_cgroup_bpf(member);
2630                 }
2631         }
2632 }
2633
2634 void manager_invalidate_startup_units(Manager *m) {
2635         Iterator i;
2636         Unit *u;
2637
2638         assert(m);
2639
2640         SET_FOREACH(u, m->startup_units, i)
2641                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2642 }
2643
2644 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2645         [CGROUP_AUTO] = "auto",
2646         [CGROUP_CLOSED] = "closed",
2647         [CGROUP_STRICT] = "strict",
2648 };
2649
2650 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2651 #endif // 0