src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "process-util.h"
  35 //#include "procfs-util.h"
  36 //#include "special.h"
  37 #include "stdio-util.h"
  38 #include "string-table.h"
  39 #include "string-util.h"
  40
/* Length of the CPU quota period in microseconds (100 * USEC_PER_MSEC). The CPU helpers in this file write it
 * as the period part of "cpu.max" and as "cpu.cfs_period_us", and scale the per-second quota down to it. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  42
  43 bool unit_has_root_cgroup(Unit *u) {
  44         assert(u);
  45
  46         /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
  47          * as inside of containers the root slice won't be identical to the root cgroup. */
  48
  49         if (!u->cgroup_path)
  50                 return false;
  51
  52         return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
  53 }
  54
  55 #if 0 /// UNNEEDED by elogind
  56 static void cgroup_compat_warn(void) {
  57         static bool cgroup_compat_warned = false;
  58
  59         if (cgroup_compat_warned)
  60                 return;
  61
  62         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  63         cgroup_compat_warned = true;
  64 }
  65
/* Logs a debug message for the given unit with a "cgroup-compat: " prefix glued in front of fmt, after first
 * calling cgroup_compat_warn(). fmt has to be a string literal, since it is concatenated with the prefix at
 * compile time. The do { } while (false) wrapper makes the expansion behave like a single statement. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
  70
  71 void cgroup_context_init(CGroupContext *c) {
  72         assert(c);
  73
  74         /* Initialize everything to the kernel defaults, assuming the
  75          * structure is preinitialized to 0 */
  76
  77         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  78         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  79         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  80
  81         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  82         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  83
  84         c->memory_high = CGROUP_LIMIT_MAX;
  85         c->memory_max = CGROUP_LIMIT_MAX;
  86         c->memory_swap_max = CGROUP_LIMIT_MAX;
  87
  88         c->memory_limit = CGROUP_LIMIT_MAX;
  89
  90         c->io_weight = CGROUP_WEIGHT_INVALID;
  91         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  92
  93         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  94         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  95
  96         c->tasks_max = (uint64_t) -1;
  97 }
  98
  99 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 100         assert(c);
 101         assert(a);
 102
 103         LIST_REMOVE(device_allow, c->device_allow, a);
 104         free(a->path);
 105         free(a);
 106 }
 107
 108 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 109         assert(c);
 110         assert(w);
 111
 112         LIST_REMOVE(device_weights, c->io_device_weights, w);
 113         free(w->path);
 114         free(w);
 115 }
 116
 117 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 118         assert(c);
 119         assert(l);
 120
 121         LIST_REMOVE(device_limits, c->io_device_limits, l);
 122         free(l->path);
 123         free(l);
 124 }
 125
 126 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 127         assert(c);
 128         assert(w);
 129
 130         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 131         free(w->path);
 132         free(w);
 133 }
 134
 135 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 136         assert(c);
 137         assert(b);
 138
 139         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 140         free(b->path);
 141         free(b);
 142 }
 143
 144 void cgroup_context_done(CGroupContext *c) {
 145         assert(c);
 146
 147         while (c->io_device_weights)
 148                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 149
 150         while (c->io_device_limits)
 151                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 152
 153         while (c->blockio_device_weights)
 154                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 155
 156         while (c->blockio_device_bandwidths)
 157                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 158
 159         while (c->device_allow)
 160                 cgroup_context_free_device_allow(c, c->device_allow);
 161
 162         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 163         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 164 }
 165
 166 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 167         CGroupIODeviceLimit *il;
 168         CGroupIODeviceWeight *iw;
 169         CGroupBlockIODeviceBandwidth *b;
 170         CGroupBlockIODeviceWeight *w;
 171         CGroupDeviceAllow *a;
 172         IPAddressAccessItem *iaai;
 173         char u[FORMAT_TIMESPAN_MAX];
 174
 175         assert(c);
 176         assert(f);
 177
 178         prefix = strempty(prefix);
 179
 180         fprintf(f,
 181                 "%sCPUAccounting=%s\n"
 182                 "%sIOAccounting=%s\n"
 183                 "%sBlockIOAccounting=%s\n"
 184                 "%sMemoryAccounting=%s\n"
 185                 "%sTasksAccounting=%s\n"
 186                 "%sIPAccounting=%s\n"
 187                 "%sCPUWeight=%" PRIu64 "\n"
 188                 "%sStartupCPUWeight=%" PRIu64 "\n"
 189                 "%sCPUShares=%" PRIu64 "\n"
 190                 "%sStartupCPUShares=%" PRIu64 "\n"
 191                 "%sCPUQuotaPerSecSec=%s\n"
 192                 "%sIOWeight=%" PRIu64 "\n"
 193                 "%sStartupIOWeight=%" PRIu64 "\n"
 194                 "%sBlockIOWeight=%" PRIu64 "\n"
 195                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 196                 "%sMemoryLow=%" PRIu64 "\n"
 197                 "%sMemoryHigh=%" PRIu64 "\n"
 198                 "%sMemoryMax=%" PRIu64 "\n"
 199                 "%sMemorySwapMax=%" PRIu64 "\n"
 200                 "%sMemoryLimit=%" PRIu64 "\n"
 201                 "%sTasksMax=%" PRIu64 "\n"
 202                 "%sDevicePolicy=%s\n"
 203                 "%sDelegate=%s\n",
 204                 prefix, yes_no(c->cpu_accounting),
 205                 prefix, yes_no(c->io_accounting),
 206                 prefix, yes_no(c->blockio_accounting),
 207                 prefix, yes_no(c->memory_accounting),
 208                 prefix, yes_no(c->tasks_accounting),
 209                 prefix, yes_no(c->ip_accounting),
 210                 prefix, c->cpu_weight,
 211                 prefix, c->startup_cpu_weight,
 212                 prefix, c->cpu_shares,
 213                 prefix, c->startup_cpu_shares,
 214                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 215                 prefix, c->io_weight,
 216                 prefix, c->startup_io_weight,
 217                 prefix, c->blockio_weight,
 218                 prefix, c->startup_blockio_weight,
 219                 prefix, c->memory_low,
 220                 prefix, c->memory_high,
 221                 prefix, c->memory_max,
 222                 prefix, c->memory_swap_max,
 223                 prefix, c->memory_limit,
 224                 prefix, c->tasks_max,
 225                 prefix, cgroup_device_policy_to_string(c->device_policy),
 226                 prefix, yes_no(c->delegate));
 227
 228         if (c->delegate) {
 229                 _cleanup_free_ char *t = NULL;
 230
 231                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 232
 233                 fprintf(f, "%sDelegateControllers=%s\n",
 234                         prefix,
 235                         strempty(t));
 236         }
 237
 238         LIST_FOREACH(device_allow, a, c->device_allow)
 239                 fprintf(f,
 240                         "%sDeviceAllow=%s %s%s%s\n",
 241                         prefix,
 242                         a->path,
 243                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 244
 245         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 246                 fprintf(f,
 247                         "%sIODeviceWeight=%s %" PRIu64,
 248                         prefix,
 249                         iw->path,
 250                         iw->weight);
 251
 252         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 253                 char buf[FORMAT_BYTES_MAX];
 254                 CGroupIOLimitType type;
 255
 256                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 257                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 258                                 fprintf(f,
 259                                         "%s%s=%s %s\n",
 260                                         prefix,
 261                                         cgroup_io_limit_type_to_string(type),
 262                                         il->path,
 263                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 264         }
 265
 266         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 267                 fprintf(f,
 268                         "%sBlockIODeviceWeight=%s %" PRIu64,
 269                         prefix,
 270                         w->path,
 271                         w->weight);
 272
 273         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 274                 char buf[FORMAT_BYTES_MAX];
 275
 276                 if (b->rbps != CGROUP_LIMIT_MAX)
 277                         fprintf(f,
 278                                 "%sBlockIOReadBandwidth=%s %s\n",
 279                                 prefix,
 280                                 b->path,
 281                                 format_bytes(buf, sizeof(buf), b->rbps));
 282                 if (b->wbps != CGROUP_LIMIT_MAX)
 283                         fprintf(f,
 284                                 "%sBlockIOWriteBandwidth=%s %s\n",
 285                                 prefix,
 286                                 b->path,
 287                                 format_bytes(buf, sizeof(buf), b->wbps));
 288         }
 289
 290         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 291                 _cleanup_free_ char *k = NULL;
 292
 293                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 294                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 295         }
 296
 297         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 298                 _cleanup_free_ char *k = NULL;
 299
 300                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 301                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 302         }
 303 }
 304
 305 static int lookup_block_device(const char *p, dev_t *dev) {
 306         struct stat st;
 307         int r;
 308
 309         assert(p);
 310         assert(dev);
 311
 312         r = stat(p, &st);
 313         if (r < 0)
 314                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 315
 316         if (S_ISBLK(st.st_mode))
 317                 *dev = st.st_rdev;
 318         else if (major(st.st_dev) != 0) {
 319                 /* If this is not a device node then find the block
 320                  * device this file is stored on */
 321                 *dev = st.st_dev;
 322
 323                 /* If this is a partition, try to get the originating
 324                  * block device */
 325                 (void) block_get_whole_disk(*dev, dev);
 326         } else {
 327                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 328                 return -ENODEV;
 329         }
 330
 331         return 0;
 332 }
 333
 334 static int whitelist_device(const char *path, const char *node, const char *acc) {
 335         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 336         struct stat st;
 337         bool ignore_notfound;
 338         int r;
 339
 340         assert(path);
 341         assert(acc);
 342
 343         if (node[0] == '-') {
 344                 /* Non-existent paths starting with "-" must be silently ignored */
 345                 node++;
 346                 ignore_notfound = true;
 347         } else
 348                 ignore_notfound = false;
 349
 350         if (stat(node, &st) < 0) {
 351                 if (errno == ENOENT && ignore_notfound)
 352                         return 0;
 353
 354                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 355         }
 356
 357         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 358                 log_warning("%s is not a device.", node);
 359                 return -ENODEV;
 360         }
 361
 362         sprintf(buf,
 363                 "%c %u:%u %s",
 364                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 365                 major(st.st_rdev), minor(st.st_rdev),
 366                 acc);
 367
 368         r = cg_set_attribute("devices", path, "devices.allow", buf);
 369         if (r < 0)
 370                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 371                                "Failed to set devices.allow on %s: %m", path);
 372
 373         return r;
 374 }
 375
 376 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 377         _cleanup_fclose_ FILE *f = NULL;
 378         char line[LINE_MAX];
 379         bool good = false;
 380         int r;
 381
 382         assert(path);
 383         assert(acc);
 384         assert(IN_SET(type, 'b', 'c'));
 385
 386         f = fopen("/proc/devices", "re");
 387         if (!f)
 388                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 389
 390         FOREACH_LINE(line, f, goto fail) {
 391                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 392                 unsigned maj;
 393
 394                 truncate_nl(line);
 395
 396                 if (type == 'c' && streq(line, "Character devices:")) {
 397                         good = true;
 398                         continue;
 399                 }
 400
 401                 if (type == 'b' && streq(line, "Block devices:")) {
 402                         good = true;
 403                         continue;
 404                 }
 405
 406                 if (isempty(line)) {
 407                         good = false;
 408                         continue;
 409                 }
 410
 411                 if (!good)
 412                         continue;
 413
 414                 p = strstrip(line);
 415
 416                 w = strpbrk(p, WHITESPACE);
 417                 if (!w)
 418                         continue;
 419                 *w = 0;
 420
 421                 r = safe_atou(p, &maj);
 422                 if (r < 0)
 423                         continue;
 424                 if (maj <= 0)
 425                         continue;
 426
 427                 w++;
 428                 w += strspn(w, WHITESPACE);
 429
 430                 if (fnmatch(name, w, 0) != 0)
 431                         continue;
 432
 433                 sprintf(buf,
 434                         "%c %u:* %s",
 435                         type,
 436                         maj,
 437                         acc);
 438
 439                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 440                 if (r < 0)
 441                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 442                                        "Failed to set devices.allow on %s: %m", path);
 443         }
 444
 445         return 0;
 446
 447 fail:
 448         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 449 }
 450
 451 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 452         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 453                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 454 }
 455
 456 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 457         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 458                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 459 }
 460
 461 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 462         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 463             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 464                 return c->startup_cpu_weight;
 465         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 466                 return c->cpu_weight;
 467         else
 468                 return CGROUP_WEIGHT_DEFAULT;
 469 }
 470
 471 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 472         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 473             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 474                 return c->startup_cpu_shares;
 475         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 476                 return c->cpu_shares;
 477         else
 478                 return CGROUP_CPU_SHARES_DEFAULT;
 479 }
 480
 481 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 482         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 483         int r;
 484
 485         xsprintf(buf, "%" PRIu64 "\n", weight);
 486         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 487         if (r < 0)
 488                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 489                               "Failed to set cpu.weight: %m");
 490
 491         if (quota != USEC_INFINITY)
 492                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 493                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 494         else
 495                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 496
 497         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 498
 499         if (r < 0)
 500                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 501                               "Failed to set cpu.max: %m");
 502 }
 503
 504 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 505         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 506         int r;
 507
 508         xsprintf(buf, "%" PRIu64 "\n", shares);
 509         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 510         if (r < 0)
 511                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 512                               "Failed to set cpu.shares: %m");
 513
 514         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 515         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 516         if (r < 0)
 517                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 518                               "Failed to set cpu.cfs_period_us: %m");
 519
 520         if (quota != USEC_INFINITY) {
 521                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 522                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 523         } else
 524                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 525         if (r < 0)
 526                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 527                               "Failed to set cpu.cfs_quota_us: %m");
 528 }
 529
 530 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 531         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 532                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 533 }
 534
 535 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 536         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 537                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 538 }
 539
 540 static bool cgroup_context_has_io_config(CGroupContext *c) {
 541         return c->io_accounting ||
 542                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 543                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 544                 c->io_device_weights ||
 545                 c->io_device_limits;
 546 }
 547
 548 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 549         return c->blockio_accounting ||
 550                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 551                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 552                 c->blockio_device_weights ||
 553                 c->blockio_device_bandwidths;
 554 }
 555
 556 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 557         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 558             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 559                 return c->startup_io_weight;
 560         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 561                 return c->io_weight;
 562         else
 563                 return CGROUP_WEIGHT_DEFAULT;
 564 }
 565
 566 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 567         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 568             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 569                 return c->startup_blockio_weight;
 570         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 571                 return c->blockio_weight;
 572         else
 573                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 574 }
 575
 576 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 577         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 578                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 579 }
 580
 581 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 582         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 583                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 584 }
 585
 586 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 587         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 588         dev_t dev;
 589         int r;
 590
 591         r = lookup_block_device(dev_path, &dev);
 592         if (r < 0)
 593                 return;
 594
 595         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 596         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 597         if (r < 0)
 598                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 599                               "Failed to set io.weight: %m");
 600 }
 601
 602 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 603         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 604         dev_t dev;
 605         int r;
 606
 607         r = lookup_block_device(dev_path, &dev);
 608         if (r < 0)
 609                 return;
 610
 611         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 612         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 613         if (r < 0)
 614                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 615                               "Failed to set blkio.weight_device: %m");
 616 }
 617
 618 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 619         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 620         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 621         CGroupIOLimitType type;
 622         dev_t dev;
 623         unsigned n = 0;
 624         int r;
 625
 626         r = lookup_block_device(dev_path, &dev);
 627         if (r < 0)
 628                 return 0;
 629
 630         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 631                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 632                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 633                         n++;
 634                 } else {
 635                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 636                 }
 637         }
 638
 639         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 640                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 641                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 642         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 643         if (r < 0)
 644                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 645                               "Failed to set io.max: %m");
 646         return n;
 647 }
 648
 649 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 650         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 651         dev_t dev;
 652         unsigned n = 0;
 653         int r;
 654
 655         r = lookup_block_device(dev_path, &dev);
 656         if (r < 0)
 657                 return 0;
 658
 659         if (rbps != CGROUP_LIMIT_MAX)
 660                 n++;
 661         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 662         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 663         if (r < 0)
 664                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 665                               "Failed to set blkio.throttle.read_bps_device: %m");
 666
 667         if (wbps != CGROUP_LIMIT_MAX)
 668                 n++;
 669         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 670         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 671         if (r < 0)
 672                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 673                               "Failed to set blkio.throttle.write_bps_device: %m");
 674
 675         return n;
 676 }
 677
 678 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 679         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 680 }
 681
 682 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 683         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 684         int r;
 685
 686         if (v != CGROUP_LIMIT_MAX)
 687                 xsprintf(buf, "%" PRIu64 "\n", v);
 688
 689         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 690         if (r < 0)
 691                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 692                               "Failed to set %s: %m", file);
 693 }
 694
 695 static void cgroup_apply_firewall(Unit *u) {
 696         int r;
 697
 698         assert(u);
 699
 700         if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
 701                                     * not recursive we don't ever touch the bpf on them */
 702                 return;
 703
 704         r = bpf_firewall_compile(u);
 705         if (r < 0)
 706                 return;
 707
 708         (void) bpf_firewall_install(u);
 709         return;
 710 }
 711
 712 static void cgroup_context_apply(
 713                 Unit *u,
 714                 CGroupMask apply_mask,
 715                 bool apply_bpf,
 716                 ManagerState state) {
 717
 718         const char *path;
 719         CGroupContext *c;
 720         bool is_root;
 721         int r;
 722
 723         assert(u);
 724
 725         /* Nothing to do? Exit early! */
 726         if (apply_mask == 0 && !apply_bpf)
 727                 return;
 728
 729         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
 730         is_root = unit_has_root_cgroup(u);
 731
 732         assert_se(c = unit_get_cgroup_context(u));
 733         assert_se(path = u->cgroup_path);
 734
 735         if (is_root) /* Make sure we don't try to display messages with an empty path. */
 736                 path = "/";
 737
 738         /* We generally ignore errors caused by read-only mounted
 739          * cgroup trees (assuming we are running in a container then),
 740          * and missing cgroups, i.e. EROFS and ENOENT. */
 741
 742         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 743                 bool has_weight, has_shares;
 744
 745                 has_weight = cgroup_context_has_cpu_weight(c);
 746                 has_shares = cgroup_context_has_cpu_shares(c);
 747
 748                 if (cg_all_unified() > 0) {
 749                         uint64_t weight;
 750
 751                         if (has_weight)
 752                                 weight = cgroup_context_cpu_weight(c, state);
 753                         else if (has_shares) {
 754                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 755
 756                                 weight = cgroup_cpu_shares_to_weight(shares);
 757
 758                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 759                                                   shares, weight, path);
 760                         } else
 761                                 weight = CGROUP_WEIGHT_DEFAULT;
 762
 763                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 764                 } else {
 765                         uint64_t shares;
 766
 767                         if (has_weight) {
 768                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 769
 770                                 shares = cgroup_cpu_weight_to_shares(weight);
 771
 772                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 773                                                   weight, shares, path);
 774                         } else if (has_shares)
 775                                 shares = cgroup_context_cpu_shares(c, state);
 776                         else
 777                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 778
 779                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 780                 }
 781         }
 782
 783         if (apply_mask & CGROUP_MASK_IO) {
 784                 bool has_io = cgroup_context_has_io_config(c);
 785                 bool has_blockio = cgroup_context_has_blockio_config(c);
 786
 787                 if (!is_root) {
 788                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 789                         uint64_t weight;
 790
 791                         if (has_io)
 792                                 weight = cgroup_context_io_weight(c, state);
 793                         else if (has_blockio) {
 794                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 795
 796                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 797
 798                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 799                                                   blkio_weight, weight);
 800                         } else
 801                                 weight = CGROUP_WEIGHT_DEFAULT;
 802
 803                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 804                         r = cg_set_attribute("io", path, "io.weight", buf);
 805                         if (r < 0)
 806                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 807                                               "Failed to set io.weight: %m");
 808
 809                         if (has_io) {
 810                                 CGroupIODeviceWeight *w;
 811
 812                                 /* FIXME: no way to reset this list */
 813                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 814                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 815                         } else if (has_blockio) {
 816                                 CGroupBlockIODeviceWeight *w;
 817
 818                                 /* FIXME: no way to reset this list */
 819                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 820                                         weight = cgroup_weight_blkio_to_io(w->weight);
 821
 822                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 823                                                           w->weight, weight, w->path);
 824
 825                                         cgroup_apply_io_device_weight(u, w->path, weight);
 826                                 }
 827                         }
 828                 }
 829
 830                 /* Apply limits and free ones without config. */
 831                 if (has_io) {
 832                         CGroupIODeviceLimit *l, *next;
 833
 834                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 835                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 836                                         cgroup_context_free_io_device_limit(c, l);
 837                         }
 838                 } else if (has_blockio) {
 839                         CGroupBlockIODeviceBandwidth *b, *next;
 840
 841                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 842                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 843                                 CGroupIOLimitType type;
 844
 845                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 846                                         limits[type] = cgroup_io_limit_defaults[type];
 847
 848                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 849                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 850
 851                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 852                                                   b->rbps, b->wbps, b->path);
 853
 854                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 855                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 856                         }
 857                 }
 858         }
 859
 860         if (apply_mask & CGROUP_MASK_BLKIO) {
 861                 bool has_io = cgroup_context_has_io_config(c);
 862                 bool has_blockio = cgroup_context_has_blockio_config(c);
 863
 864                 if (!is_root) {
 865                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 866                         uint64_t weight;
 867
 868                         if (has_io) {
 869                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 870
 871                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 872
 873                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 874                                                   io_weight, weight);
 875                         } else if (has_blockio)
 876                                 weight = cgroup_context_blkio_weight(c, state);
 877                         else
 878                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 879
 880                         xsprintf(buf, "%" PRIu64 "\n", weight);
 881                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 882                         if (r < 0)
 883                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 884                                               "Failed to set blkio.weight: %m");
 885
 886                         if (has_io) {
 887                                 CGroupIODeviceWeight *w;
 888
 889                                 /* FIXME: no way to reset this list */
 890                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 891                                         weight = cgroup_weight_io_to_blkio(w->weight);
 892
 893                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 894                                                           w->weight, weight, w->path);
 895
 896                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 897                                 }
 898                         } else if (has_blockio) {
 899                                 CGroupBlockIODeviceWeight *w;
 900
 901                                 /* FIXME: no way to reset this list */
 902                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 903                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 904                         }
 905                 }
 906
 907                 /* Apply limits and free ones without config. */
 908                 if (has_io) {
 909                         CGroupIODeviceLimit *l, *next;
 910
 911                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 912                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 913                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 914
 915                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 916                                         cgroup_context_free_io_device_limit(c, l);
 917                         }
 918                 } else if (has_blockio) {
 919                         CGroupBlockIODeviceBandwidth *b, *next;
 920
 921                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 922                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 923                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 924                 }
 925         }
 926
 927         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 928                 if (cg_all_unified() > 0) {
 929                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 930
 931                         if (cgroup_context_has_unified_memory_config(c)) {
 932                                 max = c->memory_max;
 933                                 swap_max = c->memory_swap_max;
 934                         } else {
 935                                 max = c->memory_limit;
 936
 937                                 if (max != CGROUP_LIMIT_MAX)
 938                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 939                         }
 940
 941                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 942                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 943                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 944                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 945                 } else {
 946                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 947                         uint64_t val;
 948
 949                         if (cgroup_context_has_unified_memory_config(c)) {
 950                                 val = c->memory_max;
 951                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 952                         } else
 953                                 val = c->memory_limit;
 954
 955                         if (val == CGROUP_LIMIT_MAX)
 956                                 strncpy(buf, "-1\n", sizeof(buf));
 957                         else
 958                                 xsprintf(buf, "%" PRIu64 "\n", val);
 959
 960                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 961                         if (r < 0)
 962                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 963                                               "Failed to set memory.limit_in_bytes: %m");
 964                 }
 965         }
 966
 967         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 968                 CGroupDeviceAllow *a;
 969
 970                 /* Changing the devices list of a populated cgroup
 971                  * might result in EINVAL, hence ignore EINVAL
 972                  * here. */
 973
 974                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 975                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 976                 else
 977                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 978                 if (r < 0)
 979                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 980                                       "Failed to reset devices.list: %m");
 981
 982                 if (c->device_policy == CGROUP_CLOSED ||
 983                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 984                         static const char auto_devices[] =
 985                                 "/dev/null\0" "rwm\0"
 986                                 "/dev/zero\0" "rwm\0"
 987                                 "/dev/full\0" "rwm\0"
 988                                 "/dev/random\0" "rwm\0"
 989                                 "/dev/urandom\0" "rwm\0"
 990                                 "/dev/tty\0" "rwm\0"
 991                                 "/dev/ptmx\0" "rwm\0"
 992                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 993                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
 994                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
 995
 996                         const char *x, *y;
 997
 998                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 999                                 whitelist_device(path, x, y);
1000
1001                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1002                         whitelist_major(path, "pts", 'c', "rw");
1003                 }
1004
1005                 LIST_FOREACH(device_allow, a, c->device_allow) {
1006                         char acc[4], *val;
1007                         unsigned k = 0;
1008
1009                         if (a->r)
1010                                 acc[k++] = 'r';
1011                         if (a->w)
1012                                 acc[k++] = 'w';
1013                         if (a->m)
1014                                 acc[k++] = 'm';
1015
1016                         if (k == 0)
1017                                 continue;
1018
1019                         acc[k++] = 0;
1020
1021                         if (path_startswith(a->path, "/dev/"))
1022                                 whitelist_device(path, a->path, acc);
1023                         else if ((val = startswith(a->path, "block-")))
1024                                 whitelist_major(path, val, 'b', acc);
1025                         else if ((val = startswith(a->path, "char-")))
1026                                 whitelist_major(path, val, 'c', acc);
1027                         else
1028                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1029                 }
1030         }
1031
1032         if (apply_mask & CGROUP_MASK_PIDS) {
1033
1034                 if (is_root) {
1035                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1036                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1037                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1038                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1039                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1040                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1041                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1042                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1043                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1044                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1045                          * which is desirable so that there's an official way to release control of the sysctl from
1046                          * systemd: set the limit to unbounded and reload. */
1047
1048                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1049                                 u->manager->sysctl_pid_max_changed = true;
1050                                 r = procfs_tasks_set_limit(c->tasks_max);
1051                         } else if (u->manager->sysctl_pid_max_changed)
1052                                 r = procfs_tasks_set_limit(TASKS_MAX);
1053                         else
1054                                 r = 0;
1055
1056                         if (r < 0)
1057                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1058                                               "Failed to write to tasks limit sysctls: %m");
1059
1060                 } else {
1061                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1062                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1063
1064                                 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1065                                 r = cg_set_attribute("pids", path, "pids.max", buf);
1066                         } else
1067                                 r = cg_set_attribute("pids", path, "pids.max", "max");
1068                         if (r < 0)
1069                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1070                                               "Failed to set pids.max: %m");
1071                 }
1072         }
1073
1074         if (apply_bpf)
1075                 cgroup_apply_firewall(u);
1076 }
1077
1078 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1079         CGroupMask mask = 0;
1080
1081         /* Figure out which controllers we need */
1082
1083         if (c->cpu_accounting ||
1084             cgroup_context_has_cpu_weight(c) ||
1085             cgroup_context_has_cpu_shares(c) ||
1086             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1087                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1088
1089         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1090                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1091
1092         if (c->memory_accounting ||
1093             c->memory_limit != CGROUP_LIMIT_MAX ||
1094             cgroup_context_has_unified_memory_config(c))
1095                 mask |= CGROUP_MASK_MEMORY;
1096
1097         if (c->device_allow ||
1098             c->device_policy != CGROUP_AUTO)
1099                 mask |= CGROUP_MASK_DEVICES;
1100
1101         if (c->tasks_accounting ||
1102             c->tasks_max != CGROUP_LIMIT_MAX)
1103                 mask |= CGROUP_MASK_PIDS;
1104
1105         return mask;
1106 }
1107
1108 CGroupMask unit_get_own_mask(Unit *u) {
1109         CGroupContext *c;
1110
1111         /* Returns the mask of controllers the unit needs for itself */
1112
1113         c = unit_get_cgroup_context(u);
1114         if (!c)
1115                 return 0;
1116
1117         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1118 }
1119
1120 CGroupMask unit_get_delegate_mask(Unit *u) {
1121         CGroupContext *c;
1122
1123         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1124          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1125          *
1126          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1127
1128         if (u->type == UNIT_SLICE)
1129                 return 0;
1130
1131         c = unit_get_cgroup_context(u);
1132         if (!c)
1133                 return 0;
1134
1135         if (!c->delegate)
1136                 return 0;
1137
1138         if (cg_all_unified() <= 0) {
1139                 ExecContext *e;
1140
1141                 e = unit_get_exec_context(u);
1142                 if (e && !exec_context_maintains_privileges(e))
1143                         return 0;
1144         }
1145
1146         return c->delegate_controllers;
1147 }
1148
1149 CGroupMask unit_get_members_mask(Unit *u) {
1150         assert(u);
1151
1152         /* Returns the mask of controllers all of the unit's children require, merged */
1153
1154         if (u->cgroup_members_mask_valid)
1155                 return u->cgroup_members_mask;
1156
1157         u->cgroup_members_mask = 0;
1158
1159         if (u->type == UNIT_SLICE) {
1160                 void *v;
1161                 Unit *member;
1162                 Iterator i;
1163
1164                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1165
1166                         if (member == u)
1167                                 continue;
1168
1169                         if (UNIT_DEREF(member->slice) != u)
1170                                 continue;
1171
1172                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1173                 }
1174         }
1175
1176         u->cgroup_members_mask_valid = true;
1177         return u->cgroup_members_mask;
1178 }
1179
1180 CGroupMask unit_get_siblings_mask(Unit *u) {
1181         assert(u);
1182
1183         /* Returns the mask of controllers all of the unit's siblings
1184          * require, i.e. the members mask of the unit's parent slice
1185          * if there is one. */
1186
1187         if (UNIT_ISSET(u->slice))
1188                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1189
1190         return unit_get_subtree_mask(u); /* we are the top-level slice */
1191 }
1192
1193 CGroupMask unit_get_subtree_mask(Unit *u) {
1194
1195         /* Returns the mask of this subtree, meaning of the group
1196          * itself and its children. */
1197
1198         return unit_get_own_mask(u) | unit_get_members_mask(u);
1199 }
1200
1201 CGroupMask unit_get_target_mask(Unit *u) {
1202         CGroupMask mask;
1203
1204         /* This returns the cgroup mask of all controllers to enable
1205          * for a specific cgroup, i.e. everything it needs itself,
1206          * plus all that its children need, plus all that its siblings
1207          * need. This is primarily useful on the legacy cgroup
1208          * hierarchy, where we need to duplicate each cgroup in each
1209          * hierarchy that shall be enabled for it. */
1210
1211         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1212         mask &= u->manager->cgroup_supported;
1213
1214         return mask;
1215 }
1216
1217 CGroupMask unit_get_enable_mask(Unit *u) {
1218         CGroupMask mask;
1219
1220         /* This returns the cgroup mask of all controllers to enable
1221          * for the children of a specific cgroup. This is primarily
1222          * useful for the unified cgroup hierarchy, where each cgroup
1223          * controls which controllers are enabled for its children. */
1224
1225         mask = unit_get_members_mask(u);
1226         mask &= u->manager->cgroup_supported;
1227
1228         return mask;
1229 }
1230
1231 bool unit_get_needs_bpf(Unit *u) {
1232         CGroupContext *c;
1233         Unit *p;
1234         assert(u);
1235
1236         /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1237          * moment. */
1238         if (u->type == UNIT_SLICE)
1239                 return false;
1240
1241         c = unit_get_cgroup_context(u);
1242         if (!c)
1243                 return false;
1244
1245         if (c->ip_accounting ||
1246             c->ip_address_allow ||
1247             c->ip_address_deny)
1248                 return true;
1249
1250         /* If any parent slice has an IP access list defined, it applies too */
1251         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1252                 c = unit_get_cgroup_context(p);
1253                 if (!c)
1254                         return false;
1255
1256                 if (c->ip_address_allow ||
1257                     c->ip_address_deny)
1258                         return true;
1259         }
1260
1261         return false;
1262 }
1263
1264 /* Recurse from a unit up through its containing slices, propagating
1265  * mask bits upward. A unit is also member of itself. */
1266 void unit_update_cgroup_members_masks(Unit *u) {
1267         CGroupMask m;
1268         bool more;
1269
1270         assert(u);
1271
1272         /* Calculate subtree mask */
1273         m = unit_get_subtree_mask(u);
1274
1275         /* See if anything changed from the previous invocation. If
1276          * not, we're done. */
1277         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1278                 return;
1279
1280         more =
1281                 u->cgroup_subtree_mask_valid &&
1282                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1283                 ((~m & u->cgroup_subtree_mask) == 0);
1284
1285         u->cgroup_subtree_mask = m;
1286         u->cgroup_subtree_mask_valid = true;
1287
1288         if (UNIT_ISSET(u->slice)) {
1289                 Unit *s = UNIT_DEREF(u->slice);
1290
1291                 if (more)
1292                         /* There's more set now than before. We
1293                          * propagate the new mask to the parent's mask
1294                          * (not caring if it actually was valid or
1295                          * not). */
1296
1297                         s->cgroup_members_mask |= m;
1298
1299                 else
1300                         /* There's less set now than before (or we
1301                          * don't know), we need to recalculate
1302                          * everything, so let's invalidate the
1303                          * parent's members mask */
1304
1305                         s->cgroup_members_mask_valid = false;
1306
1307                 /* And now make sure that this change also hits our
1308                  * grandparents */
1309                 unit_update_cgroup_members_masks(s);
1310         }
1311 }
1312
1313 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1314         Unit *u = userdata;
1315
1316         assert(mask != 0);
1317         assert(u);
1318
1319         while (u) {
1320                 if (u->cgroup_path &&
1321                     u->cgroup_realized &&
1322                     (u->cgroup_realized_mask & mask) == mask)
1323                         return u->cgroup_path;
1324
1325                 u = UNIT_DEREF(u->slice);
1326         }
1327
1328         return NULL;
1329 }
1330
1331 char *unit_default_cgroup_path(Unit *u) {
1332         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1333         int r;
1334
1335         assert(u);
1336
1337         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1338                 return strdup(u->manager->cgroup_root);
1339
1340         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1341                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1342                 if (r < 0)
1343                         return NULL;
1344         }
1345
1346         escaped = cg_escape(u->id);
1347         if (!escaped)
1348                 return NULL;
1349
1350         if (slice)
1351                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1352                                escaped);
1353         else
1354                 return strjoin(u->manager->cgroup_root, "/", escaped);
1355 }
1356
1357 int unit_set_cgroup_path(Unit *u, const char *path) {
1358         _cleanup_free_ char *p = NULL;
1359         int r;
1360
1361         assert(u);
1362
1363         if (path) {
1364                 p = strdup(path);
1365                 if (!p)
1366                         return -ENOMEM;
1367         } else
1368                 p = NULL;
1369
1370         if (streq_ptr(u->cgroup_path, p))
1371                 return 0;
1372
1373         if (p) {
1374                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1375                 if (r < 0)
1376                         return r;
1377         }
1378
1379         unit_release_cgroup(u);
1380
1381         u->cgroup_path = p;
1382         p = NULL;
1383
1384         return 1;
1385 }
1386
1387 int unit_watch_cgroup(Unit *u) {
1388         _cleanup_free_ char *events = NULL;
1389         int r;
1390
1391         assert(u);
1392
1393         if (!u->cgroup_path)
1394                 return 0;
1395
1396         if (u->cgroup_inotify_wd >= 0)
1397                 return 0;
1398
1399         /* Only applies to the unified hierarchy */
1400         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1401         if (r < 0)
1402                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1403         if (r == 0)
1404                 return 0;
1405
1406         /* Don't watch the root slice, it's pointless. */
1407         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1408                 return 0;
1409
1410         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1411         if (r < 0)
1412                 return log_oom();
1413
1414         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1415         if (r < 0)
1416                 return log_oom();
1417
1418         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1419         if (u->cgroup_inotify_wd < 0) {
1420
1421                 if (errno == ENOENT) /* If the directory is already
1422                                       * gone we don't need to track
1423                                       * it, so this is not an error */
1424                         return 0;
1425
1426                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1427         }
1428
1429         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1430         if (r < 0)
1431                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1432
1433         return 0;
1434 }
1435
1436 int unit_pick_cgroup_path(Unit *u) {
1437         _cleanup_free_ char *path = NULL;
1438         int r;
1439
1440         assert(u);
1441
1442         if (u->cgroup_path)
1443                 return 0;
1444
1445         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1446                 return -EINVAL;
1447
1448         path = unit_default_cgroup_path(u);
1449         if (!path)
1450                 return log_oom();
1451
1452         r = unit_set_cgroup_path(u, path);
1453         if (r == -EEXIST)
1454                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1455         if (r < 0)
1456                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1457
1458         return 0;
1459 }
1460
1461 static int unit_create_cgroup(
1462                 Unit *u,
1463                 CGroupMask target_mask,
1464                 CGroupMask enable_mask,
1465                 bool needs_bpf) {
1466
1467         CGroupContext *c;
1468         int r;
1469
1470         assert(u);
1471
1472         c = unit_get_cgroup_context(u);
1473         if (!c)
1474                 return 0;
1475
1476         /* Figure out our cgroup path */
1477         r = unit_pick_cgroup_path(u);
1478         if (r < 0)
1479                 return r;
1480
1481         /* First, create our own group */
1482         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1483         if (r < 0)
1484                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1485
1486         /* Start watching it */
1487         (void) unit_watch_cgroup(u);
1488
1489         /* Enable all controllers we need */
1490         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1491         if (r < 0)
1492                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1493
1494         /* Keep track that this is now realized */
1495         u->cgroup_realized = true;
1496         u->cgroup_realized_mask = target_mask;
1497         u->cgroup_enabled_mask = enable_mask;
1498         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1499
1500         if (u->type != UNIT_SLICE && !c->delegate) {
1501
1502                 /* Then, possibly move things over, but not if
1503                  * subgroups may contain processes, which is the case
1504                  * for slice and delegation units. */
1505                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1506                 if (r < 0)
1507                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1508         }
1509
1510         return 0;
1511 }
1512
1513 int unit_attach_pids_to_cgroup(Unit *u) {
1514         int r;
1515         assert(u);
1516
1517         r = unit_realize_cgroup(u);
1518         if (r < 0)
1519                 return r;
1520
1521         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1522         if (r < 0)
1523                 return r;
1524
1525         return 0;
1526 }
1527
1528 static void cgroup_xattr_apply(Unit *u) {
1529         char ids[SD_ID128_STRING_MAX];
1530         int r;
1531
1532         assert(u);
1533
1534         if (!MANAGER_IS_SYSTEM(u->manager))
1535                 return;
1536
1537         if (sd_id128_is_null(u->invocation_id))
1538                 return;
1539
1540         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1541                          "trusted.invocation_id",
1542                          sd_id128_to_string(u->invocation_id, ids), 32,
1543                          0);
1544         if (r < 0)
1545                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1546 }
1547
1548 static bool unit_has_mask_realized(
1549                 Unit *u,
1550                 CGroupMask target_mask,
1551                 CGroupMask enable_mask,
1552                 bool needs_bpf) {
1553
1554         assert(u);
1555
1556         return u->cgroup_realized &&
1557                 u->cgroup_realized_mask == target_mask &&
1558                 u->cgroup_enabled_mask == enable_mask &&
1559                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1560                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1561 }
1562
1563 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1564         assert(u);
1565
1566         if (u->in_cgroup_realize_queue)
1567                 return;
1568
1569         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1570         u->in_cgroup_realize_queue = true;
1571 }
1572
1573 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1574         assert(u);
1575
1576         if (!u->in_cgroup_realize_queue)
1577                 return;
1578
1579         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1580         u->in_cgroup_realize_queue = false;
1581 }
1582
1583
1584 /* Check if necessary controllers and attributes for a unit are in place.
1585  *
1586  * If so, do nothing.
1587  * If not, create paths, move processes over, and set attributes.
1588  *
1589  * Returns 0 on success and < 0 on failure. */
1590 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1591         CGroupMask target_mask, enable_mask;
1592         bool needs_bpf, apply_bpf;
1593         int r;
1594
1595         assert(u);
1596
1597         unit_remove_from_cgroup_realize_queue(u);
1598
1599         target_mask = unit_get_target_mask(u);
1600         enable_mask = unit_get_enable_mask(u);
1601         needs_bpf = unit_get_needs_bpf(u);
1602
1603         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1604                 return 0;
1605
1606         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1607          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1608          * this will trickle down properly to cgroupfs. */
1609         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1610
1611         /* First, realize parents */
1612         if (UNIT_ISSET(u->slice)) {
1613                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1614                 if (r < 0)
1615                         return r;
1616         }
1617
1618         /* And then do the real work */
1619         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1620         if (r < 0)
1621                 return r;
1622
1623         /* Finally, apply the necessary attributes. */
1624         cgroup_context_apply(u, target_mask, apply_bpf, state);
1625         cgroup_xattr_apply(u);
1626
1627         return 0;
1628 }
1629
1630 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1631         ManagerState state;
1632         unsigned n = 0;
1633         Unit *i;
1634         int r;
1635
1636         assert(m);
1637
1638         state = manager_state(m);
1639
1640         while ((i = m->cgroup_realize_queue)) {
1641                 assert(i->in_cgroup_realize_queue);
1642
1643                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1644                         /* Maybe things changed, and the unit is not actually active anymore? */
1645                         unit_remove_from_cgroup_realize_queue(i);
1646                         continue;
1647                 }
1648
1649                 r = unit_realize_cgroup_now(i, state);
1650                 if (r < 0)
1651                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1652
1653                 n++;
1654         }
1655
1656         return n;
1657 }
1658
1659 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1660         Unit *slice;
1661
1662         /* This adds the siblings of the specified unit and the
1663          * siblings of all parent units to the cgroup queue. (But
1664          * neither the specified unit itself nor the parents.) */
1665
1666         while ((slice = UNIT_DEREF(u->slice))) {
1667                 Iterator i;
1668                 Unit *m;
1669                 void *v;
1670
1671                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1672                         if (m == u)
1673                                 continue;
1674
1675                         /* Skip units that have a dependency on the slice
1676                          * but aren't actually in it. */
1677                         if (UNIT_DEREF(m->slice) != slice)
1678                                 continue;
1679
1680                         /* No point in doing cgroup application for units
1681                          * without active processes. */
1682                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1683                                 continue;
1684
1685                         /* If the unit doesn't need any new controllers
1686                          * and has current ones realized, it doesn't need
1687                          * any changes. */
1688                         if (unit_has_mask_realized(m,
1689                                                    unit_get_target_mask(m),
1690                                                    unit_get_enable_mask(m),
1691                                                    unit_get_needs_bpf(m)))
1692                                 continue;
1693
1694                         unit_add_to_cgroup_realize_queue(m);
1695                 }
1696
1697                 u = slice;
1698         }
1699 }
1700
1701 int unit_realize_cgroup(Unit *u) {
1702         assert(u);
1703
1704         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1705                 return 0;
1706
1707         /* So, here's the deal: when realizing the cgroups for this
1708          * unit, we need to first create all parents, but there's more
1709          * actually: for the weight-based controllers we also need to
1710          * make sure that all our siblings (i.e. units that are in the
1711          * same slice as we are) have cgroups, too. Otherwise, things
1712          * would become very uneven as each of their processes would
1713          * get as much resources as all our group together. This call
1714          * will synchronously create the parent cgroups, but will
1715          * defer work on the siblings to the next event loop
1716          * iteration. */
1717
1718         /* Add all sibling slices to the cgroup queue. */
1719         unit_add_siblings_to_cgroup_realize_queue(u);
1720
1721         /* And realize this one now (and apply the values) */
1722         return unit_realize_cgroup_now(u, manager_state(u->manager));
1723 }
1724
1725 void unit_release_cgroup(Unit *u) {
1726         assert(u);
1727
1728         /* Forgets all cgroup details for this cgroup */
1729
1730         if (u->cgroup_path) {
1731                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1732                 u->cgroup_path = mfree(u->cgroup_path);
1733         }
1734
1735         if (u->cgroup_inotify_wd >= 0) {
1736                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1737                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1738
1739                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1740                 u->cgroup_inotify_wd = -1;
1741         }
1742 }
1743
1744 void unit_prune_cgroup(Unit *u) {
1745         int r;
1746         bool is_root_slice;
1747
1748         assert(u);
1749
1750         /* Removes the cgroup, if empty and possible, and stops watching it. */
1751
1752         if (!u->cgroup_path)
1753                 return;
1754
1755         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1756
1757         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1758
1759         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1760         if (r < 0) {
1761                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1762                 return;
1763         }
1764
1765         if (is_root_slice)
1766                 return;
1767
1768         unit_release_cgroup(u);
1769
1770         u->cgroup_realized = false;
1771         u->cgroup_realized_mask = 0;
1772         u->cgroup_enabled_mask = 0;
1773 }
1774
1775 int unit_search_main_pid(Unit *u, pid_t *ret) {
1776         _cleanup_fclose_ FILE *f = NULL;
1777         pid_t pid = 0, npid, mypid;
1778         int r;
1779
1780         assert(u);
1781         assert(ret);
1782
1783         if (!u->cgroup_path)
1784                 return -ENXIO;
1785
1786         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1787         if (r < 0)
1788                 return r;
1789
1790         mypid = getpid_cached();
1791         while (cg_read_pid(f, &npid) > 0)  {
1792                 pid_t ppid;
1793
1794                 if (npid == pid)
1795                         continue;
1796
1797                 /* Ignore processes that aren't our kids */
1798                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1799                         continue;
1800
1801                 if (pid != 0)
1802                         /* Dang, there's more than one daemonized PID
1803                         in this group, so we don't know what process
1804                         is the main process. */
1805
1806                         return -ENODATA;
1807
1808                 pid = npid;
1809         }
1810
1811         *ret = pid;
1812         return 0;
1813 }
1814
1815 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1816         _cleanup_closedir_ DIR *d = NULL;
1817         _cleanup_fclose_ FILE *f = NULL;
1818         int ret = 0, r;
1819
1820         assert(u);
1821         assert(path);
1822
1823         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1824         if (r < 0)
1825                 ret = r;
1826         else {
1827                 pid_t pid;
1828
1829                 while ((r = cg_read_pid(f, &pid)) > 0) {
1830                         r = unit_watch_pid(u, pid);
1831                         if (r < 0 && ret >= 0)
1832                                 ret = r;
1833                 }
1834
1835                 if (r < 0 && ret >= 0)
1836                         ret = r;
1837         }
1838
1839         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1840         if (r < 0) {
1841                 if (ret >= 0)
1842                         ret = r;
1843         } else {
1844                 char *fn;
1845
1846                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1847                         _cleanup_free_ char *p = NULL;
1848
1849                         p = strjoin(path, "/", fn);
1850                         free(fn);
1851
1852                         if (!p)
1853                                 return -ENOMEM;
1854
1855                         r = unit_watch_pids_in_path(u, p);
1856                         if (r < 0 && ret >= 0)
1857                                 ret = r;
1858                 }
1859
1860                 if (r < 0 && ret >= 0)
1861                         ret = r;
1862         }
1863
1864         return ret;
1865 }
1866
1867 int unit_watch_all_pids(Unit *u) {
1868         int r;
1869
1870         assert(u);
1871
1872         /* Adds all PIDs from our cgroup to the set of PIDs we
1873          * watch. This is a fallback logic for cases where we do not
1874          * get reliable cgroup empty notifications: we try to use
1875          * SIGCHLD as replacement. */
1876
1877         if (!u->cgroup_path)
1878                 return -ENOENT;
1879
1880         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1881         if (r < 0)
1882                 return r;
1883         if (r > 0) /* On unified we can use proper notifications */
1884                 return 0;
1885
1886         return unit_watch_pids_in_path(u, u->cgroup_path);
1887 }
1888
1889 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1890         Manager *m = userdata;
1891         Unit *u;
1892         int r;
1893
1894         assert(s);
1895         assert(m);
1896
1897         u = m->cgroup_empty_queue;
1898         if (!u)
1899                 return 0;
1900
1901         assert(u->in_cgroup_empty_queue);
1902         u->in_cgroup_empty_queue = false;
1903         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1904
1905         if (m->cgroup_empty_queue) {
1906                 /* More stuff queued, let's make sure we remain enabled */
1907                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1908                 if (r < 0)
1909                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1910         }
1911
1912         unit_add_to_gc_queue(u);
1913
1914         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1915                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1916
1917         return 0;
1918 }
1919
1920 void unit_add_to_cgroup_empty_queue(Unit *u) {
1921         int r;
1922
1923         assert(u);
1924
1925         /* Note that there are four different ways how cgroup empty events reach us:
1926          *
1927          * 1. On the unified hierarchy we get an inotify event on the cgroup
1928          *
1929          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1930          *
1931          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1932          *
1933          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1934          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1935          *
1936          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1937          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1938          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1939          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1940          * case for scope units). */
1941
1942         if (u->in_cgroup_empty_queue)
1943                 return;
1944
1945         /* Let's verify that the cgroup is really empty */
1946         if (!u->cgroup_path)
1947                 return;
1948         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1949         if (r < 0) {
1950                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1951                 return;
1952         }
1953         if (r == 0)
1954                 return;
1955
1956         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1957         u->in_cgroup_empty_queue = true;
1958
1959         /* Trigger the defer event */
1960         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1961         if (r < 0)
1962                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1963 }
1964
1965 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1966         Manager *m = userdata;
1967
1968         assert(s);
1969         assert(fd >= 0);
1970         assert(m);
1971
1972         for (;;) {
1973                 union inotify_event_buffer buffer;
1974                 struct inotify_event *e;
1975                 ssize_t l;
1976
1977                 l = read(fd, &buffer, sizeof(buffer));
1978                 if (l < 0) {
1979                         if (IN_SET(errno, EINTR, EAGAIN))
1980                                 return 0;
1981
1982                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1983                 }
1984
1985                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1986                         Unit *u;
1987
1988                         if (e->wd < 0)
1989                                 /* Queue overflow has no watch descriptor */
1990                                 continue;
1991
1992                         if (e->mask & IN_IGNORED)
1993                                 /* The watch was just removed */
1994                                 continue;
1995
1996                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1997                         if (!u) /* Not that inotify might deliver
1998                                  * events for a watch even after it
1999                                  * was removed, because it was queued
2000                                  * before the removal. Let's ignore
2001                                  * this here safely. */
2002                                 continue;
2003
2004                         unit_add_to_cgroup_empty_queue(u);
2005                 }
2006         }
2007 }
2008 #endif // 0
2009
2010 int manager_setup_cgroup(Manager *m) {
2011         _cleanup_free_ char *path = NULL;
2012         const char *scope_path;
2013         CGroupController c;
2014         int r, all_unified;
2015 #if 0 /// UNNEEDED by elogind
2016         char *e;
2017 #endif // 0
2018
2019         assert(m);
2020
2021         /* 1. Determine hierarchy */
2022         m->cgroup_root = mfree(m->cgroup_root);
2023 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2024         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2025 #else
2026         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2027 #endif // 0
2028         if (r < 0)
2029                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2030
2031 #if 0 /// elogind does not support systemd scopes and slices
2032         /* Chop off the init scope, if we are already located in it */
2033         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2034
2035         /* LEGACY: Also chop off the system slice if we are in
2036          * it. This is to support live upgrades from older systemd
2037          * versions where PID 1 was moved there. Also see
2038          * cg_get_root_path(). */
2039         if (!e && MANAGER_IS_SYSTEM(m)) {
2040                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2041                 if (!e)
2042                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2043         }
2044         if (e)
2045                 *e = 0;
2046 #endif // 0
2047
2048         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2049                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2050         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2051          * easily prepend it everywhere. */
2052         delete_trailing_chars(m->cgroup_root, "/");
2053
2054         /* 2. Show data */
2055         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2056         if (r < 0)
2057                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2058
2059         r = cg_unified_flush();
2060         if (r < 0)
2061                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2062
2063         all_unified = cg_all_unified();
2064         if (all_unified < 0)
2065                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2066         if (all_unified > 0)
2067                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2068         else {
2069                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2070                 if (r < 0)
2071                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2072                 if (r > 0)
2073                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2074                 else
2075                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2076         }
2077
2078 #if 0 /// elogind is not init, and does not install the agent here.
2079         /* 3. Allocate cgroup empty defer event source */
2080         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2081         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2082         if (r < 0)
2083                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2084
2085         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2086         if (r < 0)
2087                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2088
2089         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2090         if (r < 0)
2091                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2092
2093         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2094
2095         /* 4. Install notifier inotify object, or agent */
2096         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2097
2098                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2099
2100                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2101                 safe_close(m->cgroup_inotify_fd);
2102
2103                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2104                 if (m->cgroup_inotify_fd < 0)
2105                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2106
2107                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2108                 if (r < 0)
2109                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2110
2111                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2112                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2113                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2114                 if (r < 0)
2115                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2116
2117                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2118
2119         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2120
2121                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2122                  * since it does not generate events when control groups with children run empty. */
2123
2124                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2125                 if (r < 0)
2126                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2127                 else if (r > 0)
2128                         log_debug("Installed release agent.");
2129                 else if (r == 0)
2130                         log_debug("Release agent already installed.");
2131         }
2132
2133         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2134         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2135         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2136 #else
2137         /* Note:
2138                 * This method is in core, and normally called by systemd
2139                 * being init. As elogind is never init, we can not install
2140                 * our agent here. We do so when mounting our cgroup file
2141                 * system, so only if elogind is its own tiny controller.
2142                 * Further, elogind is not meant to run in systemd init scope. */
2143         if (MANAGER_IS_SYSTEM(m))
2144                 // we are our own cgroup controller
2145                 scope_path = strjoina("");
2146         else if (streq(m->cgroup_root, "/elogind"))
2147                 // root already is our cgroup
2148                 scope_path = strjoina(m->cgroup_root);
2149         else
2150                 // we have to create our own group
2151                 scope_path = strjoina(m->cgroup_root, "/elogind");
2152         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2153 #endif // 0
2154         if (r < 0)
2155                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2156         log_debug_elogind("Created control group \"%s\"", scope_path);
2157
2158 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2159         /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2160         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2161         if (r < 0)
2162                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2163 #endif // 0
2164
2165         /* 6. And pin it, so that it cannot be unmounted */
2166         safe_close(m->pin_cgroupfs_fd);
2167         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2168         if (m->pin_cgroupfs_fd < 0)
2169                 return log_error_errno(errno, "Failed to open pin file: %m");
2170
2171         /* 7. Always enable hierarchical support if it exists... */
2172         if (!all_unified && m->test_run_flags == 0)
2173                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2174
2175         /* 8. Figure out which controllers are supported, and log about it */
2176         r = cg_mask_supported(&m->cgroup_supported);
2177         if (r < 0)
2178                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2179         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2180                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2181
2182         return 0;
2183 }
2184
2185 void manager_shutdown_cgroup(Manager *m, bool delete) {
2186         assert(m);
2187
2188         /* We can't really delete the group, since we are in it. But
2189          * let's trim it. */
2190         if (delete && m->cgroup_root)
2191                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2192
2193 #if 0 /// elogind is not init
2194         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2195
2196         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2197
2198         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2199         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2200 #endif // 0
2201
2202         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2203
2204         m->cgroup_root = mfree(m->cgroup_root);
2205 }
2206
2207 #if 0 /// UNNEEDED by elogind
2208 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2209         char *p;
2210         Unit *u;
2211
2212         assert(m);
2213         assert(cgroup);
2214
2215         u = hashmap_get(m->cgroup_unit, cgroup);
2216         if (u)
2217                 return u;
2218
2219         p = strdupa(cgroup);
2220         for (;;) {
2221                 char *e;
2222
2223                 e = strrchr(p, '/');
2224                 if (!e || e == p)
2225                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2226
2227                 *e = 0;
2228
2229                 u = hashmap_get(m->cgroup_unit, p);
2230                 if (u)
2231                         return u;
2232         }
2233 }
2234
2235 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2236         _cleanup_free_ char *cgroup = NULL;
2237         int r;
2238
2239         assert(m);
2240
2241         if (pid <= 0)
2242                 return NULL;
2243
2244         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2245         if (r < 0)
2246                 return NULL;
2247
2248         return manager_get_unit_by_cgroup(m, cgroup);
2249 }
2250
2251 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2252         Unit *u;
2253
2254         assert(m);
2255
2256         if (pid <= 0)
2257                 return NULL;
2258
2259         if (pid == 1)
2260                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2261
2262         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2263         if (u)
2264                 return u;
2265
2266         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2267         if (u)
2268                 return u;
2269
2270         return manager_get_unit_by_pid_cgroup(m, pid);
2271 }
2272 #endif // 0
2273
2274 #if 0 /// elogind must substitute this with its own variant
2275 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2276         Unit *u;
2277
2278         assert(m);
2279         assert(cgroup);
2280
2281         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2282          * or from the --system instance */
2283
2284         log_debug("Got cgroup empty notification for: %s", cgroup);
2285
2286         u = manager_get_unit_by_cgroup(m, cgroup);
2287         if (!u)
2288                 return 0;
2289
2290         unit_add_to_cgroup_empty_queue(u);
2291         return 1;
2292 }
2293 #else
2294 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2295         Session *s;
2296
2297         assert(m);
2298         assert(cgroup);
2299
2300         log_debug("Got cgroup empty notification for: %s", cgroup);
2301
2302         s = hashmap_get(m->sessions, cgroup);
2303
2304         if (s) {
2305                 session_finalize(s);
2306                 session_free(s);
2307         } else
2308                 log_warning("Session not found: %s", cgroup);
2309
2310         return 0;
2311 }
2312 #endif // 0
2313 #if 0 /// UNNEEDED by elogind
2314 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2315         _cleanup_free_ char *v = NULL;
2316         int r;
2317
2318         assert(u);
2319         assert(ret);
2320
2321         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2322                 return -ENODATA;
2323
2324         if (!u->cgroup_path)
2325                 return -ENODATA;
2326
2327         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2328                 return -ENODATA;
2329
2330         r = cg_all_unified();
2331         if (r < 0)
2332                 return r;
2333         if (r > 0)
2334                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2335         else
2336                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2337         if (r == -ENOENT)
2338                 return -ENODATA;
2339         if (r < 0)
2340                 return r;
2341
2342         return safe_atou64(v, ret);
2343 }
2344
2345 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2346         _cleanup_free_ char *v = NULL;
2347         int r;
2348
2349         assert(u);
2350         assert(ret);
2351
2352         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2353                 return -ENODATA;
2354
2355         if (!u->cgroup_path)
2356                 return -ENODATA;
2357
2358         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2359                 return -ENODATA;
2360
2361         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2362         if (unit_has_root_cgroup(u))
2363                 return procfs_tasks_get_current(ret);
2364
2365         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2366         if (r == -ENOENT)
2367                 return -ENODATA;
2368         if (r < 0)
2369                 return r;
2370
2371         return safe_atou64(v, ret);
2372 }
2373
2374 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2375         _cleanup_free_ char *v = NULL;
2376         uint64_t ns;
2377         int r;
2378
2379         assert(u);
2380         assert(ret);
2381
2382         if (!u->cgroup_path)
2383                 return -ENODATA;
2384
2385         r = cg_all_unified();
2386         if (r < 0)
2387                 return r;
2388         if (r > 0) {
2389                 const char *keys[] = { "usage_usec", NULL };
2390                 _cleanup_free_ char *val = NULL;
2391                 uint64_t us;
2392
2393                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2394                         return -ENODATA;
2395
2396                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2397                 if (r < 0)
2398                         return r;
2399
2400                 r = safe_atou64(val, &us);
2401                 if (r < 0)
2402                         return r;
2403
2404                 ns = us * NSEC_PER_USEC;
2405         } else {
2406                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2407                         return -ENODATA;
2408
2409                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2410                 if (r == -ENOENT)
2411                         return -ENODATA;
2412                 if (r < 0)
2413                         return r;
2414
2415                 r = safe_atou64(v, &ns);
2416                 if (r < 0)
2417                         return r;
2418         }
2419
2420         *ret = ns;
2421         return 0;
2422 }
2423
2424 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2425         nsec_t ns;
2426         int r;
2427
2428         assert(u);
2429
2430         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2431          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2432          * call this function with a NULL return value. */
2433
2434         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2435                 return -ENODATA;
2436
2437         r = unit_get_cpu_usage_raw(u, &ns);
2438         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2439                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2440                  * cached value. */
2441
2442                 if (ret)
2443                         *ret = u->cpu_usage_last;
2444                 return 0;
2445         }
2446         if (r < 0)
2447                 return r;
2448
2449         if (ns > u->cpu_usage_base)
2450                 ns -= u->cpu_usage_base;
2451         else
2452                 ns = 0;
2453
2454         u->cpu_usage_last = ns;
2455         if (ret)
2456                 *ret = ns;
2457
2458         return 0;
2459 }
2460
2461 int unit_get_ip_accounting(
2462                 Unit *u,
2463                 CGroupIPAccountingMetric metric,
2464                 uint64_t *ret) {
2465
2466         uint64_t value;
2467         int fd, r;
2468
2469         assert(u);
2470         assert(metric >= 0);
2471         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2472         assert(ret);
2473
2474         /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2475          * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2476          * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2477          * filters. */
2478         if (u->type == UNIT_SLICE)
2479                 return -ENODATA;
2480
2481         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2482                 return -ENODATA;
2483
2484         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2485                 u->ip_accounting_ingress_map_fd :
2486                 u->ip_accounting_egress_map_fd;
2487         if (fd < 0)
2488                 return -ENODATA;
2489
2490         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2491                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2492         else
2493                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2494         if (r < 0)
2495                 return r;
2496
2497         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2498          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2499          * ip_accounting_extra[] field, and add them in here transparently. */
2500
2501         *ret = value + u->ip_accounting_extra[metric];
2502
2503         return r;
2504 }
2505
2506 int unit_reset_cpu_accounting(Unit *u) {
2507         nsec_t ns;
2508         int r;
2509
2510         assert(u);
2511
2512         u->cpu_usage_last = NSEC_INFINITY;
2513
2514         r = unit_get_cpu_usage_raw(u, &ns);
2515         if (r < 0) {
2516                 u->cpu_usage_base = 0;
2517                 return r;
2518         }
2519
2520         u->cpu_usage_base = ns;
2521         return 0;
2522 }
2523
2524 int unit_reset_ip_accounting(Unit *u) {
2525         int r = 0, q = 0;
2526
2527         assert(u);
2528
2529         if (u->ip_accounting_ingress_map_fd >= 0)
2530                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2531
2532         if (u->ip_accounting_egress_map_fd >= 0)
2533                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2534
2535         zero(u->ip_accounting_extra);
2536
2537         return r < 0 ? r : q;
2538 }
2539
2540 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2541         assert(u);
2542
2543         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2544                 return;
2545
2546         if (m == 0)
2547                 return;
2548
2549         /* always invalidate compat pairs together */
2550         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2551                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2552
2553         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2554                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2555
2556         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2557                 return;
2558
2559         u->cgroup_realized_mask &= ~m;
2560         unit_add_to_cgroup_realize_queue(u);
2561 }
2562
2563 void unit_invalidate_cgroup_bpf(Unit *u) {
2564         assert(u);
2565
2566         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2567                 return;
2568
2569         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2570                 return;
2571
2572         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2573         unit_add_to_cgroup_realize_queue(u);
2574
2575         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2576          * list of our children includes our own. */
2577         if (u->type == UNIT_SLICE) {
2578                 Unit *member;
2579                 Iterator i;
2580                 void *v;
2581
2582                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2583                         if (member == u)
2584                                 continue;
2585
2586                         if (UNIT_DEREF(member->slice) != u)
2587                                 continue;
2588
2589                         unit_invalidate_cgroup_bpf(member);
2590                 }
2591         }
2592 }
2593
2594 void manager_invalidate_startup_units(Manager *m) {
2595         Iterator i;
2596         Unit *u;
2597
2598         assert(m);
2599
2600         SET_FOREACH(u, m->startup_units, i)
2601                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2602 }
2603
2604 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2605         [CGROUP_AUTO] = "auto",
2606         [CGROUP_CLOSED] = "closed",
2607         [CGROUP_STRICT] = "strict",
2608 };
2609
2610 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2611 #endif // 0