src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 //#include "bus-error.h"
  28 #include "cgroup-util.h"
  29 #include "cgroup.h"
  30 #include "fd-util.h"
  31 #include "fileio.h"
  32 #include "fs-util.h"
  33 #include "parse-util.h"
  34 #include "path-util.h"
  35 #include "process-util.h"
  36 //#include "procfs-util.h"
  37 //#include "special.h"
  38 #include "stdio-util.h"
  39 #include "string-table.h"
  40 #include "string-util.h"
  41 #include "virt.h"
  42
  43 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  44
  45 bool manager_owns_root_cgroup(Manager *m) {
  46         assert(m);
  47
  48         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
  49          * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's
  50          * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
  51          * we run in any kind of container virtualization. */
  52
  53         if (detect_container() > 0)
  54                 return false;
  55
  56         return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
  57 }
  58
  59 #if 0 /// UNNEEDED by elogind
  60 bool unit_has_root_cgroup(Unit *u) {
  61         assert(u);
  62
  63         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
  64          * the manager manages the root cgroup. */
  65
  66         if (!manager_owns_root_cgroup(u->manager))
  67                 return false;
  68
  69         return unit_has_name(u, SPECIAL_ROOT_SLICE);
  70 }
  71
  72 static void cgroup_compat_warn(void) {
  73         static bool cgroup_compat_warned = false;
  74
  75         if (cgroup_compat_warned)
  76                 return;
  77
  78         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
  79                     "See cgroup-compat debug messages for details.");
  80
  81         cgroup_compat_warned = true;
  82 }
  83
  84 #define log_cgroup_compat(unit, fmt, ...) do {                                  \
  85                 cgroup_compat_warn();                                           \
  86                 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
  87         } while (false)
  88
  89 void cgroup_context_init(CGroupContext *c) {
  90         assert(c);
  91
  92         /* Initialize everything to the kernel defaults, assuming the
  93          * structure is preinitialized to 0 */
  94
  95         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  96         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  97         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  98
  99         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
 100         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
 101
 102         c->memory_high = CGROUP_LIMIT_MAX;
 103         c->memory_max = CGROUP_LIMIT_MAX;
 104         c->memory_swap_max = CGROUP_LIMIT_MAX;
 105
 106         c->memory_limit = CGROUP_LIMIT_MAX;
 107
 108         c->io_weight = CGROUP_WEIGHT_INVALID;
 109         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
 110
 111         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
 112         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
 113
 114         c->tasks_max = (uint64_t) -1;
 115 }
 116
 117 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 118         assert(c);
 119         assert(a);
 120
 121         LIST_REMOVE(device_allow, c->device_allow, a);
 122         free(a->path);
 123         free(a);
 124 }
 125
 126 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 127         assert(c);
 128         assert(w);
 129
 130         LIST_REMOVE(device_weights, c->io_device_weights, w);
 131         free(w->path);
 132         free(w);
 133 }
 134
 135 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 136         assert(c);
 137         assert(l);
 138
 139         LIST_REMOVE(device_limits, c->io_device_limits, l);
 140         free(l->path);
 141         free(l);
 142 }
 143
 144 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 145         assert(c);
 146         assert(w);
 147
 148         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 149         free(w->path);
 150         free(w);
 151 }
 152
 153 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 154         assert(c);
 155         assert(b);
 156
 157         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 158         free(b->path);
 159         free(b);
 160 }
 161
 162 void cgroup_context_done(CGroupContext *c) {
 163         assert(c);
 164
 165         while (c->io_device_weights)
 166                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 167
 168         while (c->io_device_limits)
 169                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 170
 171         while (c->blockio_device_weights)
 172                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 173
 174         while (c->blockio_device_bandwidths)
 175                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 176
 177         while (c->device_allow)
 178                 cgroup_context_free_device_allow(c, c->device_allow);
 179
 180         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 181         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 182 }
 183
 184 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 185         CGroupIODeviceLimit *il;
 186         CGroupIODeviceWeight *iw;
 187         CGroupBlockIODeviceBandwidth *b;
 188         CGroupBlockIODeviceWeight *w;
 189         CGroupDeviceAllow *a;
 190         IPAddressAccessItem *iaai;
 191         char u[FORMAT_TIMESPAN_MAX];
 192
 193         assert(c);
 194         assert(f);
 195
 196         prefix = strempty(prefix);
 197
 198         fprintf(f,
 199                 "%sCPUAccounting=%s\n"
 200                 "%sIOAccounting=%s\n"
 201                 "%sBlockIOAccounting=%s\n"
 202                 "%sMemoryAccounting=%s\n"
 203                 "%sTasksAccounting=%s\n"
 204                 "%sIPAccounting=%s\n"
 205                 "%sCPUWeight=%" PRIu64 "\n"
 206                 "%sStartupCPUWeight=%" PRIu64 "\n"
 207                 "%sCPUShares=%" PRIu64 "\n"
 208                 "%sStartupCPUShares=%" PRIu64 "\n"
 209                 "%sCPUQuotaPerSecSec=%s\n"
 210                 "%sIOWeight=%" PRIu64 "\n"
 211                 "%sStartupIOWeight=%" PRIu64 "\n"
 212                 "%sBlockIOWeight=%" PRIu64 "\n"
 213                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 214                 "%sMemoryLow=%" PRIu64 "\n"
 215                 "%sMemoryHigh=%" PRIu64 "\n"
 216                 "%sMemoryMax=%" PRIu64 "\n"
 217                 "%sMemorySwapMax=%" PRIu64 "\n"
 218                 "%sMemoryLimit=%" PRIu64 "\n"
 219                 "%sTasksMax=%" PRIu64 "\n"
 220                 "%sDevicePolicy=%s\n"
 221                 "%sDelegate=%s\n",
 222                 prefix, yes_no(c->cpu_accounting),
 223                 prefix, yes_no(c->io_accounting),
 224                 prefix, yes_no(c->blockio_accounting),
 225                 prefix, yes_no(c->memory_accounting),
 226                 prefix, yes_no(c->tasks_accounting),
 227                 prefix, yes_no(c->ip_accounting),
 228                 prefix, c->cpu_weight,
 229                 prefix, c->startup_cpu_weight,
 230                 prefix, c->cpu_shares,
 231                 prefix, c->startup_cpu_shares,
 232                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 233                 prefix, c->io_weight,
 234                 prefix, c->startup_io_weight,
 235                 prefix, c->blockio_weight,
 236                 prefix, c->startup_blockio_weight,
 237                 prefix, c->memory_low,
 238                 prefix, c->memory_high,
 239                 prefix, c->memory_max,
 240                 prefix, c->memory_swap_max,
 241                 prefix, c->memory_limit,
 242                 prefix, c->tasks_max,
 243                 prefix, cgroup_device_policy_to_string(c->device_policy),
 244                 prefix, yes_no(c->delegate));
 245
 246         if (c->delegate) {
 247                 _cleanup_free_ char *t = NULL;
 248
 249                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 250
 251                 fprintf(f, "%sDelegateControllers=%s\n",
 252                         prefix,
 253                         strempty(t));
 254         }
 255
 256         LIST_FOREACH(device_allow, a, c->device_allow)
 257                 fprintf(f,
 258                         "%sDeviceAllow=%s %s%s%s\n",
 259                         prefix,
 260                         a->path,
 261                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 262
 263         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 264                 fprintf(f,
 265                         "%sIODeviceWeight=%s %" PRIu64,
 266                         prefix,
 267                         iw->path,
 268                         iw->weight);
 269
 270         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 271                 char buf[FORMAT_BYTES_MAX];
 272                 CGroupIOLimitType type;
 273
 274                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 275                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 276                                 fprintf(f,
 277                                         "%s%s=%s %s\n",
 278                                         prefix,
 279                                         cgroup_io_limit_type_to_string(type),
 280                                         il->path,
 281                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 282         }
 283
 284         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 285                 fprintf(f,
 286                         "%sBlockIODeviceWeight=%s %" PRIu64,
 287                         prefix,
 288                         w->path,
 289                         w->weight);
 290
 291         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 292                 char buf[FORMAT_BYTES_MAX];
 293
 294                 if (b->rbps != CGROUP_LIMIT_MAX)
 295                         fprintf(f,
 296                                 "%sBlockIOReadBandwidth=%s %s\n",
 297                                 prefix,
 298                                 b->path,
 299                                 format_bytes(buf, sizeof(buf), b->rbps));
 300                 if (b->wbps != CGROUP_LIMIT_MAX)
 301                         fprintf(f,
 302                                 "%sBlockIOWriteBandwidth=%s %s\n",
 303                                 prefix,
 304                                 b->path,
 305                                 format_bytes(buf, sizeof(buf), b->wbps));
 306         }
 307
 308         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 309                 _cleanup_free_ char *k = NULL;
 310
 311                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 312                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 313         }
 314
 315         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 316                 _cleanup_free_ char *k = NULL;
 317
 318                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 319                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 320         }
 321 }
 322
 323 static int lookup_block_device(const char *p, dev_t *dev) {
 324         struct stat st;
 325         int r;
 326
 327         assert(p);
 328         assert(dev);
 329
 330         r = stat(p, &st);
 331         if (r < 0)
 332                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 333
 334         if (S_ISBLK(st.st_mode))
 335                 *dev = st.st_rdev;
 336         else if (major(st.st_dev) != 0) {
 337                 /* If this is not a device node then find the block
 338                  * device this file is stored on */
 339                 *dev = st.st_dev;
 340
 341                 /* If this is a partition, try to get the originating
 342                  * block device */
 343                 (void) block_get_whole_disk(*dev, dev);
 344         } else {
 345                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 346                 return -ENODEV;
 347         }
 348
 349         return 0;
 350 }
 351
 352 static int whitelist_device(const char *path, const char *node, const char *acc) {
 353         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 354         struct stat st;
 355         bool ignore_notfound;
 356         int r;
 357
 358         assert(path);
 359         assert(acc);
 360
 361         if (node[0] == '-') {
 362                 /* Non-existent paths starting with "-" must be silently ignored */
 363                 node++;
 364                 ignore_notfound = true;
 365         } else
 366                 ignore_notfound = false;
 367
 368         if (stat(node, &st) < 0) {
 369                 if (errno == ENOENT && ignore_notfound)
 370                         return 0;
 371
 372                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 373         }
 374
 375         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 376                 log_warning("%s is not a device.", node);
 377                 return -ENODEV;
 378         }
 379
 380         sprintf(buf,
 381                 "%c %u:%u %s",
 382                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 383                 major(st.st_rdev), minor(st.st_rdev),
 384                 acc);
 385
 386         r = cg_set_attribute("devices", path, "devices.allow", buf);
 387         if (r < 0)
 388                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 389                                "Failed to set devices.allow on %s: %m", path);
 390
 391         return r;
 392 }
 393
 394 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 395         _cleanup_fclose_ FILE *f = NULL;
 396         char line[LINE_MAX];
 397         bool good = false;
 398         int r;
 399
 400         assert(path);
 401         assert(acc);
 402         assert(IN_SET(type, 'b', 'c'));
 403
 404         f = fopen("/proc/devices", "re");
 405         if (!f)
 406                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 407
 408         FOREACH_LINE(line, f, goto fail) {
 409                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 410                 unsigned maj;
 411
 412                 truncate_nl(line);
 413
 414                 if (type == 'c' && streq(line, "Character devices:")) {
 415                         good = true;
 416                         continue;
 417                 }
 418
 419                 if (type == 'b' && streq(line, "Block devices:")) {
 420                         good = true;
 421                         continue;
 422                 }
 423
 424                 if (isempty(line)) {
 425                         good = false;
 426                         continue;
 427                 }
 428
 429                 if (!good)
 430                         continue;
 431
 432                 p = strstrip(line);
 433
 434                 w = strpbrk(p, WHITESPACE);
 435                 if (!w)
 436                         continue;
 437                 *w = 0;
 438
 439                 r = safe_atou(p, &maj);
 440                 if (r < 0)
 441                         continue;
 442                 if (maj <= 0)
 443                         continue;
 444
 445                 w++;
 446                 w += strspn(w, WHITESPACE);
 447
 448                 if (fnmatch(name, w, 0) != 0)
 449                         continue;
 450
 451                 sprintf(buf,
 452                         "%c %u:* %s",
 453                         type,
 454                         maj,
 455                         acc);
 456
 457                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 458                 if (r < 0)
 459                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 460                                        "Failed to set devices.allow on %s: %m", path);
 461         }
 462
 463         return 0;
 464
 465 fail:
 466         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 467 }
 468
 469 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 470         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 471                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 472 }
 473
 474 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 475         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 476                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 477 }
 478
 479 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 480         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 481             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 482                 return c->startup_cpu_weight;
 483         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 484                 return c->cpu_weight;
 485         else
 486                 return CGROUP_WEIGHT_DEFAULT;
 487 }
 488
 489 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 490         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 491             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 492                 return c->startup_cpu_shares;
 493         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 494                 return c->cpu_shares;
 495         else
 496                 return CGROUP_CPU_SHARES_DEFAULT;
 497 }
 498
 499 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 500         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 501         int r;
 502
 503         xsprintf(buf, "%" PRIu64 "\n", weight);
 504         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 505         if (r < 0)
 506                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 507                               "Failed to set cpu.weight: %m");
 508
 509         if (quota != USEC_INFINITY)
 510                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 511                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 512         else
 513                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 514
 515         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 516
 517         if (r < 0)
 518                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 519                               "Failed to set cpu.max: %m");
 520 }
 521
 522 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 523         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 524         int r;
 525
 526         xsprintf(buf, "%" PRIu64 "\n", shares);
 527         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 528         if (r < 0)
 529                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 530                               "Failed to set cpu.shares: %m");
 531
 532         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 533         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 534         if (r < 0)
 535                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 536                               "Failed to set cpu.cfs_period_us: %m");
 537
 538         if (quota != USEC_INFINITY) {
 539                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 540                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 541         } else
 542                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 543         if (r < 0)
 544                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 545                               "Failed to set cpu.cfs_quota_us: %m");
 546 }
 547
 548 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 549         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 550                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 551 }
 552
 553 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 554         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 555                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 556 }
 557
 558 static bool cgroup_context_has_io_config(CGroupContext *c) {
 559         return c->io_accounting ||
 560                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 561                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 562                 c->io_device_weights ||
 563                 c->io_device_limits;
 564 }
 565
 566 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 567         return c->blockio_accounting ||
 568                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 569                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 570                 c->blockio_device_weights ||
 571                 c->blockio_device_bandwidths;
 572 }
 573
 574 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 575         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 576             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 577                 return c->startup_io_weight;
 578         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 579                 return c->io_weight;
 580         else
 581                 return CGROUP_WEIGHT_DEFAULT;
 582 }
 583
 584 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 585         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 586             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 587                 return c->startup_blockio_weight;
 588         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 589                 return c->blockio_weight;
 590         else
 591                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 592 }
 593
 594 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 595         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 596                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 597 }
 598
 599 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 600         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 601                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 602 }
 603
 604 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 605         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 606         dev_t dev;
 607         int r;
 608
 609         r = lookup_block_device(dev_path, &dev);
 610         if (r < 0)
 611                 return;
 612
 613         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 614         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 615         if (r < 0)
 616                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 617                               "Failed to set io.weight: %m");
 618 }
 619
 620 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 621         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 622         dev_t dev;
 623         int r;
 624
 625         r = lookup_block_device(dev_path, &dev);
 626         if (r < 0)
 627                 return;
 628
 629         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 630         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 631         if (r < 0)
 632                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 633                               "Failed to set blkio.weight_device: %m");
 634 }
 635
 636 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 637         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 638         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 639         CGroupIOLimitType type;
 640         dev_t dev;
 641         unsigned n = 0;
 642         int r;
 643
 644         r = lookup_block_device(dev_path, &dev);
 645         if (r < 0)
 646                 return 0;
 647
 648         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 649                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 650                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 651                         n++;
 652                 } else {
 653                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 654                 }
 655         }
 656
 657         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 658                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 659                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 660         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 661         if (r < 0)
 662                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 663                               "Failed to set io.max: %m");
 664         return n;
 665 }
 666
 667 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 668         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 669         dev_t dev;
 670         unsigned n = 0;
 671         int r;
 672
 673         r = lookup_block_device(dev_path, &dev);
 674         if (r < 0)
 675                 return 0;
 676
 677         if (rbps != CGROUP_LIMIT_MAX)
 678                 n++;
 679         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 680         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 681         if (r < 0)
 682                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 683                               "Failed to set blkio.throttle.read_bps_device: %m");
 684
 685         if (wbps != CGROUP_LIMIT_MAX)
 686                 n++;
 687         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 688         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 689         if (r < 0)
 690                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 691                               "Failed to set blkio.throttle.write_bps_device: %m");
 692
 693         return n;
 694 }
 695
 696 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 697         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 698 }
 699
 700 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 701         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 702         int r;
 703
 704         if (v != CGROUP_LIMIT_MAX)
 705                 xsprintf(buf, "%" PRIu64 "\n", v);
 706
 707         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 708         if (r < 0)
 709                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 710                               "Failed to set %s: %m", file);
 711 }
 712
 713 static void cgroup_apply_firewall(Unit *u) {
 714         assert(u);
 715
 716         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
 717
 718         if (bpf_firewall_compile(u) < 0)
 719                 return;
 720
 721         (void) bpf_firewall_install(u);
 722 }
 723
 724 static void cgroup_context_apply(
 725                 Unit *u,                /* unit whose cgroup settings are written out */
 726                 CGroupMask apply_mask,  /* controllers whose attributes shall be written */
 727                 bool apply_bpf,         /* whether to also call cgroup_apply_firewall() at the end */
 728                 ManagerState state) {   /* handed on to the weight/shares lookups */
 729         /* Writes the unit's cgroup context out to the selected controllers. Failed attribute writes are only logged. */
 730         const char *path;
 731         CGroupContext *c;
 732         bool is_root;
 733         int r;
 734
 735         assert(u);
 736
 737         /* Nothing to do? Exit early! */
 738         if (apply_mask == 0 && !apply_bpf)
 739                 return;
 740
 741         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
 742         is_root = unit_has_root_cgroup(u);
 743
 744         assert_se(c = unit_get_cgroup_context(u));
 745         assert_se(path = u->cgroup_path);
 746
 747         if (is_root) /* Make sure we don't try to display messages with an empty path. */
 748                 path = "/";
 749
 750         /* We generally ignore errors caused by read-only mounted
 751          * cgroup trees (assuming we are running in a container then),
 752          * and missing cgroups, i.e. EROFS and ENOENT. */
 753
 754         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 755                 bool has_weight, has_shares;
 756
 757                 has_weight = cgroup_context_has_cpu_weight(c);
 758                 has_shares = cgroup_context_has_cpu_shares(c);
 759
 760                 if (cg_all_unified() > 0) {
 761                         uint64_t weight;
 762
 763                         if (has_weight)
 764                                 weight = cgroup_context_cpu_weight(c, state);
 765                         else if (has_shares) {
 766                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 767
 768                                 weight = cgroup_cpu_shares_to_weight(shares);
 769
 770                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 771                                                   shares, weight, path);
 772                         } else
 773                                 weight = CGROUP_WEIGHT_DEFAULT;
 774
 775                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 776                 } else {
 777                         uint64_t shares;
 778
 779                         if (has_weight) {
 780                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 781
 782                                 shares = cgroup_cpu_weight_to_shares(weight);
 783
 784                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 785                                                   weight, shares, path);
 786                         } else if (has_shares)
 787                                 shares = cgroup_context_cpu_shares(c, state);
 788                         else
 789                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 790
 791                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 792                 }
 793         }
 794
 795         if (apply_mask & CGROUP_MASK_IO) {
 796                 bool has_io = cgroup_context_has_io_config(c);
 797                 bool has_blockio = cgroup_context_has_blockio_config(c);
 798
 799                 if (!is_root) {
 800                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 801                         uint64_t weight;
 802
 803                         if (has_io)
 804                                 weight = cgroup_context_io_weight(c, state);
 805                         else if (has_blockio) {
 806                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 807
 808                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 809
 810                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 811                                                   blkio_weight, weight);
 812                         } else
 813                                 weight = CGROUP_WEIGHT_DEFAULT;
 814
 815                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 816                         r = cg_set_attribute("io", path, "io.weight", buf);
 817                         if (r < 0)
 818                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 819                                               "Failed to set io.weight: %m");
 820
 821                         if (has_io) {
 822                                 CGroupIODeviceWeight *w;
 823
 824                                 /* FIXME: no way to reset this list */
 825                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 826                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 827                         } else if (has_blockio) {
 828                                 CGroupBlockIODeviceWeight *w;
 829
 830                                 /* FIXME: no way to reset this list */
 831                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 832                                         weight = cgroup_weight_blkio_to_io(w->weight);
 833
 834                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 835                                                           w->weight, weight, w->path);
 836
 837                                         cgroup_apply_io_device_weight(u, w->path, weight);
 838                                 }
 839                         }
 840                 }
 841
 842                 /* Apply limits and free ones without config. */
 843                 if (has_io) {
 844                         CGroupIODeviceLimit *l, *next;
 845
 846                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 847                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 848                                         cgroup_context_free_io_device_limit(c, l);
 849                         }
 850                 } else if (has_blockio) {
 851                         CGroupBlockIODeviceBandwidth *b, *next;
 852
 853                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 854                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 855                                 CGroupIOLimitType type;
 856
 857                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 858                                         limits[type] = cgroup_io_limit_defaults[type];
 859
 860                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 861                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 862
 863                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 864                                                   b->rbps, b->wbps, b->path);
 865
 866                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 867                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 868                         }
 869                 }
 870         }
 871
 872         if (apply_mask & CGROUP_MASK_BLKIO) {
 873                 bool has_io = cgroup_context_has_io_config(c);
 874                 bool has_blockio = cgroup_context_has_blockio_config(c);
 875
 876                 if (!is_root) {
 877                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 878                         uint64_t weight;
 879
 880                         if (has_io) {
 881                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 882
 883                                 weight = cgroup_weight_io_to_blkio(io_weight);
 884
 885                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 886                                                   io_weight, weight);
 887                         } else if (has_blockio)
 888                                 weight = cgroup_context_blkio_weight(c, state);
 889                         else
 890                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 891
 892                         xsprintf(buf, "%" PRIu64 "\n", weight);
 893                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 894                         if (r < 0)
 895                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 896                                               "Failed to set blkio.weight: %m");
 897
 898                         if (has_io) {
 899                                 CGroupIODeviceWeight *w;
 900
 901                                 /* FIXME: no way to reset this list */
 902                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 903                                         weight = cgroup_weight_io_to_blkio(w->weight);
 904
 905                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 906                                                           w->weight, weight, w->path);
 907
 908                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 909                                 }
 910                         } else if (has_blockio) {
 911                                 CGroupBlockIODeviceWeight *w;
 912
 913                                 /* FIXME: no way to reset this list */
 914                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 915                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 916                         }
 917                 }
 918
 919                 /* Apply limits and free ones without config. */
 920                 if (has_io) {
 921                         CGroupIODeviceLimit *l, *next;
 922
 923                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 924                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 925                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 926
 927                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 928                                         cgroup_context_free_io_device_limit(c, l);
 929                         }
 930                 } else if (has_blockio) {
 931                         CGroupBlockIODeviceBandwidth *b, *next;
 932
 933                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 934                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 935                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 936                 }
 937         }
 938
 939         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 940                 if (cg_all_unified() > 0) {
 941                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 942
 943                         if (cgroup_context_has_unified_memory_config(c)) {
 944                                 max = c->memory_max;
 945                                 swap_max = c->memory_swap_max;
 946                         } else {
 947                                 max = c->memory_limit;
 948
 949                                 if (max != CGROUP_LIMIT_MAX)
 950                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 951                         }
 952
 953                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 954                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 955                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 956                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 957                 } else {
 958                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 959                         uint64_t val;
 960
 961                         if (cgroup_context_has_unified_memory_config(c)) {
 962                                 val = c->memory_max;
 963                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val); /* val is unsigned, hence PRIu64 */
 964                         } else
 965                                 val = c->memory_limit;
 966
 967                         if (val == CGROUP_LIMIT_MAX)
 968                                 strncpy(buf, "-1\n", sizeof(buf)); /* the literal is shorter than buf, hence always NUL-terminated */
 969                         else
 970                                 xsprintf(buf, "%" PRIu64 "\n", val);
 971
 972                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 973                         if (r < 0)
 974                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 975                                               "Failed to set memory.limit_in_bytes: %m");
 976                 }
 977         }
 978
 979         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 980                 CGroupDeviceAllow *a;
 981
 982                 /* Changing the devices list of a populated cgroup
 983                  * might result in EINVAL, hence ignore EINVAL
 984                  * here. */
 985
 986                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 987                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 988                 else
 989                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 990                 if (r < 0)
 991                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 992                                       "Failed to reset devices.list: %m");
 993
 994                 if (c->device_policy == CGROUP_CLOSED ||
 995                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 996                         static const char auto_devices[] =
 997                                 "/dev/null\0" "rwm\0"
 998                                 "/dev/zero\0" "rwm\0"
 999                                 "/dev/full\0" "rwm\0"
1000                                 "/dev/random\0" "rwm\0"
1001                                 "/dev/urandom\0" "rwm\0"
1002                                 "/dev/tty\0" "rwm\0"
1003                                 "/dev/ptmx\0" "rwm\0"
1004                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1005                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
1006                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
1007
1008                         const char *x, *y;
1009
1010                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
1011                                 whitelist_device(path, x, y);
1012
1013                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1014                         whitelist_major(path, "pts", 'c', "rw");
1015                 }
1016
1017                 LIST_FOREACH(device_allow, a, c->device_allow) {
1018                         char acc[4], *val;
1019                         unsigned k = 0;
1020
1021                         if (a->r)
1022                                 acc[k++] = 'r';
1023                         if (a->w)
1024                                 acc[k++] = 'w';
1025                         if (a->m)
1026                                 acc[k++] = 'm';
1027
1028                         if (k == 0)
1029                                 continue;
1030
1031                         acc[k++] = 0;
1032
1033                         if (path_startswith(a->path, "/dev/"))
1034                                 whitelist_device(path, a->path, acc);
1035                         else if ((val = startswith(a->path, "block-")))
1036                                 whitelist_major(path, val, 'b', acc);
1037                         else if ((val = startswith(a->path, "char-")))
1038                                 whitelist_major(path, val, 'c', acc);
1039                         else
1040                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1041                 }
1042         }
1043
1044         if (apply_mask & CGROUP_MASK_PIDS) {
1045
1046                 if (is_root) {
1047                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1048                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1049                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1050                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1051                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1052                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1053                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1054                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1055                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1056                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1057                          * which is desirable so that there's an official way to release control of the sysctl from
1058                          * systemd: set the limit to unbounded and reload. */
1059
1060                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1061                                 u->manager->sysctl_pid_max_changed = true;
1062                                 r = procfs_tasks_set_limit(c->tasks_max);
1063                         } else if (u->manager->sysctl_pid_max_changed)
1064                                 r = procfs_tasks_set_limit(TASKS_MAX);
1065                         else
1066                                 r = 0;
1067
1068                         if (r < 0)
1069                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1070                                               "Failed to write to tasks limit sysctls: %m");
1071
1072                 } else {
1073                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1074                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1075
1076                                 xsprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1077                                 r = cg_set_attribute("pids", path, "pids.max", buf);
1078                         } else
1079                                 r = cg_set_attribute("pids", path, "pids.max", "max");
1080                         if (r < 0)
1081                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1082                                               "Failed to set pids.max: %m");
1083                 }
1084         }
1085
1086         if (apply_bpf)
1087                 cgroup_apply_firewall(u);
1088 }
1089
1090 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1091         bool wants_cpu, wants_io, wants_memory, wants_devices, wants_pids;
1092         CGroupMask needed = 0;
1093
1094         /* Work out, per controller family, whether any setting in the context calls for it. */
1095
1096         wants_cpu =
1097                 c->cpu_accounting ||
1098                 cgroup_context_has_cpu_weight(c) ||
1099                 cgroup_context_has_cpu_shares(c) ||
1100                 c->cpu_quota_per_sec_usec != USEC_INFINITY;
1101         wants_io =
1102                 cgroup_context_has_io_config(c) ||
1103                 cgroup_context_has_blockio_config(c);
1104         wants_memory =
1105                 c->memory_accounting ||
1106                 c->memory_limit != CGROUP_LIMIT_MAX ||
1107                 cgroup_context_has_unified_memory_config(c);
1108         wants_devices = c->device_allow || c->device_policy != CGROUP_AUTO;
1109         wants_pids = c->tasks_accounting || c->tasks_max != CGROUP_LIMIT_MAX;
1110
1111         /* CPU and IO settings each switch on a pair of mask bits, the others a single one. */
1112         needed |= wants_cpu ? (CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU) : 0;
1113         needed |= wants_io ? (CGROUP_MASK_IO | CGROUP_MASK_BLKIO) : 0;
1114         needed |= wants_memory ? CGROUP_MASK_MEMORY : 0;
1115         needed |= wants_devices ? CGROUP_MASK_DEVICES : 0;
1116         needed |= wants_pids ? CGROUP_MASK_PIDS : 0;
1117         return needed;
1118 }
1119
1120 CGroupMask unit_get_own_mask(Unit *u) {
1121         CGroupContext *ctx = unit_get_cgroup_context(u);
1122         CGroupMask own;
1123
1124         /* Controllers this unit requires on its own account: whatever its cgroup settings call for, merged
1125          * with its delegate mask. A unit without a cgroup context needs nothing. */
1126         if (!ctx)
1127                 return 0;
1128         own = cgroup_context_get_mask(ctx);
1129         return own | unit_get_delegate_mask(u);
1130 }
1131
1132 CGroupMask unit_get_delegate_mask(Unit *u) {
1133         CGroupContext *ctx;
1134         ExecContext *exec;
1135
1136         /* Yields the controllers configured for delegation, or 0 if the unit doesn't delegate. On the unified
1137          * hierarchy it is safe to delegate controllers to unprivileged services. Otherwise (i.e. whenever
1138          * cg_all_unified() is not positive) nothing is delegated if the process we fork into is known to drop
1139          * privileges, since it shouldn't get access to the controllers then. */
1140
1141         if (!unit_cgroup_delegate(u))
1142                 return 0;
1143
1144         if (cg_all_unified() <= 0) {
1145                 exec = unit_get_exec_context(u);
1146
1147                 if (exec && !exec_context_maintains_privileges(exec))
1148                         return 0;
1149         }
1150
1151         assert_se(ctx = unit_get_cgroup_context(u));
1152         return ctx->delegate_controllers;
1153 }
1154
1155 CGroupMask unit_get_members_mask(Unit *u) {
1156         Unit *child;
1157         Iterator it;
1158         void *value;
1159
1160         assert(u);
1161
1162         /* Merged mask of the controllers required by all children of this unit. The result is cached in the
1163          * unit, and the cached value is handed out for as long as it is flagged valid. */
1164
1165         if (u->cgroup_members_mask_valid)
1166                 return u->cgroup_members_mask;
1167
1168         u->cgroup_members_mask = 0;
1169
1170         /* Only slices are scanned: go through our UNIT_BEFORE dependencies and pick those naming us as slice. */
1171         if (u->type == UNIT_SLICE)
1172                 HASHMAP_FOREACH_KEY(value, child, u->dependencies[UNIT_BEFORE], it) {
1173
1174                         if (child == u || UNIT_DEREF(child->slice) != u)
1175                                 continue;
1176
1177                         /* Note that this ends up calling ourselves again, for the child's own children. */
1178                         u->cgroup_members_mask |= unit_get_subtree_mask(child);
1179                 }
1180
1181         u->cgroup_members_mask_valid = true;
1182
1183         return u->cgroup_members_mask;
1184 }
1185
1186 CGroupMask unit_get_siblings_mask(Unit *u) {
1187         Unit *parent;
1188
1189         assert(u);
1190
1191         /* What the siblings require is the members mask of the parent slice. Lacking a parent slice we are
1192          * the top-level slice, in which case our own subtree mask is returned instead. */
1193         if (!UNIT_ISSET(u->slice))
1194                 return unit_get_subtree_mask(u);
1195         parent = UNIT_DEREF(u->slice);
1196         return unit_get_members_mask(parent);
1197 }
1198
1199 CGroupMask unit_get_subtree_mask(Unit *u) {
1200         CGroupMask subtree = unit_get_own_mask(u);
1201
1202         /* The subtree mask covers the group itself (taken above) plus everything its children need. */
1203         subtree |= unit_get_members_mask(u);
1204         return subtree;
1205 }
1206
1207 CGroupMask unit_get_target_mask(Unit *u) {
1208         CGroupMask wanted;
1209
1210         /* Computes the set of controllers to enable for this unit's cgroup: its own needs, its children's
1211          * needs and its siblings' needs, restricted to what the manager lists as supported. Primarily useful
1212          * on the legacy hierarchy, where a cgroup has to be duplicated in each hierarchy enabled for it. */
1213
1214         wanted = unit_get_own_mask(u);
1215         wanted |= unit_get_members_mask(u);
1216         wanted |= unit_get_siblings_mask(u);
1217
1218         /* Never ask for a controller that isn't supported. */
1219         wanted &= u->manager->cgroup_supported;
1220         return wanted;
1221 }
1222
1223 CGroupMask unit_get_enable_mask(Unit *u) {
1224         CGroupMask for_children;
1225
1226         /* Computes the set of controllers to enable for the children of this unit's cgroup, i.e. the members
1227          * mask restricted to what the manager lists as supported. Primarily useful on the unified hierarchy,
1228          * where each cgroup controls which controllers are enabled for its children. */
1229
1230         for_children = unit_get_members_mask(u);
1231
1232         /* Drop whatever isn't supported. */
1233         for_children &= u->manager->cgroup_supported;
1234         return for_children;
1235 }
1236
1237 bool unit_get_needs_bpf(Unit *u) {
1238         CGroupContext *ctx;
1239         Unit *ancestor;
1240
1241         assert(u);
1242
1243         /* Tells whether the unit needs BPF: either because it has IP accounting or an IP access list
1244          * configured itself, or because one of its parent slices has an IP access list defined, which then
1245          * applies to the unit, too. */
1246
1247         ctx = unit_get_cgroup_context(u);
1248         if (!ctx)
1249                 return false;
1250         if (ctx->ip_accounting || ctx->ip_address_allow || ctx->ip_address_deny)
1251                 return true;
1252
1253         ancestor = UNIT_DEREF(u->slice);
1254         while (ancestor) {
1255                 ctx = unit_get_cgroup_context(ancestor);
1256                 if (!ctx) /* the walk stops at the first parent lacking a cgroup context */
1257                         return false;
1258                 if (ctx->ip_address_allow || ctx->ip_address_deny)
1259                         return true;
1260                 ancestor = UNIT_DEREF(ancestor->slice);
1261         }
1262         return false;
1263 }
1264
1265 /* Starting at a unit, walk up through its containing slices and propagate
1266  * mask bits upward. A unit is also member of itself. */
1267 void unit_update_cgroup_members_masks(Unit *u) {
1268         CGroupMask subtree;
1269         bool grew_only;
1270         Unit *parent;
1271
1272         for (;;) {
1273                 assert(u);
1274
1275                 /* Work out the current subtree mask */
1276                 subtree = unit_get_subtree_mask(u);
1277
1278                 /* Same as what we recorded on the previous invocation?
1279                  * Then there's nothing to update, here or further up. */
1280                 if (u->cgroup_subtree_mask_valid && subtree == u->cgroup_subtree_mask)
1281                         return;
1282
1283                 /* True only if the recorded mask was valid, and bits
1284                  * got added to it while none got removed. */
1285                 grew_only =
1286                         u->cgroup_subtree_mask_valid &&
1287                         ((subtree & ~u->cgroup_subtree_mask) != 0) &&
1288                         ((~subtree & u->cgroup_subtree_mask) == 0);
1289
1290                 u->cgroup_subtree_mask = subtree;
1291                 u->cgroup_subtree_mask_valid = true;
1292
1293                 /* Without a parent slice there's nowhere to propagate to. */
1294                 if (!UNIT_ISSET(u->slice))
1295                         return;
1296
1297                 parent = UNIT_DEREF(u->slice);
1298
1299                 if (grew_only)
1300                         /* Bits were only added: merge the new mask into the parent's
1301                          * members mask (not caring whether that one was valid or not). */
1302                         parent->cgroup_members_mask |= subtree;
1303                 else
1304                         /* Bits were dropped (or we don't know): everything has to be
1305                          * recalculated, hence invalidate the parent's members mask. */
1306                         parent->cgroup_members_mask_valid = false;
1307
1308                 /* Carry on with the parent, so that the change also
1309                  * hits our grandparents. */
1310                 u = parent;
1311         }
1312 }
1313
1314 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1315         Unit *cur;
1316
1317         /* Looks for the closest unit - the specified one, or one of its parent slices - that has a realized
1318          * cgroup in which all controllers of the mask are available, and returns the path of that cgroup.
1319          * Returns NULL if no such unit is found. */
1320
1321         for (cur = u; cur; cur = UNIT_DEREF(cur->slice)) {
1322                 if (!cur->cgroup_path || !cur->cgroup_realized)
1323                         continue;
1324
1325                 if ((cur->cgroup_realized_mask & mask) == mask)
1326                         return cur->cgroup_path;
1327         }
1328         return NULL;
1329 }
1330
1331 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1332         return unit_get_realized_cgroup_path((Unit*) userdata, mask); /* userdata is the unit to start the lookup at */
1333 }
1334
1335 char *unit_default_cgroup_path(Unit *u) {
1336         _cleanup_free_ char *slice_path = NULL, *name = NULL;
1337
1338         assert(u);
1339
1340         /* Builds the default cgroup path of a unit as a newly allocated string: the manager's cgroup root,
1341          * optionally the path of the unit's slice, and finally the escaped unit name. NULL on failure. */
1342
1343         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1344                 return strdup(u->manager->cgroup_root);
1345
1346         /* A slice component is only inserted for units that sit in a slice other than the root slice. */
1347         if (UNIT_ISSET(u->slice) &&
1348             !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE) &&
1349             cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice_path) < 0)
1350                 return NULL;
1351
1352         name = cg_escape(u->id);
1353         if (!name)
1354                 return NULL;
1355
1356         if (!slice_path)
1357                 return strjoin(u->manager->cgroup_root, "/", name);
1358         return strjoin(u->manager->cgroup_root, "/", slice_path, "/", name);
1359 }
1360
1361 int unit_set_cgroup_path(Unit *u, const char *path) {
1362         _cleanup_free_ char *copy = NULL;
1363         int r;
1364
1365         assert(u);
1366
1367         /* Installs a private copy of the path (NULL stays NULL) as the unit's cgroup path. Returns 0 if the
1368          * unit has exactly this path already, 1 if the path was changed, -ENOMEM if the copy could not be
1369          * allocated, and otherwise the negative result of hashmap_put(). */
1370
1371         if (path) {
1372                 copy = strdup(path);
1373                 if (!copy)
1374                         return -ENOMEM;
1375         }
1376
1377         if (streq_ptr(u->cgroup_path, copy))
1378                 return 0;
1379
1380         /* Enter the new path into the manager's path-to-unit map before calling unit_release_cgroup(). */
1381         r = copy ? hashmap_put(u->manager->cgroup_unit, copy, u) : 0;
1382         if (r < 0)
1383                 return r;
1384
1385         unit_release_cgroup(u);
1386         u->cgroup_path = TAKE_PTR(copy);
1387         return 1;
1388 }
1389
1390 int unit_watch_cgroup(Unit *u) {
1391         _cleanup_free_ char *events_path = NULL;
1392         int r;
1393
1394         assert(u);
1395
1396         /* Installs an inotify watch on the "cgroup.events" attribute of the unit's cgroup and records the
1397          * watch descriptor in the manager, keyed to the unit. Returns 0 on success and whenever there is
1398          * nothing to watch, otherwise the result of the respective logging call. */
1399
1400         /* No cgroup path, or a watch is in place already? Then there's nothing to do. */
1401         if (!u->cgroup_path || u->cgroup_inotify_wd >= 0)
1402                 return 0;
1403
1404         /* Only applies to the unified hierarchy */
1405         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1406         if (r < 0)
1407                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1408         if (r == 0)
1409                 return 0;
1410
1411         /* Watching the root slice would be pointless. */
1412         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1413                 return 0;
1414
1415         /* Make sure the map from watch descriptors to units exists. */
1416         if (hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops) < 0)
1417                 return log_oom();
1418
1419         if (cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events_path) < 0)
1420                 return log_oom();
1421
1422         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events_path, IN_MODIFY);
1423         if (u->cgroup_inotify_wd < 0) {
1424                 /* If the directory is gone already there's nothing left to track, so this is not an error. */
1425                 if (errno == ENOENT)
1426                         return 0;
1427
1428                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1429         }
1430
1431         /* Remember which unit the watch descriptor belongs to. */
1432         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1433         if (r < 0)
1434                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1435
1436         return 0;
1437 }
1438
1439 int unit_pick_cgroup_path(Unit *u) {
1440         _cleanup_free_ char *candidate = NULL;
1441         int r;
1442
1443         assert(u);
1444
1445         /* Assigns the default cgroup path to a unit that has none yet. A unit that has a path already is
1446          * left untouched; a unit not passing UNIT_HAS_CGROUP_CONTEXT() is refused with -EINVAL. */
1447
1448         if (u->cgroup_path)
1449                 return 0;
1450         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1451                 return -EINVAL;
1452
1453         candidate = unit_default_cgroup_path(u);
1454         if (!candidate)
1455                 return log_oom();
1456         r = unit_set_cgroup_path(u, candidate);
1457         if (r >= 0)
1458                 return 0;
1459         if (r == -EEXIST)
1460                 return log_unit_error_errno(u, r, "Control group %s exists already.", candidate);
1461         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", candidate);
1462 }
1463
1464 static int unit_create_cgroup(
1465                 Unit *u,                    /* unit to create the cgroup for */
1466                 CGroupMask target_mask,     /* controllers to create the cgroup in; recorded as realized mask */
1467                 CGroupMask enable_mask,     /* controllers to enable on the cgroup; recorded as enabled mask */
1468                 bool needs_bpf) {           /* recorded as the unit's BPF state (on/off) */
1469         /* Creates and sets up the unit's cgroup. Only path selection and creation failures are returned. */
1470         CGroupContext *c;
1471         int r;
1472
1473         assert(u);
1474         /* Units without a cgroup context are skipped, which counts as success. */
1475         c = unit_get_cgroup_context(u);
1476         if (!c)
1477                 return 0;
1478
1479         /* Figure out our cgroup path */
1480         r = unit_pick_cgroup_path(u);
1481         if (r < 0)
1482                 return r;
1483
1484         /* First, create our own group */
1485         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1486         if (r < 0)
1487                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1488
1489         /* Start watching it */
1490         (void) unit_watch_cgroup(u);
1491
1492         /* Enable all controllers we need */
1493         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1494         if (r < 0)
1495                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1496
1497         /* Keep track that this is now realized */
1498         u->cgroup_realized = true;
1499         u->cgroup_realized_mask = target_mask;
1500         u->cgroup_enabled_mask = enable_mask;
1501         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1502
1503         if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1504
1505                 /* Then, possibly move things over, but not if
1506                  * subgroups may contain processes, which is the case
1507                  * for slice and delegation units. */
1508                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1509                 if (r < 0)
1510                         log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
1511         }
1512
1513         return 0;
1514 }
1515
1516 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1517         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1518         char *pp;
1519         int r;
1520
1521         assert(u);
1522
1523         if (MANAGER_IS_SYSTEM(u->manager))
1524                 return -EINVAL;
1525
1526         if (!u->manager->system_bus)
1527                 return -EIO;
1528
1529         if (!u->cgroup_path)
1530                 return -EINVAL;
1531
1532         /* Determine this unit's cgroup path relative to our cgroup root */
1533         pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1534         if (!pp)
1535                 return -EINVAL;
1536
1537         pp = strjoina("/", pp, suffix_path);
1538         path_kill_slashes(pp);
1539
1540         r = sd_bus_call_method(u->manager->system_bus,
1541                                "org.freedesktop.systemd1",
1542                                "/org/freedesktop/systemd1",
1543                                "org.freedesktop.systemd1.Manager",
1544                                "AttachProcessesToUnit",
1545                                &error, NULL,
1546                                "ssau",
1547                                NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1548         if (r < 0)
1549                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1550
1551         return 0;
1552 }
1553
1554 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1555         CGroupMask delegated_mask;
1556         const char *p;
1557         Iterator i;
1558         void *pidp;
1559         int r, q;
1560
1561         assert(u);
1562
1563         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1564                 return -EINVAL;
1565
1566         if (set_isempty(pids))
1567                 return 0;
1568
1569         r = unit_realize_cgroup(u);
1570         if (r < 0)
1571                 return r;
1572
1573         if (isempty(suffix_path))
1574                 p = u->cgroup_path;
1575         else
1576                 p = strjoina(u->cgroup_path, "/", suffix_path);
1577
1578         delegated_mask = unit_get_delegate_mask(u);
1579
1580         r = 0;
1581         SET_FOREACH(pidp, pids, i) {
1582                 pid_t pid = PTR_TO_PID(pidp);
1583                 CGroupController c;
1584
1585                 /* First, attach the PID to the main cgroup hierarchy */
1586                 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1587                 if (q < 0) {
1588                         log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1589
1590                         if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1591                                 int z;
1592
1593                                 /* If we are in a user instance, and we can't move the process ourselves due to
1594                                  * permission problems, let's ask the system instance about it instead. Since it's more
1595                                  * privileged it might be able to move the process across the leaves of a subtree who's
1596                                  * top node is not owned by us. */
1597
1598                                 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1599                                 if (z < 0)
1600                                         log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1601                                 else
1602                                         continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1603                         }
1604
1605                         if (r >= 0)
1606                                 r = q; /* Remember first error */
1607
1608                         continue;
1609                 }
1610
1611                 q = cg_all_unified();
1612                 if (q < 0)
1613                         return q;
1614                 if (q > 0)
1615                         continue;
1616
1617                 /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
1618                  * innermost realized one */
1619
1620                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1621                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1622                         const char *realized;
1623
1624                         if (!(u->manager->cgroup_supported & bit))
1625                                 continue;
1626
1627                         /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
1628                         if (delegated_mask & u->cgroup_realized_mask & bit) {
1629                                 q = cg_attach(cgroup_controller_to_string(c), p, pid);
1630                                 if (q >= 0)
1631                                         continue; /* Success! */
1632
1633                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
1634                                                      pid, p, cgroup_controller_to_string(c));
1635                         }
1636
1637                         /* So this controller is either not delegate or realized, or something else weird happened. In
1638                          * that case let's attach the PID at least to the closest cgroup up the tree that is
1639                          * realized. */
1640                         realized = unit_get_realized_cgroup_path(u, bit);
1641                         if (!realized)
1642                                 continue; /* Not even realized in the root slice? Then let's not bother */
1643
1644                         q = cg_attach(cgroup_controller_to_string(c), realized, pid);
1645                         if (q < 0)
1646                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
1647                                                      pid, realized, cgroup_controller_to_string(c));
1648                 }
1649         }
1650
1651         return r;
1652 }
1653
1654 static void cgroup_xattr_apply(Unit *u) {
1655         char ids[SD_ID128_STRING_MAX];
1656         int r;
1657
1658         assert(u);
1659
1660         if (!MANAGER_IS_SYSTEM(u->manager))
1661                 return;
1662
1663         if (sd_id128_is_null(u->invocation_id))
1664                 return;
1665
1666         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1667                          "trusted.invocation_id",
1668                          sd_id128_to_string(u->invocation_id, ids), 32,
1669                          0);
1670         if (r < 0)
1671                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1672 }
1673
1674 static bool unit_has_mask_realized(
1675                 Unit *u,
1676                 CGroupMask target_mask,
1677                 CGroupMask enable_mask,
1678                 bool needs_bpf) {
1679
1680         assert(u);
1681
1682         return u->cgroup_realized &&
1683                 u->cgroup_realized_mask == target_mask &&
1684                 u->cgroup_enabled_mask == enable_mask &&
1685                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1686                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1687 }
1688
1689 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1690         assert(u);
1691
1692         if (u->in_cgroup_realize_queue)
1693                 return;
1694
1695         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1696         u->in_cgroup_realize_queue = true;
1697 }
1698
1699 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1700         assert(u);
1701
1702         if (!u->in_cgroup_realize_queue)
1703                 return;
1704
1705         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1706         u->in_cgroup_realize_queue = false;
1707 }
1708
1709
1710 /* Check if necessary controllers and attributes for a unit are in place.
1711  *
1712  * If so, do nothing.
1713  * If not, create paths, move processes over, and set attributes.
1714  *
1715  * Returns 0 on success and < 0 on failure. */
1716 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1717         CGroupMask target_mask, enable_mask;
1718         bool needs_bpf, apply_bpf;
1719         int r;
1720
1721         assert(u);
1722
1723         unit_remove_from_cgroup_realize_queue(u);
1724
1725         target_mask = unit_get_target_mask(u);
1726         enable_mask = unit_get_enable_mask(u);
1727         needs_bpf = unit_get_needs_bpf(u);
1728
1729         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1730                 return 0;
1731
1732         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1733          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1734          * this will trickle down properly to cgroupfs. */
1735         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1736
1737         /* First, realize parents */
1738         if (UNIT_ISSET(u->slice)) {
1739                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1740                 if (r < 0)
1741                         return r;
1742         }
1743
1744         /* And then do the real work */
1745         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1746         if (r < 0)
1747                 return r;
1748
1749         /* Finally, apply the necessary attributes. */
1750         cgroup_context_apply(u, target_mask, apply_bpf, state);
1751         cgroup_xattr_apply(u);
1752
1753         return 0;
1754 }
1755
1756 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1757         ManagerState state;
1758         unsigned n = 0;
1759         Unit *i;
1760         int r;
1761
1762         assert(m);
1763
1764         state = manager_state(m);
1765
1766         while ((i = m->cgroup_realize_queue)) {
1767                 assert(i->in_cgroup_realize_queue);
1768
1769                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1770                         /* Maybe things changed, and the unit is not actually active anymore? */
1771                         unit_remove_from_cgroup_realize_queue(i);
1772                         continue;
1773                 }
1774
1775                 r = unit_realize_cgroup_now(i, state);
1776                 if (r < 0)
1777                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1778
1779                 n++;
1780         }
1781
1782         return n;
1783 }
1784
1785 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1786         Unit *slice;
1787
1788         /* This adds the siblings of the specified unit and the
1789          * siblings of all parent units to the cgroup queue. (But
1790          * neither the specified unit itself nor the parents.) */
1791
1792         while ((slice = UNIT_DEREF(u->slice))) {
1793                 Iterator i;
1794                 Unit *m;
1795                 void *v;
1796
1797                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1798                         if (m == u)
1799                                 continue;
1800
1801                         /* Skip units that have a dependency on the slice
1802                          * but aren't actually in it. */
1803                         if (UNIT_DEREF(m->slice) != slice)
1804                                 continue;
1805
1806                         /* No point in doing cgroup application for units
1807                          * without active processes. */
1808                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1809                                 continue;
1810
1811                         /* If the unit doesn't need any new controllers
1812                          * and has current ones realized, it doesn't need
1813                          * any changes. */
1814                         if (unit_has_mask_realized(m,
1815                                                    unit_get_target_mask(m),
1816                                                    unit_get_enable_mask(m),
1817                                                    unit_get_needs_bpf(m)))
1818                                 continue;
1819
1820                         unit_add_to_cgroup_realize_queue(m);
1821                 }
1822
1823                 u = slice;
1824         }
1825 }
1826
1827 int unit_realize_cgroup(Unit *u) {
1828         assert(u);
1829
1830         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1831                 return 0;
1832
1833         /* So, here's the deal: when realizing the cgroups for this
1834          * unit, we need to first create all parents, but there's more
1835          * actually: for the weight-based controllers we also need to
1836          * make sure that all our siblings (i.e. units that are in the
1837          * same slice as we are) have cgroups, too. Otherwise, things
1838          * would become very uneven as each of their processes would
1839          * get as much resources as all our group together. This call
1840          * will synchronously create the parent cgroups, but will
1841          * defer work on the siblings to the next event loop
1842          * iteration. */
1843
1844         /* Add all sibling slices to the cgroup queue. */
1845         unit_add_siblings_to_cgroup_realize_queue(u);
1846
1847         /* And realize this one now (and apply the values) */
1848         return unit_realize_cgroup_now(u, manager_state(u->manager));
1849 }
1850
1851 void unit_release_cgroup(Unit *u) {
1852         assert(u);
1853
1854         /* Forgets all cgroup details for this cgroup */
1855
1856         if (u->cgroup_path) {
1857                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1858                 u->cgroup_path = mfree(u->cgroup_path);
1859         }
1860
1861         if (u->cgroup_inotify_wd >= 0) {
1862                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1863                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1864
1865                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1866                 u->cgroup_inotify_wd = -1;
1867         }
1868 }
1869
1870 void unit_prune_cgroup(Unit *u) {
1871         int r;
1872         bool is_root_slice;
1873
1874         assert(u);
1875
1876         /* Removes the cgroup, if empty and possible, and stops watching it. */
1877
1878         if (!u->cgroup_path)
1879                 return;
1880
1881         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1882
1883         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1884
1885         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1886         if (r < 0) {
1887                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1888                 return;
1889         }
1890
1891         if (is_root_slice)
1892                 return;
1893
1894         unit_release_cgroup(u);
1895
1896         u->cgroup_realized = false;
1897         u->cgroup_realized_mask = 0;
1898         u->cgroup_enabled_mask = 0;
1899 }
1900
1901 int unit_search_main_pid(Unit *u, pid_t *ret) {
1902         _cleanup_fclose_ FILE *f = NULL;
1903         pid_t pid = 0, npid, mypid;
1904         int r;
1905
1906         assert(u);
1907         assert(ret);
1908
1909         if (!u->cgroup_path)
1910                 return -ENXIO;
1911
1912         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1913         if (r < 0)
1914                 return r;
1915
1916         mypid = getpid_cached();
1917         while (cg_read_pid(f, &npid) > 0)  {
1918                 pid_t ppid;
1919
1920                 if (npid == pid)
1921                         continue;
1922
1923                 /* Ignore processes that aren't our kids */
1924                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1925                         continue;
1926
1927                 if (pid != 0)
1928                         /* Dang, there's more than one daemonized PID
1929                         in this group, so we don't know what process
1930                         is the main process. */
1931
1932                         return -ENODATA;
1933
1934                 pid = npid;
1935         }
1936
1937         *ret = pid;
1938         return 0;
1939 }
1940
1941 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1942         _cleanup_closedir_ DIR *d = NULL;
1943         _cleanup_fclose_ FILE *f = NULL;
1944         int ret = 0, r;
1945
1946         assert(u);
1947         assert(path);
1948
1949         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1950         if (r < 0)
1951                 ret = r;
1952         else {
1953                 pid_t pid;
1954
1955                 while ((r = cg_read_pid(f, &pid)) > 0) {
1956                         r = unit_watch_pid(u, pid);
1957                         if (r < 0 && ret >= 0)
1958                                 ret = r;
1959                 }
1960
1961                 if (r < 0 && ret >= 0)
1962                         ret = r;
1963         }
1964
1965         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1966         if (r < 0) {
1967                 if (ret >= 0)
1968                         ret = r;
1969         } else {
1970                 char *fn;
1971
1972                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1973                         _cleanup_free_ char *p = NULL;
1974
1975                         p = strjoin(path, "/", fn);
1976                         free(fn);
1977
1978                         if (!p)
1979                                 return -ENOMEM;
1980
1981                         r = unit_watch_pids_in_path(u, p);
1982                         if (r < 0 && ret >= 0)
1983                                 ret = r;
1984                 }
1985
1986                 if (r < 0 && ret >= 0)
1987                         ret = r;
1988         }
1989
1990         return ret;
1991 }
1992
1993 int unit_synthesize_cgroup_empty_event(Unit *u) {
1994         int r;
1995
1996         assert(u);
1997
1998         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1999          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
2000          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2001
2002         if (!u->cgroup_path)
2003                 return -ENOENT;
2004
2005         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2006         if (r < 0)
2007                 return r;
2008         if (r > 0) /* On unified we have reliable notifications, and don't need this */
2009                 return 0;
2010
2011         if (!set_isempty(u->pids))
2012                 return 0;
2013
2014         unit_add_to_cgroup_empty_queue(u);
2015         return 0;
2016 }
2017
2018 int unit_watch_all_pids(Unit *u) {
2019         int r;
2020
2021         assert(u);
2022
2023         /* Adds all PIDs from our cgroup to the set of PIDs we
2024          * watch. This is a fallback logic for cases where we do not
2025          * get reliable cgroup empty notifications: we try to use
2026          * SIGCHLD as replacement. */
2027
2028         if (!u->cgroup_path)
2029                 return -ENOENT;
2030
2031         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2032         if (r < 0)
2033                 return r;
2034         if (r > 0) /* On unified we can use proper notifications */
2035                 return 0;
2036
2037         return unit_watch_pids_in_path(u, u->cgroup_path);
2038 }
2039
2040 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2041         Manager *m = userdata;
2042         Unit *u;
2043         int r;
2044
2045         assert(s);
2046         assert(m);
2047
2048         u = m->cgroup_empty_queue;
2049         if (!u)
2050                 return 0;
2051
2052         assert(u->in_cgroup_empty_queue);
2053         u->in_cgroup_empty_queue = false;
2054         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2055
2056         if (m->cgroup_empty_queue) {
2057                 /* More stuff queued, let's make sure we remain enabled */
2058                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2059                 if (r < 0)
2060                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
2061         }
2062
2063         unit_add_to_gc_queue(u);
2064
2065         if (UNIT_VTABLE(u)->notify_cgroup_empty)
2066                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2067
2068         return 0;
2069 }
2070
2071 void unit_add_to_cgroup_empty_queue(Unit *u) {
2072         int r;
2073
2074         assert(u);
2075
2076         /* Note that there are four different ways how cgroup empty events reach us:
2077          *
2078          * 1. On the unified hierarchy we get an inotify event on the cgroup
2079          *
2080          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2081          *
2082          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2083          *
2084          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2085          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2086          *
2087          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2088          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2089          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2090          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2091          * case for scope units). */
2092
2093         if (u->in_cgroup_empty_queue)
2094                 return;
2095
2096         /* Let's verify that the cgroup is really empty */
2097         if (!u->cgroup_path)
2098                 return;
2099         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2100         if (r < 0) {
2101                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2102                 return;
2103         }
2104         if (r == 0)
2105                 return;
2106
2107         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2108         u->in_cgroup_empty_queue = true;
2109
2110         /* Trigger the defer event */
2111         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2112         if (r < 0)
2113                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2114 }
2115
2116 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2117         Manager *m = userdata;
2118
2119         assert(s);
2120         assert(fd >= 0);
2121         assert(m);
2122
2123         for (;;) {
2124                 union inotify_event_buffer buffer;
2125                 struct inotify_event *e;
2126                 ssize_t l;
2127
2128                 l = read(fd, &buffer, sizeof(buffer));
2129                 if (l < 0) {
2130                         if (IN_SET(errno, EINTR, EAGAIN))
2131                                 return 0;
2132
2133                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
2134                 }
2135
2136                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2137                         Unit *u;
2138
2139                         if (e->wd < 0)
2140                                 /* Queue overflow has no watch descriptor */
2141                                 continue;
2142
2143                         if (e->mask & IN_IGNORED)
2144                                 /* The watch was just removed */
2145                                 continue;
2146
2147                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2148                         if (!u) /* Not that inotify might deliver
2149                                  * events for a watch even after it
2150                                  * was removed, because it was queued
2151                                  * before the removal. Let's ignore
2152                                  * this here safely. */
2153                                 continue;
2154
2155                         unit_add_to_cgroup_empty_queue(u);
2156                 }
2157         }
2158 }
2159 #endif // 0
2160
2161 int manager_setup_cgroup(Manager *m) {
2162         _cleanup_free_ char *path = NULL;
2163         const char *scope_path;
2164         CGroupController c;
2165         int r, all_unified;
2166 #if 0 /// UNNEEDED by elogind
2167         char *e;
2168 #endif // 0
2169
2170         assert(m);
2171
2172         /* 1. Determine hierarchy */
2173         m->cgroup_root = mfree(m->cgroup_root);
2174 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2175         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2176 #else
2177         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2178 #endif // 0
2179         if (r < 0)
2180                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2181
2182 #if 0 /// elogind does not support systemd scopes and slices
2183         /* Chop off the init scope, if we are already located in it */
2184         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2185
2186         /* LEGACY: Also chop off the system slice if we are in
2187          * it. This is to support live upgrades from older systemd
2188          * versions where PID 1 was moved there. Also see
2189          * cg_get_root_path(). */
2190         if (!e && MANAGER_IS_SYSTEM(m)) {
2191                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2192                 if (!e)
2193                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2194         }
2195         if (e)
2196                 *e = 0;
2197 #endif // 0
2198
2199         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2200                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2201         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2202          * easily prepend it everywhere. */
2203         delete_trailing_chars(m->cgroup_root, "/");
2204
2205         /* 2. Show data */
2206         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2207         if (r < 0)
2208                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2209
2210         r = cg_unified_flush();
2211         if (r < 0)
2212                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2213
2214         all_unified = cg_all_unified();
2215         if (all_unified < 0)
2216                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2217         if (all_unified > 0)
2218                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2219         else {
2220                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2221                 if (r < 0)
2222                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2223                 if (r > 0)
2224                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2225                 else
2226                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2227         }
2228
2229 #if 0 /// elogind is not init, and does not install the agent here.
2230         /* 3. Allocate cgroup empty defer event source */
2231         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2232         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2233         if (r < 0)
2234                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2235
2236         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2237         if (r < 0)
2238                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2239
2240         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2241         if (r < 0)
2242                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2243
2244         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2245
2246         /* 4. Install notifier inotify object, or agent */
2247         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2248
2249                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2250
2251                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2252                 safe_close(m->cgroup_inotify_fd);
2253
2254                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2255                 if (m->cgroup_inotify_fd < 0)
2256                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2257
2258                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2259                 if (r < 0)
2260                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2261
2262                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2263                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2264                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2265                 if (r < 0)
2266                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2267
2268                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2269
2270         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2271
2272                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2273                  * since it does not generate events when control groups with children run empty. */
2274
2275                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2276                 if (r < 0)
2277                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2278                 else if (r > 0)
2279                         log_debug("Installed release agent.");
2280                 else if (r == 0)
2281                         log_debug("Release agent already installed.");
2282         }
2283
2284         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2285         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2286         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2287         if (r >= 0) {
2288                 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2289                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2290                 if (r < 0)
2291                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2292 #else
2293         /* Note:
2294                 * This method is in core, and normally called by systemd
2295                 * being init. As elogind is never init, we can not install
2296                 * our agent here. We do so when mounting our cgroup file
2297                 * system, so only if elogind is its own tiny controller.
2298                 * Further, elogind is not meant to run in systemd init scope. */
2299         if (MANAGER_IS_SYSTEM(m))
2300                 // we are our own cgroup controller
2301                 scope_path = strjoina("");
2302         else if (streq(m->cgroup_root, "/elogind"))
2303                 // root already is our cgroup
2304                 scope_path = strjoina(m->cgroup_root);
2305         else
2306                 // we have to create our own group
2307                 scope_path = strjoina(m->cgroup_root, "/elogind");
2308         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2309 #endif // 0
2310         log_debug_elogind("Created control group \"%s\"", scope_path);
2311
2312                 /* 6. And pin it, so that it cannot be unmounted */
2313                 safe_close(m->pin_cgroupfs_fd);
2314                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2315                 if (m->pin_cgroupfs_fd < 0)
2316                         return log_error_errno(errno, "Failed to open pin file: %m");
2317
2318 #if 0 /// this is from the cgroup migration above that elogind does not need.
2319         } else if (r < 0 && !m->test_run_flags)
2320                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2321 #endif // 0
2322
2323         /* 7. Always enable hierarchical support if it exists... */
2324         if (!all_unified && m->test_run_flags == 0)
2325                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2326
2327         /* 8. Figure out which controllers are supported, and log about it */
2328         r = cg_mask_supported(&m->cgroup_supported);
2329         if (r < 0)
2330                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2331         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2332                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2333
2334         return 0;
2335 }
2336
2337 void manager_shutdown_cgroup(Manager *m, bool delete) {
2338         assert(m);
2339
2340 #if 0 /// elogind is not init
2341         /* We can't really delete the group, since we are in it. But
2342          * let's trim it. */
2343         if (delete && m->cgroup_root)
2344                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2345
2346         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2347
2348         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2349
2350         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2351         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2352 #endif // 0
2353
2354         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2355
2356         m->cgroup_root = mfree(m->cgroup_root);
2357 }
2358
2359 #if 0 /// UNNEEDED by elogind
2360 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2361         char *p;
2362         Unit *u;
2363
2364         assert(m);
2365         assert(cgroup);
2366
2367         u = hashmap_get(m->cgroup_unit, cgroup);
2368         if (u)
2369                 return u;
2370
2371         p = strdupa(cgroup);
2372         for (;;) {
2373                 char *e;
2374
2375                 e = strrchr(p, '/');
2376                 if (!e || e == p)
2377                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2378
2379                 *e = 0;
2380
2381                 u = hashmap_get(m->cgroup_unit, p);
2382                 if (u)
2383                         return u;
2384         }
2385 }
2386
2387 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2388         _cleanup_free_ char *cgroup = NULL;
2389
2390         assert(m);
2391
2392         if (!pid_is_valid(pid))
2393                 return NULL;
2394
2395         if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2396                 return NULL;
2397
2398         return manager_get_unit_by_cgroup(m, cgroup);
2399 }
2400
2401 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2402         Unit *u, **array;
2403
2404         assert(m);
2405
2406         /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
2407          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2408          * relevant one as children of the process will be assigned to that one, too, before all else. */
2409
2410         if (!pid_is_valid(pid))
2411                 return NULL;
2412
2413         if (pid == getpid_cached())
2414                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2415
2416         u = manager_get_unit_by_pid_cgroup(m, pid);
2417         if (u)
2418                 return u;
2419
2420         u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2421         if (u)
2422                 return u;
2423
2424         array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2425         if (array)
2426                 return array[0];
2427
2428         return NULL;
2429 }
2430 #endif // 0
2431
2432 #if 0 /// elogind must substitute this with its own variant
2433 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2434         Unit *u;
2435
2436         assert(m);
2437         assert(cgroup);
2438
2439         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2440          * or from the --system instance */
2441
2442         log_debug("Got cgroup empty notification for: %s", cgroup);
2443
2444         u = manager_get_unit_by_cgroup(m, cgroup);
2445         if (!u)
2446                 return 0;
2447
2448         unit_add_to_cgroup_empty_queue(u);
2449         return 1;
2450 }
2451 #else
2452 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2453         Session *s;
2454
2455         assert(m);
2456         assert(cgroup);
2457
2458         log_debug("Got cgroup empty notification for: %s", cgroup);
2459
2460         s = hashmap_get(m->sessions, cgroup);
2461
2462         if (s) {
2463                 session_finalize(s);
2464                 session_free(s);
2465         } else
2466                 log_warning("Session not found: %s", cgroup);
2467
2468         return 0;
2469 }
2470 #endif // 0
2471 #if 0 /// UNNEEDED by elogind
2472 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2473         _cleanup_free_ char *v = NULL;
2474         int r;
2475
2476         assert(u);
2477         assert(ret);
2478
2479         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2480                 return -ENODATA;
2481
2482         if (!u->cgroup_path)
2483                 return -ENODATA;
2484
2485         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2486         if (unit_has_root_cgroup(u))
2487                 return procfs_memory_get_current(ret);
2488
2489         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2490                 return -ENODATA;
2491
2492         r = cg_all_unified();
2493         if (r < 0)
2494                 return r;
2495         if (r > 0)
2496                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2497         else
2498                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2499         if (r == -ENOENT)
2500                 return -ENODATA;
2501         if (r < 0)
2502                 return r;
2503
2504         return safe_atou64(v, ret);
2505 }
2506
2507 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2508         _cleanup_free_ char *v = NULL;
2509         int r;
2510
2511         assert(u);
2512         assert(ret);
2513
2514         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2515                 return -ENODATA;
2516
2517         if (!u->cgroup_path)
2518                 return -ENODATA;
2519
2520         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2521         if (unit_has_root_cgroup(u))
2522                 return procfs_tasks_get_current(ret);
2523
2524         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2525                 return -ENODATA;
2526
2527         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2528         if (r == -ENOENT)
2529                 return -ENODATA;
2530         if (r < 0)
2531                 return r;
2532
2533         return safe_atou64(v, ret);
2534 }
2535
2536 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2537         _cleanup_free_ char *v = NULL;
2538         uint64_t ns;
2539         int r;
2540
2541         assert(u);
2542         assert(ret);
2543
2544         if (!u->cgroup_path)
2545                 return -ENODATA;
2546
2547         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2548         if (unit_has_root_cgroup(u))
2549                 return procfs_cpu_get_usage(ret);
2550
2551         r = cg_all_unified();
2552         if (r < 0)
2553                 return r;
2554         if (r > 0) {
2555                 _cleanup_free_ char *val = NULL;
2556                 uint64_t us;
2557
2558                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2559                         return -ENODATA;
2560
2561                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2562                 if (r < 0)
2563                         return r;
2564                 if (IN_SET(r, -ENOENT, -ENXIO))
2565                         return -ENODATA;
2566
2567                 r = safe_atou64(val, &us);
2568                 if (r < 0)
2569                         return r;
2570
2571                 ns = us * NSEC_PER_USEC;
2572         } else {
2573                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2574                         return -ENODATA;
2575
2576                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2577                 if (r == -ENOENT)
2578                         return -ENODATA;
2579                 if (r < 0)
2580                         return r;
2581
2582                 r = safe_atou64(v, &ns);
2583                 if (r < 0)
2584                         return r;
2585         }
2586
2587         *ret = ns;
2588         return 0;
2589 }
2590
2591 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2592         nsec_t ns;
2593         int r;
2594
2595         assert(u);
2596
2597         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2598          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2599          * call this function with a NULL return value. */
2600
2601         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2602                 return -ENODATA;
2603
2604         r = unit_get_cpu_usage_raw(u, &ns);
2605         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2606                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2607                  * cached value. */
2608
2609                 if (ret)
2610                         *ret = u->cpu_usage_last;
2611                 return 0;
2612         }
2613         if (r < 0)
2614                 return r;
2615
2616         if (ns > u->cpu_usage_base)
2617                 ns -= u->cpu_usage_base;
2618         else
2619                 ns = 0;
2620
2621         u->cpu_usage_last = ns;
2622         if (ret)
2623                 *ret = ns;
2624
2625         return 0;
2626 }
2627
2628 int unit_get_ip_accounting(
2629                 Unit *u,
2630                 CGroupIPAccountingMetric metric,
2631                 uint64_t *ret) {
2632
2633         uint64_t value;
2634         int fd, r;
2635
2636         assert(u);
2637         assert(metric >= 0);
2638         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2639         assert(ret);
2640
2641         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2642                 return -ENODATA;
2643
2644         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2645                 u->ip_accounting_ingress_map_fd :
2646                 u->ip_accounting_egress_map_fd;
2647         if (fd < 0)
2648                 return -ENODATA;
2649
2650         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2651                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2652         else
2653                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2654         if (r < 0)
2655                 return r;
2656
2657         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2658          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2659          * ip_accounting_extra[] field, and add them in here transparently. */
2660
2661         *ret = value + u->ip_accounting_extra[metric];
2662
2663         return r;
2664 }
2665
2666 int unit_reset_cpu_accounting(Unit *u) {
2667         nsec_t ns;
2668         int r;
2669
2670         assert(u);
2671
2672         u->cpu_usage_last = NSEC_INFINITY;
2673
2674         r = unit_get_cpu_usage_raw(u, &ns);
2675         if (r < 0) {
2676                 u->cpu_usage_base = 0;
2677                 return r;
2678         }
2679
2680         u->cpu_usage_base = ns;
2681         return 0;
2682 }
2683
2684 int unit_reset_ip_accounting(Unit *u) {
2685         int r = 0, q = 0;
2686
2687         assert(u);
2688
2689         if (u->ip_accounting_ingress_map_fd >= 0)
2690                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2691
2692         if (u->ip_accounting_egress_map_fd >= 0)
2693                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2694
2695         zero(u->ip_accounting_extra);
2696
2697         return r < 0 ? r : q;
2698 }
2699
2700 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2701         assert(u);
2702
2703         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2704                 return;
2705
2706         if (m == 0)
2707                 return;
2708
2709         /* always invalidate compat pairs together */
2710         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2711                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2712
2713         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2714                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2715
2716         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2717                 return;
2718
2719         u->cgroup_realized_mask &= ~m;
2720         unit_add_to_cgroup_realize_queue(u);
2721 }
2722
2723 void unit_invalidate_cgroup_bpf(Unit *u) {
2724         assert(u);
2725
2726         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2727                 return;
2728
2729         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2730                 return;
2731
2732         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2733         unit_add_to_cgroup_realize_queue(u);
2734
2735         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2736          * list of our children includes our own. */
2737         if (u->type == UNIT_SLICE) {
2738                 Unit *member;
2739                 Iterator i;
2740                 void *v;
2741
2742                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2743                         if (member == u)
2744                                 continue;
2745
2746                         if (UNIT_DEREF(member->slice) != u)
2747                                 continue;
2748
2749                         unit_invalidate_cgroup_bpf(member);
2750                 }
2751         }
2752 }
2753
2754 bool unit_cgroup_delegate(Unit *u) {
2755         CGroupContext *c;
2756
2757         assert(u);
2758
2759         if (!UNIT_VTABLE(u)->can_delegate)
2760                 return false;
2761
2762         c = unit_get_cgroup_context(u);
2763         if (!c)
2764                 return false;
2765
2766         return c->delegate;
2767 }
2768
2769 void manager_invalidate_startup_units(Manager *m) {
2770         Iterator i;
2771         Unit *u;
2772
2773         assert(m);
2774
2775         SET_FOREACH(u, m->startup_units, i)
2776                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2777 }
2778
2779 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2780         [CGROUP_AUTO] = "auto",
2781         [CGROUP_CLOSED] = "closed",
2782         [CGROUP_STRICT] = "strict",
2783 };
2784
2785 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2786 #endif // 0