src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 //#include "bus-error.h"
  28 #include "cgroup-util.h"
  29 #include "cgroup.h"
  30 #include "fd-util.h"
  31 #include "fileio.h"
  32 #include "fs-util.h"
  33 #include "parse-util.h"
  34 #include "path-util.h"
  35 #include "process-util.h"
  36 //#include "procfs-util.h"
  37 //#include "special.h"
  38 #include "stdio-util.h"
  39 #include "string-table.h"
  40 #include "string-util.h"
  41 #include "virt.h"
  42
/* Scheduling period written alongside CPU quotas (second field of "cpu.max", and "cpu.cfs_period_us").
 * Per-second quota values are scaled to this period before being written. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  44
  45 bool manager_owns_root_cgroup(Manager *m) {
  46         assert(m);
  47
  48         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
  49          * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's
  50          * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
  51          * we run in any kind of container virtualization. */
  52
  53         if (detect_container() > 0)
  54                 return false;
  55
  56         return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
  57 }
  58
  59 #if 0 /// UNNEEDED by elogind
  60 bool unit_has_root_cgroup(Unit *u) {
  61         assert(u);
  62
  63         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
  64          * the manager manages the root cgroup. */
  65
  66         if (!manager_owns_root_cgroup(u->manager))
  67                 return false;
  68
  69         return unit_has_name(u, SPECIAL_ROOT_SLICE);
  70 }
  71
  72 static void cgroup_compat_warn(void) {
  73         static bool cgroup_compat_warned = false;
  74
  75         if (cgroup_compat_warned)
  76                 return;
  77
  78         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
  79                     "See cgroup-compat debug messages for details.");
  80
  81         cgroup_compat_warned = true;
  82 }
  83
/* Emits a unit-level debug message prefixed with "cgroup-compat: ". Before logging, cgroup_compat_warn() is
 * invoked so that the one-time compatibility warning has been issued. 'fmt' must be a string literal, since it
 * is concatenated with the prefix at compile time. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
  88
  89 void cgroup_context_init(CGroupContext *c) {
  90         assert(c);
  91
  92         /* Initialize everything to the kernel defaults, assuming the
  93          * structure is preinitialized to 0 */
  94
  95         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  96         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  97         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  98
  99         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
 100         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
 101
 102         c->memory_high = CGROUP_LIMIT_MAX;
 103         c->memory_max = CGROUP_LIMIT_MAX;
 104         c->memory_swap_max = CGROUP_LIMIT_MAX;
 105
 106         c->memory_limit = CGROUP_LIMIT_MAX;
 107
 108         c->io_weight = CGROUP_WEIGHT_INVALID;
 109         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
 110
 111         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
 112         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
 113
 114         c->tasks_max = (uint64_t) -1;
 115 }
 116
 117 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 118         assert(c);
 119         assert(a);
 120
 121         LIST_REMOVE(device_allow, c->device_allow, a);
 122         free(a->path);
 123         free(a);
 124 }
 125
 126 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 127         assert(c);
 128         assert(w);
 129
 130         LIST_REMOVE(device_weights, c->io_device_weights, w);
 131         free(w->path);
 132         free(w);
 133 }
 134
 135 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 136         assert(c);
 137         assert(l);
 138
 139         LIST_REMOVE(device_limits, c->io_device_limits, l);
 140         free(l->path);
 141         free(l);
 142 }
 143
 144 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 145         assert(c);
 146         assert(w);
 147
 148         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 149         free(w->path);
 150         free(w);
 151 }
 152
 153 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 154         assert(c);
 155         assert(b);
 156
 157         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 158         free(b->path);
 159         free(b);
 160 }
 161
 162 void cgroup_context_done(CGroupContext *c) {
 163         assert(c);
 164
 165         while (c->io_device_weights)
 166                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 167
 168         while (c->io_device_limits)
 169                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 170
 171         while (c->blockio_device_weights)
 172                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 173
 174         while (c->blockio_device_bandwidths)
 175                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 176
 177         while (c->device_allow)
 178                 cgroup_context_free_device_allow(c, c->device_allow);
 179
 180         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 181         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 182 }
 183
 184 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 185         CGroupIODeviceLimit *il;
 186         CGroupIODeviceWeight *iw;
 187         CGroupBlockIODeviceBandwidth *b;
 188         CGroupBlockIODeviceWeight *w;
 189         CGroupDeviceAllow *a;
 190         IPAddressAccessItem *iaai;
 191         char u[FORMAT_TIMESPAN_MAX];
 192
 193         assert(c);
 194         assert(f);
 195
 196         prefix = strempty(prefix);
 197
 198         fprintf(f,
 199                 "%sCPUAccounting=%s\n"
 200                 "%sIOAccounting=%s\n"
 201                 "%sBlockIOAccounting=%s\n"
 202                 "%sMemoryAccounting=%s\n"
 203                 "%sTasksAccounting=%s\n"
 204                 "%sIPAccounting=%s\n"
 205                 "%sCPUWeight=%" PRIu64 "\n"
 206                 "%sStartupCPUWeight=%" PRIu64 "\n"
 207                 "%sCPUShares=%" PRIu64 "\n"
 208                 "%sStartupCPUShares=%" PRIu64 "\n"
 209                 "%sCPUQuotaPerSecSec=%s\n"
 210                 "%sIOWeight=%" PRIu64 "\n"
 211                 "%sStartupIOWeight=%" PRIu64 "\n"
 212                 "%sBlockIOWeight=%" PRIu64 "\n"
 213                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 214                 "%sMemoryLow=%" PRIu64 "\n"
 215                 "%sMemoryHigh=%" PRIu64 "\n"
 216                 "%sMemoryMax=%" PRIu64 "\n"
 217                 "%sMemorySwapMax=%" PRIu64 "\n"
 218                 "%sMemoryLimit=%" PRIu64 "\n"
 219                 "%sTasksMax=%" PRIu64 "\n"
 220                 "%sDevicePolicy=%s\n"
 221                 "%sDelegate=%s\n",
 222                 prefix, yes_no(c->cpu_accounting),
 223                 prefix, yes_no(c->io_accounting),
 224                 prefix, yes_no(c->blockio_accounting),
 225                 prefix, yes_no(c->memory_accounting),
 226                 prefix, yes_no(c->tasks_accounting),
 227                 prefix, yes_no(c->ip_accounting),
 228                 prefix, c->cpu_weight,
 229                 prefix, c->startup_cpu_weight,
 230                 prefix, c->cpu_shares,
 231                 prefix, c->startup_cpu_shares,
 232                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 233                 prefix, c->io_weight,
 234                 prefix, c->startup_io_weight,
 235                 prefix, c->blockio_weight,
 236                 prefix, c->startup_blockio_weight,
 237                 prefix, c->memory_low,
 238                 prefix, c->memory_high,
 239                 prefix, c->memory_max,
 240                 prefix, c->memory_swap_max,
 241                 prefix, c->memory_limit,
 242                 prefix, c->tasks_max,
 243                 prefix, cgroup_device_policy_to_string(c->device_policy),
 244                 prefix, yes_no(c->delegate));
 245
 246         if (c->delegate) {
 247                 _cleanup_free_ char *t = NULL;
 248
 249                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 250
 251                 fprintf(f, "%sDelegateControllers=%s\n",
 252                         prefix,
 253                         strempty(t));
 254         }
 255
 256         LIST_FOREACH(device_allow, a, c->device_allow)
 257                 fprintf(f,
 258                         "%sDeviceAllow=%s %s%s%s\n",
 259                         prefix,
 260                         a->path,
 261                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 262
 263         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 264                 fprintf(f,
 265                         "%sIODeviceWeight=%s %" PRIu64,
 266                         prefix,
 267                         iw->path,
 268                         iw->weight);
 269
 270         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 271                 char buf[FORMAT_BYTES_MAX];
 272                 CGroupIOLimitType type;
 273
 274                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 275                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 276                                 fprintf(f,
 277                                         "%s%s=%s %s\n",
 278                                         prefix,
 279                                         cgroup_io_limit_type_to_string(type),
 280                                         il->path,
 281                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 282         }
 283
 284         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 285                 fprintf(f,
 286                         "%sBlockIODeviceWeight=%s %" PRIu64,
 287                         prefix,
 288                         w->path,
 289                         w->weight);
 290
 291         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 292                 char buf[FORMAT_BYTES_MAX];
 293
 294                 if (b->rbps != CGROUP_LIMIT_MAX)
 295                         fprintf(f,
 296                                 "%sBlockIOReadBandwidth=%s %s\n",
 297                                 prefix,
 298                                 b->path,
 299                                 format_bytes(buf, sizeof(buf), b->rbps));
 300                 if (b->wbps != CGROUP_LIMIT_MAX)
 301                         fprintf(f,
 302                                 "%sBlockIOWriteBandwidth=%s %s\n",
 303                                 prefix,
 304                                 b->path,
 305                                 format_bytes(buf, sizeof(buf), b->wbps));
 306         }
 307
 308         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 309                 _cleanup_free_ char *k = NULL;
 310
 311                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 312                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 313         }
 314
 315         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 316                 _cleanup_free_ char *k = NULL;
 317
 318                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 319                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 320         }
 321 }
 322
 323 static int lookup_block_device(const char *p, dev_t *dev) {
 324         struct stat st;
 325         int r;
 326
 327         assert(p);
 328         assert(dev);
 329
 330         r = stat(p, &st);
 331         if (r < 0)
 332                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 333
 334         if (S_ISBLK(st.st_mode))
 335                 *dev = st.st_rdev;
 336         else if (major(st.st_dev) != 0) {
 337                 /* If this is not a device node then find the block
 338                  * device this file is stored on */
 339                 *dev = st.st_dev;
 340
 341                 /* If this is a partition, try to get the originating
 342                  * block device */
 343                 (void) block_get_whole_disk(*dev, dev);
 344         } else {
 345                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 346                 return -ENODEV;
 347         }
 348
 349         return 0;
 350 }
 351
 352 static int whitelist_device(const char *path, const char *node, const char *acc) {
 353         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 354         struct stat st;
 355         bool ignore_notfound;
 356         int r;
 357
 358         assert(path);
 359         assert(acc);
 360
 361         if (node[0] == '-') {
 362                 /* Non-existent paths starting with "-" must be silently ignored */
 363                 node++;
 364                 ignore_notfound = true;
 365         } else
 366                 ignore_notfound = false;
 367
 368         if (stat(node, &st) < 0) {
 369                 if (errno == ENOENT && ignore_notfound)
 370                         return 0;
 371
 372                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 373         }
 374
 375         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 376                 log_warning("%s is not a device.", node);
 377                 return -ENODEV;
 378         }
 379
 380         sprintf(buf,
 381                 "%c %u:%u %s",
 382                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 383                 major(st.st_rdev), minor(st.st_rdev),
 384                 acc);
 385
 386         r = cg_set_attribute("devices", path, "devices.allow", buf);
 387         if (r < 0)
 388                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 389                                "Failed to set devices.allow on %s: %m", path);
 390
 391         return r;
 392 }
 393
 394 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 395         _cleanup_fclose_ FILE *f = NULL;
 396         char line[LINE_MAX];
 397         bool good = false;
 398         int r;
 399
 400         assert(path);
 401         assert(acc);
 402         assert(IN_SET(type, 'b', 'c'));
 403
 404         f = fopen("/proc/devices", "re");
 405         if (!f)
 406                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 407
 408         FOREACH_LINE(line, f, goto fail) {
 409                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 410                 unsigned maj;
 411
 412                 truncate_nl(line);
 413
 414                 if (type == 'c' && streq(line, "Character devices:")) {
 415                         good = true;
 416                         continue;
 417                 }
 418
 419                 if (type == 'b' && streq(line, "Block devices:")) {
 420                         good = true;
 421                         continue;
 422                 }
 423
 424                 if (isempty(line)) {
 425                         good = false;
 426                         continue;
 427                 }
 428
 429                 if (!good)
 430                         continue;
 431
 432                 p = strstrip(line);
 433
 434                 w = strpbrk(p, WHITESPACE);
 435                 if (!w)
 436                         continue;
 437                 *w = 0;
 438
 439                 r = safe_atou(p, &maj);
 440                 if (r < 0)
 441                         continue;
 442                 if (maj <= 0)
 443                         continue;
 444
 445                 w++;
 446                 w += strspn(w, WHITESPACE);
 447
 448                 if (fnmatch(name, w, 0) != 0)
 449                         continue;
 450
 451                 sprintf(buf,
 452                         "%c %u:* %s",
 453                         type,
 454                         maj,
 455                         acc);
 456
 457                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 458                 if (r < 0)
 459                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 460                                        "Failed to set devices.allow on %s: %m", path);
 461         }
 462
 463         return 0;
 464
 465 fail:
 466         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 467 }
 468
 469 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 470         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 471                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 472 }
 473
 474 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 475         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 476                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 477 }
 478
 479 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 480         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 481             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 482                 return c->startup_cpu_weight;
 483         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 484                 return c->cpu_weight;
 485         else
 486                 return CGROUP_WEIGHT_DEFAULT;
 487 }
 488
 489 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 490         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 491             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 492                 return c->startup_cpu_shares;
 493         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 494                 return c->cpu_shares;
 495         else
 496                 return CGROUP_CPU_SHARES_DEFAULT;
 497 }
 498
 499 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 500         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 501         int r;
 502
 503         xsprintf(buf, "%" PRIu64 "\n", weight);
 504         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 505         if (r < 0)
 506                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 507                               "Failed to set cpu.weight: %m");
 508
 509         if (quota != USEC_INFINITY)
 510                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 511                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 512         else
 513                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 514
 515         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 516
 517         if (r < 0)
 518                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 519                               "Failed to set cpu.max: %m");
 520 }
 521
 522 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 523         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 524         int r;
 525
 526         xsprintf(buf, "%" PRIu64 "\n", shares);
 527         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 528         if (r < 0)
 529                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 530                               "Failed to set cpu.shares: %m");
 531
 532         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 533         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 534         if (r < 0)
 535                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 536                               "Failed to set cpu.cfs_period_us: %m");
 537
 538         if (quota != USEC_INFINITY) {
 539                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 540                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 541         } else
 542                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 543         if (r < 0)
 544                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 545                               "Failed to set cpu.cfs_quota_us: %m");
 546 }
 547
 548 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 549         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 550                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 551 }
 552
 553 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 554         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 555                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 556 }
 557
 558 static bool cgroup_context_has_io_config(CGroupContext *c) {
 559         return c->io_accounting ||
 560                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 561                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 562                 c->io_device_weights ||
 563                 c->io_device_limits;
 564 }
 565
 566 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 567         return c->blockio_accounting ||
 568                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 569                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 570                 c->blockio_device_weights ||
 571                 c->blockio_device_bandwidths;
 572 }
 573
 574 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 575         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 576             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 577                 return c->startup_io_weight;
 578         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 579                 return c->io_weight;
 580         else
 581                 return CGROUP_WEIGHT_DEFAULT;
 582 }
 583
 584 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 585         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 586             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 587                 return c->startup_blockio_weight;
 588         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 589                 return c->blockio_weight;
 590         else
 591                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 592 }
 593
 594 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 595         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 596                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 597 }
 598
 599 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 600         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 601                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 602 }
 603
 604 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 605         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 606         dev_t dev;
 607         int r;
 608
 609         r = lookup_block_device(dev_path, &dev);
 610         if (r < 0)
 611                 return;
 612
 613         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 614         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 615         if (r < 0)
 616                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 617                               "Failed to set io.weight: %m");
 618 }
 619
 620 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 621         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 622         dev_t dev;
 623         int r;
 624
 625         r = lookup_block_device(dev_path, &dev);
 626         if (r < 0)
 627                 return;
 628
 629         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 630         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 631         if (r < 0)
 632                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 633                               "Failed to set blkio.weight_device: %m");
 634 }
 635
 636 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 637         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 638         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 639         CGroupIOLimitType type;
 640         dev_t dev;
 641         unsigned n = 0;
 642         int r;
 643
 644         r = lookup_block_device(dev_path, &dev);
 645         if (r < 0)
 646                 return 0;
 647
 648         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 649                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 650                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 651                         n++;
 652                 } else {
 653                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 654                 }
 655         }
 656
 657         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 658                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 659                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 660         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 661         if (r < 0)
 662                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 663                               "Failed to set io.max: %m");
 664         return n;
 665 }
 666
 667 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 668         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 669         dev_t dev;
 670         unsigned n = 0;
 671         int r;
 672
 673         r = lookup_block_device(dev_path, &dev);
 674         if (r < 0)
 675                 return 0;
 676
 677         if (rbps != CGROUP_LIMIT_MAX)
 678                 n++;
 679         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 680         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 681         if (r < 0)
 682                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 683                               "Failed to set blkio.throttle.read_bps_device: %m");
 684
 685         if (wbps != CGROUP_LIMIT_MAX)
 686                 n++;
 687         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 688         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 689         if (r < 0)
 690                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 691                               "Failed to set blkio.throttle.write_bps_device: %m");
 692
 693         return n;
 694 }
 695
 696 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 697         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 698 }
 699
 700 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 701         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 702         int r;
 703
 704         if (v != CGROUP_LIMIT_MAX)
 705                 xsprintf(buf, "%" PRIu64 "\n", v);
 706
 707         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 708         if (r < 0)
 709                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 710                               "Failed to set %s: %m", file);
 711 }
 712
 713 static void cgroup_apply_firewall(Unit *u) {
 714         assert(u);
 715
 716         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
 717
 718         if (bpf_firewall_compile(u) < 0)
 719                 return;
 720
 721         (void) bpf_firewall_install(u);
 722 }
 723
 724 static void cgroup_context_apply(
 725                 Unit *u,
 726                 CGroupMask apply_mask,
 727                 bool apply_bpf,
 728                 ManagerState state) {
 729
 730         const char *path;
 731         CGroupContext *c;
 732         bool is_root;
 733         int r;
 734
 735         assert(u);
 736
 737         /* Nothing to do? Exit early! */
 738         if (apply_mask == 0 && !apply_bpf)
 739                 return;
 740
 741         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
 742         is_root = unit_has_root_cgroup(u);
 743
 744         assert_se(c = unit_get_cgroup_context(u));
 745         assert_se(path = u->cgroup_path);
 746
 747         if (is_root) /* Make sure we don't try to display messages with an empty path. */
 748                 path = "/";
 749
 750         /* We generally ignore errors caused by read-only mounted
 751          * cgroup trees (assuming we are running in a container then),
 752          * and missing cgroups, i.e. EROFS and ENOENT. */
 753
 754         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 755                 bool has_weight, has_shares;
 756
 757                 has_weight = cgroup_context_has_cpu_weight(c);
 758                 has_shares = cgroup_context_has_cpu_shares(c);
 759
 760                 if (cg_all_unified() > 0) {
 761                         uint64_t weight;
 762
 763                         if (has_weight)
 764                                 weight = cgroup_context_cpu_weight(c, state);
 765                         else if (has_shares) {
 766                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 767
 768                                 weight = cgroup_cpu_shares_to_weight(shares);
 769
 770                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 771                                                   shares, weight, path);
 772                         } else
 773                                 weight = CGROUP_WEIGHT_DEFAULT;
 774
 775                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 776                 } else {
 777                         uint64_t shares;
 778
 779                         if (has_weight) {
 780                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 781
 782                                 shares = cgroup_cpu_weight_to_shares(weight);
 783
 784                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 785                                                   weight, shares, path);
 786                         } else if (has_shares)
 787                                 shares = cgroup_context_cpu_shares(c, state);
 788                         else
 789                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 790
 791                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 792                 }
 793         }
 794
 795         if (apply_mask & CGROUP_MASK_IO) {
 796                 bool has_io = cgroup_context_has_io_config(c);
 797                 bool has_blockio = cgroup_context_has_blockio_config(c);
 798
 799                 if (!is_root) {
 800                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 801                         uint64_t weight;
 802
 803                         if (has_io)
 804                                 weight = cgroup_context_io_weight(c, state);
 805                         else if (has_blockio) {
 806                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 807
 808                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 809
 810                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 811                                                   blkio_weight, weight);
 812                         } else
 813                                 weight = CGROUP_WEIGHT_DEFAULT;
 814
 815                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 816                         r = cg_set_attribute("io", path, "io.weight", buf);
 817                         if (r < 0)
 818                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 819                                               "Failed to set io.weight: %m");
 820
 821                         if (has_io) {
 822                                 CGroupIODeviceWeight *w;
 823
 824                                 /* FIXME: no way to reset this list */
 825                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 826                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 827                         } else if (has_blockio) {
 828                                 CGroupBlockIODeviceWeight *w;
 829
 830                                 /* FIXME: no way to reset this list */
 831                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 832                                         weight = cgroup_weight_blkio_to_io(w->weight);
 833
 834                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 835                                                           w->weight, weight, w->path);
 836
 837                                         cgroup_apply_io_device_weight(u, w->path, weight);
 838                                 }
 839                         }
 840                 }
 841
 842                 /* Apply limits and free ones without config. */
 843                 if (has_io) {
 844                         CGroupIODeviceLimit *l, *next;
 845
 846                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 847                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 848                                         cgroup_context_free_io_device_limit(c, l);
 849                         }
 850                 } else if (has_blockio) {
 851                         CGroupBlockIODeviceBandwidth *b, *next;
 852
 853                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 854                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 855                                 CGroupIOLimitType type;
 856
 857                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 858                                         limits[type] = cgroup_io_limit_defaults[type];
 859
 860                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 861                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 862
 863                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 864                                                   b->rbps, b->wbps, b->path);
 865
 866                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 867                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 868                         }
 869                 }
 870         }
 871
 872         if (apply_mask & CGROUP_MASK_BLKIO) {
 873                 bool has_io = cgroup_context_has_io_config(c);
 874                 bool has_blockio = cgroup_context_has_blockio_config(c);
 875
 876                 if (!is_root) {
 877                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 878                         uint64_t weight;
 879
 880                         if (has_io) {
 881                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 882
 883                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 884
 885                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 886                                                   io_weight, weight);
 887                         } else if (has_blockio)
 888                                 weight = cgroup_context_blkio_weight(c, state);
 889                         else
 890                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 891
 892                         xsprintf(buf, "%" PRIu64 "\n", weight);
 893                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 894                         if (r < 0)
 895                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 896                                               "Failed to set blkio.weight: %m");
 897
 898                         if (has_io) {
 899                                 CGroupIODeviceWeight *w;
 900
 901                                 /* FIXME: no way to reset this list */
 902                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 903                                         weight = cgroup_weight_io_to_blkio(w->weight);
 904
 905                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 906                                                           w->weight, weight, w->path);
 907
 908                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 909                                 }
 910                         } else if (has_blockio) {
 911                                 CGroupBlockIODeviceWeight *w;
 912
 913                                 /* FIXME: no way to reset this list */
 914                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 915                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 916                         }
 917                 }
 918
 919                 /* Apply limits and free ones without config. */
 920                 if (has_io) {
 921                         CGroupIODeviceLimit *l, *next;
 922
 923                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 924                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 925                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 926
 927                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 928                                         cgroup_context_free_io_device_limit(c, l);
 929                         }
 930                 } else if (has_blockio) {
 931                         CGroupBlockIODeviceBandwidth *b, *next;
 932
 933                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 934                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 935                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 936                 }
 937         }
 938
 939         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 940                 if (cg_all_unified() > 0) {
 941                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 942
 943                         if (cgroup_context_has_unified_memory_config(c)) {
 944                                 max = c->memory_max;
 945                                 swap_max = c->memory_swap_max;
 946                         } else {
 947                                 max = c->memory_limit;
 948
 949                                 if (max != CGROUP_LIMIT_MAX)
 950                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 951                         }
 952
 953                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 954                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 955                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 956                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 957                 } else {
 958                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 959                         uint64_t val;
 960
 961                         if (cgroup_context_has_unified_memory_config(c)) {
 962                                 val = c->memory_max;
 963                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 964                         } else
 965                                 val = c->memory_limit;
 966
 967                         if (val == CGROUP_LIMIT_MAX)
 968                                 strncpy(buf, "-1\n", sizeof(buf));
 969                         else
 970                                 xsprintf(buf, "%" PRIu64 "\n", val);
 971
 972                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 973                         if (r < 0)
 974                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 975                                               "Failed to set memory.limit_in_bytes: %m");
 976                 }
 977         }
 978
 979         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 980                 CGroupDeviceAllow *a;
 981
 982                 /* Changing the devices list of a populated cgroup
 983                  * might result in EINVAL, hence ignore EINVAL
 984                  * here. */
 985
 986                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 987                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 988                 else
 989                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 990                 if (r < 0)
 991                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 992                                       "Failed to reset devices.list: %m");
 993
 994                 if (c->device_policy == CGROUP_CLOSED ||
 995                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 996                         static const char auto_devices[] =
 997                                 "/dev/null\0" "rwm\0"
 998                                 "/dev/zero\0" "rwm\0"
 999                                 "/dev/full\0" "rwm\0"
1000                                 "/dev/random\0" "rwm\0"
1001                                 "/dev/urandom\0" "rwm\0"
1002                                 "/dev/tty\0" "rwm\0"
1003                                 "/dev/ptmx\0" "rwm\0"
1004                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1005                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
1006                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
1007
1008                         const char *x, *y;
1009
1010                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
1011                                 whitelist_device(path, x, y);
1012
1013                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1014                         whitelist_major(path, "pts", 'c', "rw");
1015                 }
1016
1017                 LIST_FOREACH(device_allow, a, c->device_allow) {
1018                         char acc[4], *val;
1019                         unsigned k = 0;
1020
1021                         if (a->r)
1022                                 acc[k++] = 'r';
1023                         if (a->w)
1024                                 acc[k++] = 'w';
1025                         if (a->m)
1026                                 acc[k++] = 'm';
1027
1028                         if (k == 0)
1029                                 continue;
1030
1031                         acc[k++] = 0;
1032
1033                         if (path_startswith(a->path, "/dev/"))
1034                                 whitelist_device(path, a->path, acc);
1035                         else if ((val = startswith(a->path, "block-")))
1036                                 whitelist_major(path, val, 'b', acc);
1037                         else if ((val = startswith(a->path, "char-")))
1038                                 whitelist_major(path, val, 'c', acc);
1039                         else
1040                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1041                 }
1042         }
1043
1044         if (apply_mask & CGROUP_MASK_PIDS) {
1045
1046                 if (is_root) {
1047                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1048                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1049                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1050                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1051                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1052                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1053                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1054                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1055                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1056                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1057                          * which is desirable so that there's an offical way to release control of the sysctl from
1058                          * systemd: set the limit to unbounded and reload. */
1059
1060                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1061                                 u->manager->sysctl_pid_max_changed = true;
1062                                 r = procfs_tasks_set_limit(c->tasks_max);
1063                         } else if (u->manager->sysctl_pid_max_changed)
1064                                 r = procfs_tasks_set_limit(TASKS_MAX);
1065                         else
1066                                 r = 0;
1067
1068                         if (r < 0)
1069                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1070                                               "Failed to write to tasks limit sysctls: %m");
1071
1072                 } else {
1073                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1074                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1075
1076                                 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1077                                 r = cg_set_attribute("pids", path, "pids.max", buf);
1078                         } else
1079                                 r = cg_set_attribute("pids", path, "pids.max", "max");
1080                         if (r < 0)
1081                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1082                                               "Failed to set pids.max: %m");
1083                 }
1084         }
1085
1086         if (apply_bpf)
1087                 cgroup_apply_firewall(u);
1088 }
1089
1090 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1091         CGroupMask mask = 0;
1092
1093         /* Figure out which controllers we need */
1094
1095         if (c->cpu_accounting ||
1096             cgroup_context_has_cpu_weight(c) ||
1097             cgroup_context_has_cpu_shares(c) ||
1098             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1099                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1100
1101         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1102                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1103
1104         if (c->memory_accounting ||
1105             c->memory_limit != CGROUP_LIMIT_MAX ||
1106             cgroup_context_has_unified_memory_config(c))
1107                 mask |= CGROUP_MASK_MEMORY;
1108
1109         if (c->device_allow ||
1110             c->device_policy != CGROUP_AUTO)
1111                 mask |= CGROUP_MASK_DEVICES;
1112
1113         if (c->tasks_accounting ||
1114             c->tasks_max != CGROUP_LIMIT_MAX)
1115                 mask |= CGROUP_MASK_PIDS;
1116
1117         return mask;
1118 }
1119
1120 CGroupMask unit_get_own_mask(Unit *u) {
1121         CGroupContext *c;
1122
1123         /* Returns the mask of controllers the unit needs for itself */
1124
1125         c = unit_get_cgroup_context(u);
1126         if (!c)
1127                 return 0;
1128
1129         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1130 }
1131
1132 CGroupMask unit_get_delegate_mask(Unit *u) {
1133         CGroupContext *c;
1134
1135         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1136          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1137          *
1138          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1139
1140         if (!unit_cgroup_delegate(u))
1141                 return 0;
1142
1143         if (cg_all_unified() <= 0) {
1144                 ExecContext *e;
1145
1146                 e = unit_get_exec_context(u);
1147                 if (e && !exec_context_maintains_privileges(e))
1148                         return 0;
1149         }
1150
1151         assert_se(c = unit_get_cgroup_context(u));
1152         return c->delegate_controllers;
1153 }
1154
1155 CGroupMask unit_get_members_mask(Unit *u) {
1156         assert(u);
1157
1158         /* Returns the mask of controllers all of the unit's children require, merged */
1159
1160         if (u->cgroup_members_mask_valid)
1161                 return u->cgroup_members_mask;
1162
1163         u->cgroup_members_mask = 0;
1164
1165         if (u->type == UNIT_SLICE) {
1166                 void *v;
1167                 Unit *member;
1168                 Iterator i;
1169
1170                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1171
1172                         if (member == u)
1173                                 continue;
1174
1175                         if (UNIT_DEREF(member->slice) != u)
1176                                 continue;
1177
1178                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1179                 }
1180         }
1181
1182         u->cgroup_members_mask_valid = true;
1183         return u->cgroup_members_mask;
1184 }
1185
1186 CGroupMask unit_get_siblings_mask(Unit *u) {
1187         assert(u);
1188
1189         /* Returns the mask of controllers all of the unit's siblings
1190          * require, i.e. the members mask of the unit's parent slice
1191          * if there is one. */
1192
1193         if (UNIT_ISSET(u->slice))
1194                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1195
1196         return unit_get_subtree_mask(u); /* we are the top-level slice */
1197 }
1198
1199 CGroupMask unit_get_subtree_mask(Unit *u) {
1200
1201         /* Returns the mask of this subtree, meaning of the group
1202          * itself and its children. */
1203
1204         return unit_get_own_mask(u) | unit_get_members_mask(u);
1205 }
1206
1207 CGroupMask unit_get_target_mask(Unit *u) {
1208         CGroupMask mask;
1209
1210         /* This returns the cgroup mask of all controllers to enable
1211          * for a specific cgroup, i.e. everything it needs itself,
1212          * plus all that its children need, plus all that its siblings
1213          * need. This is primarily useful on the legacy cgroup
1214          * hierarchy, where we need to duplicate each cgroup in each
1215          * hierarchy that shall be enabled for it. */
1216
1217         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1218         mask &= u->manager->cgroup_supported;
1219
1220         return mask;
1221 }
1222
1223 CGroupMask unit_get_enable_mask(Unit *u) {
1224         CGroupMask mask;
1225
1226         /* This returns the cgroup mask of all controllers to enable
1227          * for the children of a specific cgroup. This is primarily
1228          * useful for the unified cgroup hierarchy, where each cgroup
1229          * controls which controllers are enabled for its children. */
1230
1231         mask = unit_get_members_mask(u);
1232         mask &= u->manager->cgroup_supported;
1233
1234         return mask;
1235 }
1236
1237 bool unit_get_needs_bpf(Unit *u) {
1238         CGroupContext *c;
1239         Unit *p;
1240         assert(u);
1241
1242         c = unit_get_cgroup_context(u);
1243         if (!c)
1244                 return false;
1245
1246         if (c->ip_accounting ||
1247             c->ip_address_allow ||
1248             c->ip_address_deny)
1249                 return true;
1250
1251         /* If any parent slice has an IP access list defined, it applies too */
1252         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1253                 c = unit_get_cgroup_context(p);
1254                 if (!c)
1255                         return false;
1256
1257                 if (c->ip_address_allow ||
1258                     c->ip_address_deny)
1259                         return true;
1260         }
1261
1262         return false;
1263 }
1264
1265 /* Recurse from a unit up through its containing slices, propagating
1266  * mask bits upward. A unit is also member of itself. */
1267 void unit_update_cgroup_members_masks(Unit *u) {
1268         CGroupMask m;
1269         bool more;
1270
1271         assert(u);
1272
1273         /* Calculate subtree mask */
1274         m = unit_get_subtree_mask(u);
1275
1276         /* See if anything changed from the previous invocation. If
1277          * not, we're done. */
1278         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1279                 return;
1280
1281         more =
1282                 u->cgroup_subtree_mask_valid &&
1283                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1284                 ((~m & u->cgroup_subtree_mask) == 0);
1285
1286         u->cgroup_subtree_mask = m;
1287         u->cgroup_subtree_mask_valid = true;
1288
1289         if (UNIT_ISSET(u->slice)) {
1290                 Unit *s = UNIT_DEREF(u->slice);
1291
1292                 if (more)
1293                         /* There's more set now than before. We
1294                          * propagate the new mask to the parent's mask
1295                          * (not caring if it actually was valid or
1296                          * not). */
1297
1298                         s->cgroup_members_mask |= m;
1299
1300                 else
1301                         /* There's less set now than before (or we
1302                          * don't know), we need to recalculate
1303                          * everything, so let's invalidate the
1304                          * parent's members mask */
1305
1306                         s->cgroup_members_mask_valid = false;
1307
1308                 /* And now make sure that this change also hits our
1309                  * grandparents */
1310                 unit_update_cgroup_members_masks(s);
1311         }
1312 }
1313
1314 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1315
1316         /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1317
1318         while (u) {
1319
1320                 if (u->cgroup_path &&
1321                     u->cgroup_realized &&
1322                     (u->cgroup_realized_mask & mask) == mask)
1323                         return u->cgroup_path;
1324
1325                 u = UNIT_DEREF(u->slice);
1326         }
1327
1328         return NULL;
1329 }
1330
1331 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1332         return unit_get_realized_cgroup_path(userdata, mask);
1333 }
1334
1335 char *unit_default_cgroup_path(Unit *u) {
1336         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1337         int r;
1338
1339         assert(u);
1340
1341         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1342                 return strdup(u->manager->cgroup_root);
1343
1344         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1345                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1346                 if (r < 0)
1347                         return NULL;
1348         }
1349
1350         escaped = cg_escape(u->id);
1351         if (!escaped)
1352                 return NULL;
1353
1354         if (slice)
1355                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1356                                escaped);
1357         else
1358                 return strjoin(u->manager->cgroup_root, "/", escaped);
1359 }
1360
1361 int unit_set_cgroup_path(Unit *u, const char *path) {
1362         _cleanup_free_ char *p = NULL;
1363         int r;
1364
1365         assert(u);
1366
1367         if (path) {
1368                 p = strdup(path);
1369                 if (!p)
1370                         return -ENOMEM;
1371         } else
1372                 p = NULL;
1373
1374         if (streq_ptr(u->cgroup_path, p))
1375                 return 0;
1376
1377         if (p) {
1378                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1379                 if (r < 0)
1380                         return r;
1381         }
1382
1383         unit_release_cgroup(u);
1384
1385         u->cgroup_path = p;
1386         p = NULL;
1387
1388         return 1;
1389 }
1390
1391 int unit_watch_cgroup(Unit *u) {
1392         _cleanup_free_ char *events = NULL;
1393         int r;
1394
1395         assert(u);
1396
1397         if (!u->cgroup_path)
1398                 return 0;
1399
1400         if (u->cgroup_inotify_wd >= 0)
1401                 return 0;
1402
1403         /* Only applies to the unified hierarchy */
1404         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1405         if (r < 0)
1406                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1407         if (r == 0)
1408                 return 0;
1409
1410         /* Don't watch the root slice, it's pointless. */
1411         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1412                 return 0;
1413
1414         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1415         if (r < 0)
1416                 return log_oom();
1417
1418         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1419         if (r < 0)
1420                 return log_oom();
1421
1422         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1423         if (u->cgroup_inotify_wd < 0) {
1424
1425                 if (errno == ENOENT) /* If the directory is already
1426                                       * gone we don't need to track
1427                                       * it, so this is not an error */
1428                         return 0;
1429
1430                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1431         }
1432
1433         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1434         if (r < 0)
1435                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1436
1437         return 0;
1438 }
1439
1440 int unit_pick_cgroup_path(Unit *u) {
1441         _cleanup_free_ char *path = NULL;
1442         int r;
1443
1444         assert(u);
1445
1446         if (u->cgroup_path)
1447                 return 0;
1448
1449         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1450                 return -EINVAL;
1451
1452         path = unit_default_cgroup_path(u);
1453         if (!path)
1454                 return log_oom();
1455
1456         r = unit_set_cgroup_path(u, path);
1457         if (r == -EEXIST)
1458                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1459         if (r < 0)
1460                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1461
1462         return 0;
1463 }
1464
1465 static int unit_create_cgroup(
1466                 Unit *u,
1467                 CGroupMask target_mask,
1468                 CGroupMask enable_mask,
1469                 bool needs_bpf) {
1470
1471         CGroupContext *c;
1472         int r;
1473
1474         assert(u);
1475
1476         c = unit_get_cgroup_context(u);
1477         if (!c)
1478                 return 0;
1479
1480         /* Figure out our cgroup path */
1481         r = unit_pick_cgroup_path(u);
1482         if (r < 0)
1483                 return r;
1484
1485         /* First, create our own group */
1486         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1487         if (r < 0)
1488                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1489
1490         /* Start watching it */
1491         (void) unit_watch_cgroup(u);
1492
1493         /* Enable all controllers we need */
1494         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1495         if (r < 0)
1496                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1497
1498         /* Keep track that this is now realized */
1499         u->cgroup_realized = true;
1500         u->cgroup_realized_mask = target_mask;
1501         u->cgroup_enabled_mask = enable_mask;
1502         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1503
1504         if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1505
1506                 /* Then, possibly move things over, but not if
1507                  * subgroups may contain processes, which is the case
1508                  * for slice and delegation units. */
1509                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1510                 if (r < 0)
1511                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1512         }
1513
1514         return 0;
1515 }
1516
1517 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1518         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1519         char *pp;
1520         int r;
1521
1522         assert(u);
1523
1524         if (MANAGER_IS_SYSTEM(u->manager))
1525                 return -EINVAL;
1526
1527         if (!u->manager->system_bus)
1528                 return -EIO;
1529
1530         if (!u->cgroup_path)
1531                 return -EINVAL;
1532
1533         /* Determine this unit's cgroup path relative to our cgroup root */
1534         pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1535         if (!pp)
1536                 return -EINVAL;
1537
1538         pp = strjoina("/", pp, suffix_path);
1539         path_kill_slashes(pp);
1540
1541         r = sd_bus_call_method(u->manager->system_bus,
1542                                "org.freedesktop.systemd1",
1543                                "/org/freedesktop/systemd1",
1544                                "org.freedesktop.systemd1.Manager",
1545                                "AttachProcessesToUnit",
1546                                &error, NULL,
1547                                "ssau",
1548                                NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1549         if (r < 0)
1550                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1551
1552         return 0;
1553 }
1554
/* Moves all processes listed in 'pids' into the cgroup of the unit, or into the sub-cgroup 'suffix_path'
 * below it if that is non-empty. The unit's cgroup is realized first.
 *
 * Returns -EINVAL if the unit has no cgroup context, 0 if 'pids' is empty, the error of
 * unit_realize_cgroup() if realization fails, and the error of cg_all_unified() if that check fails.
 * Otherwise returns the first error seen while attaching a PID to the main hierarchy (and, where tried, via
 * the bus), or 0 if there was none. Failures to attach in the individual legacy controller hierarchies are
 * only logged. */
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        /* Determine the cgroup path the processes shall end up in */
        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        /* From here on 'r' collects the first per-PID error */
        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When the bus thing worked via the bus we are fully done for this PID. */
                        }

                        /* Note that it is the original cg_attach() error that is remembered here, not the
                         * error of the bus fallback. */
                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                /* If everything is on the unified hierarchy there are no per-controller hierarchies to take
                 * care of, and we are done with this PID. */
                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        /* Skip controllers not included in the manager's set of supported ones */
                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird
                         * happened. In that case let's attach the PID at least to the closest cgroup up the tree
                         * that is realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        /* Failures here are logged only, they do not affect the return value */
                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}
1654
1655 static void cgroup_xattr_apply(Unit *u) {
1656         char ids[SD_ID128_STRING_MAX];
1657         int r;
1658
1659         assert(u);
1660
1661         if (!MANAGER_IS_SYSTEM(u->manager))
1662                 return;
1663
1664         if (sd_id128_is_null(u->invocation_id))
1665                 return;
1666
1667         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1668                          "trusted.invocation_id",
1669                          sd_id128_to_string(u->invocation_id, ids), 32,
1670                          0);
1671         if (r < 0)
1672                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1673 }
1674
1675 static bool unit_has_mask_realized(
1676                 Unit *u,
1677                 CGroupMask target_mask,
1678                 CGroupMask enable_mask,
1679                 bool needs_bpf) {
1680
1681         assert(u);
1682
1683         return u->cgroup_realized &&
1684                 u->cgroup_realized_mask == target_mask &&
1685                 u->cgroup_enabled_mask == enable_mask &&
1686                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1687                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1688 }
1689
1690 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1691         assert(u);
1692
1693         if (u->in_cgroup_realize_queue)
1694                 return;
1695
1696         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1697         u->in_cgroup_realize_queue = true;
1698 }
1699
1700 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1701         assert(u);
1702
1703         if (!u->in_cgroup_realize_queue)
1704                 return;
1705
1706         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1707         u->in_cgroup_realize_queue = false;
1708 }
1709
1710
1711 /* Check if necessary controllers and attributes for a unit are in place.
1712  *
1713  * If so, do nothing.
1714  * If not, create paths, move processes over, and set attributes.
1715  *
1716  * Returns 0 on success and < 0 on failure. */
1717 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1718         CGroupMask target_mask, enable_mask;
1719         bool needs_bpf, apply_bpf;
1720         int r;
1721
1722         assert(u);
1723
1724         unit_remove_from_cgroup_realize_queue(u);
1725
1726         target_mask = unit_get_target_mask(u);
1727         enable_mask = unit_get_enable_mask(u);
1728         needs_bpf = unit_get_needs_bpf(u);
1729
1730         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1731                 return 0;
1732
1733         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1734          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1735          * this will trickle down properly to cgroupfs. */
1736         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1737
1738         /* First, realize parents */
1739         if (UNIT_ISSET(u->slice)) {
1740                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1741                 if (r < 0)
1742                         return r;
1743         }
1744
1745         /* And then do the real work */
1746         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1747         if (r < 0)
1748                 return r;
1749
1750         /* Finally, apply the necessary attributes. */
1751         cgroup_context_apply(u, target_mask, apply_bpf, state);
1752         cgroup_xattr_apply(u);
1753
1754         return 0;
1755 }
1756
1757 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1758         ManagerState state;
1759         unsigned n = 0;
1760         Unit *i;
1761         int r;
1762
1763         assert(m);
1764
1765         state = manager_state(m);
1766
1767         while ((i = m->cgroup_realize_queue)) {
1768                 assert(i->in_cgroup_realize_queue);
1769
1770                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1771                         /* Maybe things changed, and the unit is not actually active anymore? */
1772                         unit_remove_from_cgroup_realize_queue(i);
1773                         continue;
1774                 }
1775
1776                 r = unit_realize_cgroup_now(i, state);
1777                 if (r < 0)
1778                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1779
1780                 n++;
1781         }
1782
1783         return n;
1784 }
1785
1786 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1787         Unit *slice;
1788
1789         /* This adds the siblings of the specified unit and the
1790          * siblings of all parent units to the cgroup queue. (But
1791          * neither the specified unit itself nor the parents.) */
1792
1793         while ((slice = UNIT_DEREF(u->slice))) {
1794                 Iterator i;
1795                 Unit *m;
1796                 void *v;
1797
1798                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1799                         if (m == u)
1800                                 continue;
1801
1802                         /* Skip units that have a dependency on the slice
1803                          * but aren't actually in it. */
1804                         if (UNIT_DEREF(m->slice) != slice)
1805                                 continue;
1806
1807                         /* No point in doing cgroup application for units
1808                          * without active processes. */
1809                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1810                                 continue;
1811
1812                         /* If the unit doesn't need any new controllers
1813                          * and has current ones realized, it doesn't need
1814                          * any changes. */
1815                         if (unit_has_mask_realized(m,
1816                                                    unit_get_target_mask(m),
1817                                                    unit_get_enable_mask(m),
1818                                                    unit_get_needs_bpf(m)))
1819                                 continue;
1820
1821                         unit_add_to_cgroup_realize_queue(m);
1822                 }
1823
1824                 u = slice;
1825         }
1826 }
1827
1828 int unit_realize_cgroup(Unit *u) {
1829         assert(u);
1830
1831         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1832                 return 0;
1833
1834         /* So, here's the deal: when realizing the cgroups for this
1835          * unit, we need to first create all parents, but there's more
1836          * actually: for the weight-based controllers we also need to
1837          * make sure that all our siblings (i.e. units that are in the
1838          * same slice as we are) have cgroups, too. Otherwise, things
1839          * would become very uneven as each of their processes would
1840          * get as much resources as all our group together. This call
1841          * will synchronously create the parent cgroups, but will
1842          * defer work on the siblings to the next event loop
1843          * iteration. */
1844
1845         /* Add all sibling slices to the cgroup queue. */
1846         unit_add_siblings_to_cgroup_realize_queue(u);
1847
1848         /* And realize this one now (and apply the values) */
1849         return unit_realize_cgroup_now(u, manager_state(u->manager));
1850 }
1851
1852 void unit_release_cgroup(Unit *u) {
1853         assert(u);
1854
1855         /* Forgets all cgroup details for this cgroup */
1856
1857         if (u->cgroup_path) {
1858                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1859                 u->cgroup_path = mfree(u->cgroup_path);
1860         }
1861
1862         if (u->cgroup_inotify_wd >= 0) {
1863                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1864                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1865
1866                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1867                 u->cgroup_inotify_wd = -1;
1868         }
1869 }
1870
1871 void unit_prune_cgroup(Unit *u) {
1872         int r;
1873         bool is_root_slice;
1874
1875         assert(u);
1876
1877         /* Removes the cgroup, if empty and possible, and stops watching it. */
1878
1879         if (!u->cgroup_path)
1880                 return;
1881
1882         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1883
1884         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1885
1886         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1887         if (r < 0) {
1888                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1889                 return;
1890         }
1891
1892         if (is_root_slice)
1893                 return;
1894
1895         unit_release_cgroup(u);
1896
1897         u->cgroup_realized = false;
1898         u->cgroup_realized_mask = 0;
1899         u->cgroup_enabled_mask = 0;
1900 }
1901
1902 int unit_search_main_pid(Unit *u, pid_t *ret) {
1903         _cleanup_fclose_ FILE *f = NULL;
1904         pid_t pid = 0, npid, mypid;
1905         int r;
1906
1907         assert(u);
1908         assert(ret);
1909
1910         if (!u->cgroup_path)
1911                 return -ENXIO;
1912
1913         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1914         if (r < 0)
1915                 return r;
1916
1917         mypid = getpid_cached();
1918         while (cg_read_pid(f, &npid) > 0)  {
1919                 pid_t ppid;
1920
1921                 if (npid == pid)
1922                         continue;
1923
1924                 /* Ignore processes that aren't our kids */
1925                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1926                         continue;
1927
1928                 if (pid != 0)
1929                         /* Dang, there's more than one daemonized PID
1930                         in this group, so we don't know what process
1931                         is the main process. */
1932
1933                         return -ENODATA;
1934
1935                 pid = npid;
1936         }
1937
1938         *ret = pid;
1939         return 0;
1940 }
1941
1942 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1943         _cleanup_closedir_ DIR *d = NULL;
1944         _cleanup_fclose_ FILE *f = NULL;
1945         int ret = 0, r;
1946
1947         assert(u);
1948         assert(path);
1949
1950         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1951         if (r < 0)
1952                 ret = r;
1953         else {
1954                 pid_t pid;
1955
1956                 while ((r = cg_read_pid(f, &pid)) > 0) {
1957                         r = unit_watch_pid(u, pid);
1958                         if (r < 0 && ret >= 0)
1959                                 ret = r;
1960                 }
1961
1962                 if (r < 0 && ret >= 0)
1963                         ret = r;
1964         }
1965
1966         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1967         if (r < 0) {
1968                 if (ret >= 0)
1969                         ret = r;
1970         } else {
1971                 char *fn;
1972
1973                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1974                         _cleanup_free_ char *p = NULL;
1975
1976                         p = strjoin(path, "/", fn);
1977                         free(fn);
1978
1979                         if (!p)
1980                                 return -ENOMEM;
1981
1982                         r = unit_watch_pids_in_path(u, p);
1983                         if (r < 0 && ret >= 0)
1984                                 ret = r;
1985                 }
1986
1987                 if (r < 0 && ret >= 0)
1988                         ret = r;
1989         }
1990
1991         return ret;
1992 }
1993
1994 int unit_synthesize_cgroup_empty_event(Unit *u) {
1995         int r;
1996
1997         assert(u);
1998
1999         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
2000          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
2001          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2002
2003         if (!u->cgroup_path)
2004                 return -ENOENT;
2005
2006         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2007         if (r < 0)
2008                 return r;
2009         if (r > 0) /* On unified we have reliable notifications, and don't need this */
2010                 return 0;
2011
2012         if (!set_isempty(u->pids))
2013                 return 0;
2014
2015         unit_add_to_cgroup_empty_queue(u);
2016         return 0;
2017 }
2018
2019 int unit_watch_all_pids(Unit *u) {
2020         int r;
2021
2022         assert(u);
2023
2024         /* Adds all PIDs from our cgroup to the set of PIDs we
2025          * watch. This is a fallback logic for cases where we do not
2026          * get reliable cgroup empty notifications: we try to use
2027          * SIGCHLD as replacement. */
2028
2029         if (!u->cgroup_path)
2030                 return -ENOENT;
2031
2032         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2033         if (r < 0)
2034                 return r;
2035         if (r > 0) /* On unified we can use proper notifications */
2036                 return 0;
2037
2038         return unit_watch_pids_in_path(u, u->cgroup_path);
2039 }
2040
2041 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2042         Manager *m = userdata;
2043         Unit *u;
2044         int r;
2045
2046         assert(s);
2047         assert(m);
2048
2049         u = m->cgroup_empty_queue;
2050         if (!u)
2051                 return 0;
2052
2053         assert(u->in_cgroup_empty_queue);
2054         u->in_cgroup_empty_queue = false;
2055         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2056
2057         if (m->cgroup_empty_queue) {
2058                 /* More stuff queued, let's make sure we remain enabled */
2059                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2060                 if (r < 0)
2061                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
2062         }
2063
2064         unit_add_to_gc_queue(u);
2065
2066         if (UNIT_VTABLE(u)->notify_cgroup_empty)
2067                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2068
2069         return 0;
2070 }
2071
2072 void unit_add_to_cgroup_empty_queue(Unit *u) {
2073         int r;
2074
2075         assert(u);
2076
2077         /* Note that there are four different ways how cgroup empty events reach us:
2078          *
2079          * 1. On the unified hierarchy we get an inotify event on the cgroup
2080          *
2081          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2082          *
2083          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2084          *
2085          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2086          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2087          *
2088          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2089          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2090          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2091          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2092          * case for scope units). */
2093
2094         if (u->in_cgroup_empty_queue)
2095                 return;
2096
2097         /* Let's verify that the cgroup is really empty */
2098         if (!u->cgroup_path)
2099                 return;
2100         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2101         if (r < 0) {
2102                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2103                 return;
2104         }
2105         if (r == 0)
2106                 return;
2107
2108         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2109         u->in_cgroup_empty_queue = true;
2110
2111         /* Trigger the defer event */
2112         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2113         if (r < 0)
2114                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2115 }
2116
2117 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2118         Manager *m = userdata;
2119
2120         assert(s);
2121         assert(fd >= 0);
2122         assert(m);
2123
2124         for (;;) {
2125                 union inotify_event_buffer buffer;
2126                 struct inotify_event *e;
2127                 ssize_t l;
2128
2129                 l = read(fd, &buffer, sizeof(buffer));
2130                 if (l < 0) {
2131                         if (IN_SET(errno, EINTR, EAGAIN))
2132                                 return 0;
2133
2134                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
2135                 }
2136
2137                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2138                         Unit *u;
2139
2140                         if (e->wd < 0)
2141                                 /* Queue overflow has no watch descriptor */
2142                                 continue;
2143
2144                         if (e->mask & IN_IGNORED)
2145                                 /* The watch was just removed */
2146                                 continue;
2147
2148                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2149                         if (!u) /* Not that inotify might deliver
2150                                  * events for a watch even after it
2151                                  * was removed, because it was queued
2152                                  * before the removal. Let's ignore
2153                                  * this here safely. */
2154                                 continue;
2155
2156                         unit_add_to_cgroup_empty_queue(u);
2157                 }
2158         }
2159 }
2160 #endif // 0
2161
2162 int manager_setup_cgroup(Manager *m) {
2163         _cleanup_free_ char *path = NULL;
2164         const char *scope_path;
2165         CGroupController c;
2166         int r, all_unified;
2167 #if 0 /// UNNEEDED by elogind
2168         char *e;
2169 #endif // 0
2170
2171         assert(m);
2172
2173         /* 1. Determine hierarchy */
2174         m->cgroup_root = mfree(m->cgroup_root);
2175 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2176         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2177 #else
2178         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2179 #endif // 0
2180         if (r < 0)
2181                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2182
2183 #if 0 /// elogind does not support systemd scopes and slices
2184         /* Chop off the init scope, if we are already located in it */
2185         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2186
2187         /* LEGACY: Also chop off the system slice if we are in
2188          * it. This is to support live upgrades from older systemd
2189          * versions where PID 1 was moved there. Also see
2190          * cg_get_root_path(). */
2191         if (!e && MANAGER_IS_SYSTEM(m)) {
2192                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2193                 if (!e)
2194                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2195         }
2196         if (e)
2197                 *e = 0;
2198 #endif // 0
2199
2200         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2201                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2202         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2203          * easily prepend it everywhere. */
2204         delete_trailing_chars(m->cgroup_root, "/");
2205
2206         /* 2. Show data */
2207         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2208         if (r < 0)
2209                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2210
2211         r = cg_unified_flush();
2212         if (r < 0)
2213                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2214
2215         all_unified = cg_all_unified();
2216         if (all_unified < 0)
2217                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2218         if (all_unified > 0)
2219                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2220         else {
2221                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2222                 if (r < 0)
2223                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2224                 if (r > 0)
2225                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2226                 else
2227                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2228         }
2229
2230 #if 0 /// elogind is not init, and does not install the agent here.
2231         /* 3. Allocate cgroup empty defer event source */
2232         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2233         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2234         if (r < 0)
2235                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2236
2237         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2238         if (r < 0)
2239                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2240
2241         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2242         if (r < 0)
2243                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2244
2245         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2246
2247         /* 4. Install notifier inotify object, or agent */
2248         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2249
2250                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2251
2252                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2253                 safe_close(m->cgroup_inotify_fd);
2254
2255                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2256                 if (m->cgroup_inotify_fd < 0)
2257                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2258
2259                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2260                 if (r < 0)
2261                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2262
2263                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2264                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2265                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2266                 if (r < 0)
2267                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2268
2269                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2270
2271         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2272
2273                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2274                  * since it does not generate events when control groups with children run empty. */
2275
2276                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2277                 if (r < 0)
2278                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2279                 else if (r > 0)
2280                         log_debug("Installed release agent.");
2281                 else if (r == 0)
2282                         log_debug("Release agent already installed.");
2283         }
2284
2285         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2286         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2287         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2288         if (r >= 0) {
2289                 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2290                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2291                 if (r < 0)
2292                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2293 #else
2294         /* Note:
2295                 * This method is in core, and normally called by systemd
2296                 * being init. As elogind is never init, we can not install
2297                 * our agent here. We do so when mounting our cgroup file
2298                 * system, so only if elogind is its own tiny controller.
2299                 * Further, elogind is not meant to run in systemd init scope. */
2300         if (MANAGER_IS_SYSTEM(m))
2301                 // we are our own cgroup controller
2302                 scope_path = strjoina("");
2303         else if (streq(m->cgroup_root, "/elogind"))
2304                 // root already is our cgroup
2305                 scope_path = strjoina(m->cgroup_root);
2306         else
2307                 // we have to create our own group
2308                 scope_path = strjoina(m->cgroup_root, "/elogind");
2309         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2310 #endif // 0
2311         log_debug_elogind("Created control group \"%s\"", scope_path);
2312
2313 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2314 #endif // 0
2315                 /* 6. And pin it, so that it cannot be unmounted */
2316                 safe_close(m->pin_cgroupfs_fd);
2317                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2318                 if (m->pin_cgroupfs_fd < 0)
2319                         return log_error_errno(errno, "Failed to open pin file: %m");
2320
2321         } else if (r < 0 && !m->test_run_flags)
2322                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2323
2324         /* 7. Always enable hierarchical support if it exists... */
2325         if (!all_unified && m->test_run_flags == 0)
2326                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2327
2328         /* 8. Figure out which controllers are supported, and log about it */
2329         r = cg_mask_supported(&m->cgroup_supported);
2330         if (r < 0)
2331                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2332         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2333                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2334
2335         return 0;
2336 }
2337
2338 void manager_shutdown_cgroup(Manager *m, bool delete) {
2339         assert(m);
2340
2341         /* We can't really delete the group, since we are in it. But
2342          * let's trim it. */
2343         if (delete && m->cgroup_root)
2344                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2345
2346 #if 0 /// elogind is not init
2347         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2348
2349         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2350
2351         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2352         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2353 #endif // 0
2354
2355         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2356
2357         m->cgroup_root = mfree(m->cgroup_root);
2358 }
2359
2360 #if 0 /// UNNEEDED by elogind
2361 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2362         char *p;
2363         Unit *u;
2364
2365         assert(m);
2366         assert(cgroup);
2367
2368         u = hashmap_get(m->cgroup_unit, cgroup);
2369         if (u)
2370                 return u;
2371
2372         p = strdupa(cgroup);
2373         for (;;) {
2374                 char *e;
2375
2376                 e = strrchr(p, '/');
2377                 if (!e || e == p)
2378                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2379
2380                 *e = 0;
2381
2382                 u = hashmap_get(m->cgroup_unit, p);
2383                 if (u)
2384                         return u;
2385         }
2386 }
2387
2388 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2389         _cleanup_free_ char *cgroup = NULL;
2390
2391         assert(m);
2392
2393         if (!pid_is_valid(pid))
2394                 return NULL;
2395
2396         if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2397                 return NULL;
2398
2399         return manager_get_unit_by_cgroup(m, cgroup);
2400 }
2401
2402 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2403         Unit *u, **array;
2404
2405         assert(m);
2406
2407         /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
2408          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2409          * relevant one as children of the process will be assigned to that one, too, before all else. */
2410
2411         if (!pid_is_valid(pid))
2412                 return NULL;
2413
2414         if (pid == getpid_cached())
2415                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2416
2417         u = manager_get_unit_by_pid_cgroup(m, pid);
2418         if (u)
2419                 return u;
2420
2421         u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2422         if (u)
2423                 return u;
2424
2425         array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2426         if (array)
2427                 return array[0];
2428
2429         return NULL;
2430 }
2431 #endif // 0
2432
2433 #if 0 /// elogind must substitute this with its own variant
2434 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2435         Unit *u;
2436
2437         assert(m);
2438         assert(cgroup);
2439
2440         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2441          * or from the --system instance */
2442
2443         log_debug("Got cgroup empty notification for: %s", cgroup);
2444
2445         u = manager_get_unit_by_cgroup(m, cgroup);
2446         if (!u)
2447                 return 0;
2448
2449         unit_add_to_cgroup_empty_queue(u);
2450         return 1;
2451 }
2452 #else
2453 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2454         Session *s;
2455
2456         assert(m);
2457         assert(cgroup);
2458
2459         log_debug("Got cgroup empty notification for: %s", cgroup);
2460
2461         s = hashmap_get(m->sessions, cgroup);
2462
2463         if (s) {
2464                 session_finalize(s);
2465                 session_free(s);
2466         } else
2467                 log_warning("Session not found: %s", cgroup);
2468
2469         return 0;
2470 }
2471 #endif // 0
2472 #if 0 /// UNNEEDED by elogind
2473 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2474         _cleanup_free_ char *v = NULL;
2475         int r;
2476
2477         assert(u);
2478         assert(ret);
2479
2480         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2481                 return -ENODATA;
2482
2483         if (!u->cgroup_path)
2484                 return -ENODATA;
2485
2486         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2487         if (unit_has_root_cgroup(u))
2488                 return procfs_memory_get_current(ret);
2489
2490         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2491                 return -ENODATA;
2492
2493         r = cg_all_unified();
2494         if (r < 0)
2495                 return r;
2496         if (r > 0)
2497                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2498         else
2499                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2500         if (r == -ENOENT)
2501                 return -ENODATA;
2502         if (r < 0)
2503                 return r;
2504
2505         return safe_atou64(v, ret);
2506 }
2507
2508 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2509         _cleanup_free_ char *v = NULL;
2510         int r;
2511
2512         assert(u);
2513         assert(ret);
2514
2515         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2516                 return -ENODATA;
2517
2518         if (!u->cgroup_path)
2519                 return -ENODATA;
2520
2521         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2522         if (unit_has_root_cgroup(u))
2523                 return procfs_tasks_get_current(ret);
2524
2525         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2526                 return -ENODATA;
2527
2528         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2529         if (r == -ENOENT)
2530                 return -ENODATA;
2531         if (r < 0)
2532                 return r;
2533
2534         return safe_atou64(v, ret);
2535 }
2536
2537 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2538         _cleanup_free_ char *v = NULL;
2539         uint64_t ns;
2540         int r;
2541
2542         assert(u);
2543         assert(ret);
2544
2545         if (!u->cgroup_path)
2546                 return -ENODATA;
2547
2548         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2549         if (unit_has_root_cgroup(u))
2550                 return procfs_cpu_get_usage(ret);
2551
2552         r = cg_all_unified();
2553         if (r < 0)
2554                 return r;
2555         if (r > 0) {
2556                 _cleanup_free_ char *val = NULL;
2557                 uint64_t us;
2558
2559                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2560                         return -ENODATA;
2561
2562                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2563                 if (r < 0)
2564                         return r;
2565                 if (IN_SET(r, -ENOENT, -ENXIO))
2566                         return -ENODATA;
2567
2568                 r = safe_atou64(val, &us);
2569                 if (r < 0)
2570                         return r;
2571
2572                 ns = us * NSEC_PER_USEC;
2573         } else {
2574                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2575                         return -ENODATA;
2576
2577                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2578                 if (r == -ENOENT)
2579                         return -ENODATA;
2580                 if (r < 0)
2581                         return r;
2582
2583                 r = safe_atou64(v, &ns);
2584                 if (r < 0)
2585                         return r;
2586         }
2587
2588         *ret = ns;
2589         return 0;
2590 }
2591
2592 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2593         nsec_t ns;
2594         int r;
2595
2596         assert(u);
2597
2598         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2599          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2600          * call this function with a NULL return value. */
2601
2602         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2603                 return -ENODATA;
2604
2605         r = unit_get_cpu_usage_raw(u, &ns);
2606         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2607                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2608                  * cached value. */
2609
2610                 if (ret)
2611                         *ret = u->cpu_usage_last;
2612                 return 0;
2613         }
2614         if (r < 0)
2615                 return r;
2616
2617         if (ns > u->cpu_usage_base)
2618                 ns -= u->cpu_usage_base;
2619         else
2620                 ns = 0;
2621
2622         u->cpu_usage_last = ns;
2623         if (ret)
2624                 *ret = ns;
2625
2626         return 0;
2627 }
2628
2629 int unit_get_ip_accounting(
2630                 Unit *u,
2631                 CGroupIPAccountingMetric metric,
2632                 uint64_t *ret) {
2633
2634         uint64_t value;
2635         int fd, r;
2636
2637         assert(u);
2638         assert(metric >= 0);
2639         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2640         assert(ret);
2641
2642         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2643                 return -ENODATA;
2644
2645         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2646                 u->ip_accounting_ingress_map_fd :
2647                 u->ip_accounting_egress_map_fd;
2648         if (fd < 0)
2649                 return -ENODATA;
2650
2651         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2652                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2653         else
2654                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2655         if (r < 0)
2656                 return r;
2657
2658         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2659          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2660          * ip_accounting_extra[] field, and add them in here transparently. */
2661
2662         *ret = value + u->ip_accounting_extra[metric];
2663
2664         return r;
2665 }
2666
2667 int unit_reset_cpu_accounting(Unit *u) {
2668         nsec_t ns;
2669         int r;
2670
2671         assert(u);
2672
2673         u->cpu_usage_last = NSEC_INFINITY;
2674
2675         r = unit_get_cpu_usage_raw(u, &ns);
2676         if (r < 0) {
2677                 u->cpu_usage_base = 0;
2678                 return r;
2679         }
2680
2681         u->cpu_usage_base = ns;
2682         return 0;
2683 }
2684
2685 int unit_reset_ip_accounting(Unit *u) {
2686         int r = 0, q = 0;
2687
2688         assert(u);
2689
2690         if (u->ip_accounting_ingress_map_fd >= 0)
2691                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2692
2693         if (u->ip_accounting_egress_map_fd >= 0)
2694                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2695
2696         zero(u->ip_accounting_extra);
2697
2698         return r < 0 ? r : q;
2699 }
2700
2701 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2702         assert(u);
2703
2704         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2705                 return;
2706
2707         if (m == 0)
2708                 return;
2709
2710         /* always invalidate compat pairs together */
2711         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2712                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2713
2714         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2715                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2716
2717         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2718                 return;
2719
2720         u->cgroup_realized_mask &= ~m;
2721         unit_add_to_cgroup_realize_queue(u);
2722 }
2723
2724 void unit_invalidate_cgroup_bpf(Unit *u) {
2725         assert(u);
2726
2727         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2728                 return;
2729
2730         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2731                 return;
2732
2733         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2734         unit_add_to_cgroup_realize_queue(u);
2735
2736         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2737          * list of our children includes our own. */
2738         if (u->type == UNIT_SLICE) {
2739                 Unit *member;
2740                 Iterator i;
2741                 void *v;
2742
2743                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2744                         if (member == u)
2745                                 continue;
2746
2747                         if (UNIT_DEREF(member->slice) != u)
2748                                 continue;
2749
2750                         unit_invalidate_cgroup_bpf(member);
2751                 }
2752         }
2753 }
2754
2755 bool unit_cgroup_delegate(Unit *u) {
2756         CGroupContext *c;
2757
2758         assert(u);
2759
2760         if (!UNIT_VTABLE(u)->can_delegate)
2761                 return false;
2762
2763         c = unit_get_cgroup_context(u);
2764         if (!c)
2765                 return false;
2766
2767         return c->delegate;
2768 }
2769
2770 void manager_invalidate_startup_units(Manager *m) {
2771         Iterator i;
2772         Unit *u;
2773
2774         assert(m);
2775
2776         SET_FOREACH(u, m->startup_units, i)
2777                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2778 }
2779
2780 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2781         [CGROUP_AUTO] = "auto",
2782         [CGROUP_CLOSED] = "closed",
2783         [CGROUP_STRICT] = "strict",
2784 };
2785
2786 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2787 #endif // 0