src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <fnmatch.h>
  23
  24 #include "alloc-util.h"
  25 //#include "blockdev-util.h"
  26 //#include "bpf-firewall.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "process-util.h"
  35 //#include "special.h"
  36 #include "stdio-util.h"
  37 #include "string-table.h"
  38 #include "string-util.h"
  39
  40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  41
  42 bool unit_has_root_cgroup(Unit *u) {
  43         assert(u);
  44
  45         /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
  46          * as inside of containers the root slice won't be identical to the root cgroup. */
  47
  48         if (!u->cgroup_path)
  49                 return false;
  50
  51         return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
  52 }
  53
  54 #if 0 /// UNNEEDED by elogind
  55 static void cgroup_compat_warn(void) {
  56         static bool cgroup_compat_warned = false;
  57
  58         if (cgroup_compat_warned)
  59                 return;
  60
  61         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  62         cgroup_compat_warned = true;
  63 }
  64
  65 #define log_cgroup_compat(unit, fmt, ...) do {                                  \
  66                 cgroup_compat_warn();                                           \
  67                 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
  68         } while (false)
  69
  70 void cgroup_context_init(CGroupContext *c) {
  71         assert(c);
  72
  73         /* Initialize everything to the kernel defaults, assuming the
  74          * structure is preinitialized to 0 */
  75
  76         c->cpu_weight = CGROUP_WEIGHT_INVALID;
  77         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
  78         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  79
  80         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  81         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  82
  83         c->memory_high = CGROUP_LIMIT_MAX;
  84         c->memory_max = CGROUP_LIMIT_MAX;
  85         c->memory_swap_max = CGROUP_LIMIT_MAX;
  86
  87         c->memory_limit = CGROUP_LIMIT_MAX;
  88
  89         c->io_weight = CGROUP_WEIGHT_INVALID;
  90         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  91
  92         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  93         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  94
  95         c->tasks_max = (uint64_t) -1;
  96 }
  97
  98 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  99         assert(c);
 100         assert(a);
 101
 102         LIST_REMOVE(device_allow, c->device_allow, a);
 103         free(a->path);
 104         free(a);
 105 }
 106
 107 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 108         assert(c);
 109         assert(w);
 110
 111         LIST_REMOVE(device_weights, c->io_device_weights, w);
 112         free(w->path);
 113         free(w);
 114 }
 115
 116 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 117         assert(c);
 118         assert(l);
 119
 120         LIST_REMOVE(device_limits, c->io_device_limits, l);
 121         free(l->path);
 122         free(l);
 123 }
 124
 125 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 126         assert(c);
 127         assert(w);
 128
 129         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 130         free(w->path);
 131         free(w);
 132 }
 133
 134 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 135         assert(c);
 136         assert(b);
 137
 138         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 139         free(b->path);
 140         free(b);
 141 }
 142
 143 void cgroup_context_done(CGroupContext *c) {
 144         assert(c);
 145
 146         while (c->io_device_weights)
 147                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 148
 149         while (c->io_device_limits)
 150                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 151
 152         while (c->blockio_device_weights)
 153                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 154
 155         while (c->blockio_device_bandwidths)
 156                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 157
 158         while (c->device_allow)
 159                 cgroup_context_free_device_allow(c, c->device_allow);
 160
 161         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
 162         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
 163 }
 164
 165 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 166         CGroupIODeviceLimit *il;
 167         CGroupIODeviceWeight *iw;
 168         CGroupBlockIODeviceBandwidth *b;
 169         CGroupBlockIODeviceWeight *w;
 170         CGroupDeviceAllow *a;
 171         IPAddressAccessItem *iaai;
 172         char u[FORMAT_TIMESPAN_MAX];
 173
 174         assert(c);
 175         assert(f);
 176
 177         prefix = strempty(prefix);
 178
 179         fprintf(f,
 180                 "%sCPUAccounting=%s\n"
 181                 "%sIOAccounting=%s\n"
 182                 "%sBlockIOAccounting=%s\n"
 183                 "%sMemoryAccounting=%s\n"
 184                 "%sTasksAccounting=%s\n"
 185                 "%sIPAccounting=%s\n"
 186                 "%sCPUWeight=%" PRIu64 "\n"
 187                 "%sStartupCPUWeight=%" PRIu64 "\n"
 188                 "%sCPUShares=%" PRIu64 "\n"
 189                 "%sStartupCPUShares=%" PRIu64 "\n"
 190                 "%sCPUQuotaPerSecSec=%s\n"
 191                 "%sIOWeight=%" PRIu64 "\n"
 192                 "%sStartupIOWeight=%" PRIu64 "\n"
 193                 "%sBlockIOWeight=%" PRIu64 "\n"
 194                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 195                 "%sMemoryLow=%" PRIu64 "\n"
 196                 "%sMemoryHigh=%" PRIu64 "\n"
 197                 "%sMemoryMax=%" PRIu64 "\n"
 198                 "%sMemorySwapMax=%" PRIu64 "\n"
 199                 "%sMemoryLimit=%" PRIu64 "\n"
 200                 "%sTasksMax=%" PRIu64 "\n"
 201                 "%sDevicePolicy=%s\n"
 202                 "%sDelegate=%s\n",
 203                 prefix, yes_no(c->cpu_accounting),
 204                 prefix, yes_no(c->io_accounting),
 205                 prefix, yes_no(c->blockio_accounting),
 206                 prefix, yes_no(c->memory_accounting),
 207                 prefix, yes_no(c->tasks_accounting),
 208                 prefix, yes_no(c->ip_accounting),
 209                 prefix, c->cpu_weight,
 210                 prefix, c->startup_cpu_weight,
 211                 prefix, c->cpu_shares,
 212                 prefix, c->startup_cpu_shares,
 213                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 214                 prefix, c->io_weight,
 215                 prefix, c->startup_io_weight,
 216                 prefix, c->blockio_weight,
 217                 prefix, c->startup_blockio_weight,
 218                 prefix, c->memory_low,
 219                 prefix, c->memory_high,
 220                 prefix, c->memory_max,
 221                 prefix, c->memory_swap_max,
 222                 prefix, c->memory_limit,
 223                 prefix, c->tasks_max,
 224                 prefix, cgroup_device_policy_to_string(c->device_policy),
 225                 prefix, yes_no(c->delegate));
 226
 227         if (c->delegate) {
 228                 _cleanup_free_ char *t = NULL;
 229
 230                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 231
 232                 fprintf(f, "%sDelegateControllers=%s\n",
 233                         prefix,
 234                         strempty(t));
 235         }
 236
 237         LIST_FOREACH(device_allow, a, c->device_allow)
 238                 fprintf(f,
 239                         "%sDeviceAllow=%s %s%s%s\n",
 240                         prefix,
 241                         a->path,
 242                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 243
 244         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 245                 fprintf(f,
 246                         "%sIODeviceWeight=%s %" PRIu64,
 247                         prefix,
 248                         iw->path,
 249                         iw->weight);
 250
 251         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 252                 char buf[FORMAT_BYTES_MAX];
 253                 CGroupIOLimitType type;
 254
 255                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 256                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 257                                 fprintf(f,
 258                                         "%s%s=%s %s\n",
 259                                         prefix,
 260                                         cgroup_io_limit_type_to_string(type),
 261                                         il->path,
 262                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 263         }
 264
 265         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 266                 fprintf(f,
 267                         "%sBlockIODeviceWeight=%s %" PRIu64,
 268                         prefix,
 269                         w->path,
 270                         w->weight);
 271
 272         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 273                 char buf[FORMAT_BYTES_MAX];
 274
 275                 if (b->rbps != CGROUP_LIMIT_MAX)
 276                         fprintf(f,
 277                                 "%sBlockIOReadBandwidth=%s %s\n",
 278                                 prefix,
 279                                 b->path,
 280                                 format_bytes(buf, sizeof(buf), b->rbps));
 281                 if (b->wbps != CGROUP_LIMIT_MAX)
 282                         fprintf(f,
 283                                 "%sBlockIOWriteBandwidth=%s %s\n",
 284                                 prefix,
 285                                 b->path,
 286                                 format_bytes(buf, sizeof(buf), b->wbps));
 287         }
 288
 289         LIST_FOREACH(items, iaai, c->ip_address_allow) {
 290                 _cleanup_free_ char *k = NULL;
 291
 292                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 293                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 294         }
 295
 296         LIST_FOREACH(items, iaai, c->ip_address_deny) {
 297                 _cleanup_free_ char *k = NULL;
 298
 299                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
 300                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
 301         }
 302 }
 303
 304 static int lookup_block_device(const char *p, dev_t *dev) {
 305         struct stat st;
 306         int r;
 307
 308         assert(p);
 309         assert(dev);
 310
 311         r = stat(p, &st);
 312         if (r < 0)
 313                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 314
 315         if (S_ISBLK(st.st_mode))
 316                 *dev = st.st_rdev;
 317         else if (major(st.st_dev) != 0) {
 318                 /* If this is not a device node then find the block
 319                  * device this file is stored on */
 320                 *dev = st.st_dev;
 321
 322                 /* If this is a partition, try to get the originating
 323                  * block device */
 324                 (void) block_get_whole_disk(*dev, dev);
 325         } else {
 326                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 327                 return -ENODEV;
 328         }
 329
 330         return 0;
 331 }
 332
 333 static int whitelist_device(const char *path, const char *node, const char *acc) {
 334         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 335         struct stat st;
 336         bool ignore_notfound;
 337         int r;
 338
 339         assert(path);
 340         assert(acc);
 341
 342         if (node[0] == '-') {
 343                 /* Non-existent paths starting with "-" must be silently ignored */
 344                 node++;
 345                 ignore_notfound = true;
 346         } else
 347                 ignore_notfound = false;
 348
 349         if (stat(node, &st) < 0) {
 350                 if (errno == ENOENT && ignore_notfound)
 351                         return 0;
 352
 353                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
 354         }
 355
 356         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 357                 log_warning("%s is not a device.", node);
 358                 return -ENODEV;
 359         }
 360
 361         sprintf(buf,
 362                 "%c %u:%u %s",
 363                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 364                 major(st.st_rdev), minor(st.st_rdev),
 365                 acc);
 366
 367         r = cg_set_attribute("devices", path, "devices.allow", buf);
 368         if (r < 0)
 369                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 370                                "Failed to set devices.allow on %s: %m", path);
 371
 372         return r;
 373 }
 374
 375 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 376         _cleanup_fclose_ FILE *f = NULL;
 377         char line[LINE_MAX];
 378         bool good = false;
 379         int r;
 380
 381         assert(path);
 382         assert(acc);
 383         assert(IN_SET(type, 'b', 'c'));
 384
 385         f = fopen("/proc/devices", "re");
 386         if (!f)
 387                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 388
 389         FOREACH_LINE(line, f, goto fail) {
 390                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 391                 unsigned maj;
 392
 393                 truncate_nl(line);
 394
 395                 if (type == 'c' && streq(line, "Character devices:")) {
 396                         good = true;
 397                         continue;
 398                 }
 399
 400                 if (type == 'b' && streq(line, "Block devices:")) {
 401                         good = true;
 402                         continue;
 403                 }
 404
 405                 if (isempty(line)) {
 406                         good = false;
 407                         continue;
 408                 }
 409
 410                 if (!good)
 411                         continue;
 412
 413                 p = strstrip(line);
 414
 415                 w = strpbrk(p, WHITESPACE);
 416                 if (!w)
 417                         continue;
 418                 *w = 0;
 419
 420                 r = safe_atou(p, &maj);
 421                 if (r < 0)
 422                         continue;
 423                 if (maj <= 0)
 424                         continue;
 425
 426                 w++;
 427                 w += strspn(w, WHITESPACE);
 428
 429                 if (fnmatch(name, w, 0) != 0)
 430                         continue;
 431
 432                 sprintf(buf,
 433                         "%c %u:* %s",
 434                         type,
 435                         maj,
 436                         acc);
 437
 438                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 439                 if (r < 0)
 440                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 441                                        "Failed to set devices.allow on %s: %m", path);
 442         }
 443
 444         return 0;
 445
 446 fail:
 447         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
 448 }
 449
 450 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
 451         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
 452                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
 453 }
 454
 455 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
 456         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 457                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
 458 }
 459
 460 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
 461         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 462             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
 463                 return c->startup_cpu_weight;
 464         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
 465                 return c->cpu_weight;
 466         else
 467                 return CGROUP_WEIGHT_DEFAULT;
 468 }
 469
 470 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
 471         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 472             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
 473                 return c->startup_cpu_shares;
 474         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
 475                 return c->cpu_shares;
 476         else
 477                 return CGROUP_CPU_SHARES_DEFAULT;
 478 }
 479
 480 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
 481         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
 482         int r;
 483
 484         xsprintf(buf, "%" PRIu64 "\n", weight);
 485         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
 486         if (r < 0)
 487                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 488                               "Failed to set cpu.weight: %m");
 489
 490         if (quota != USEC_INFINITY)
 491                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
 492                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
 493         else
 494                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 495
 496         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
 497
 498         if (r < 0)
 499                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 500                               "Failed to set cpu.max: %m");
 501 }
 502
 503 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
 504         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 505         int r;
 506
 507         xsprintf(buf, "%" PRIu64 "\n", shares);
 508         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
 509         if (r < 0)
 510                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 511                               "Failed to set cpu.shares: %m");
 512
 513         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 514         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
 515         if (r < 0)
 516                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 517                               "Failed to set cpu.cfs_period_us: %m");
 518
 519         if (quota != USEC_INFINITY) {
 520                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 521                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
 522         } else
 523                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
 524         if (r < 0)
 525                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 526                               "Failed to set cpu.cfs_quota_us: %m");
 527 }
 528
 529 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
 530         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
 531                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 532 }
 533
 534 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
 535         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 536                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
 537 }
 538
 539 static bool cgroup_context_has_io_config(CGroupContext *c) {
 540         return c->io_accounting ||
 541                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 542                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 543                 c->io_device_weights ||
 544                 c->io_device_limits;
 545 }
 546
 547 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 548         return c->blockio_accounting ||
 549                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 550                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 551                 c->blockio_device_weights ||
 552                 c->blockio_device_bandwidths;
 553 }
 554
 555 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 556         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 557             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 558                 return c->startup_io_weight;
 559         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 560                 return c->io_weight;
 561         else
 562                 return CGROUP_WEIGHT_DEFAULT;
 563 }
 564
 565 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 566         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 567             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 568                 return c->startup_blockio_weight;
 569         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 570                 return c->blockio_weight;
 571         else
 572                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 573 }
 574
 575 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 576         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 577                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 578 }
 579
 580 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 581         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 582                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 583 }
 584
 585 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 586         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 587         dev_t dev;
 588         int r;
 589
 590         r = lookup_block_device(dev_path, &dev);
 591         if (r < 0)
 592                 return;
 593
 594         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 595         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 596         if (r < 0)
 597                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 598                               "Failed to set io.weight: %m");
 599 }
 600
 601 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 602         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 603         dev_t dev;
 604         int r;
 605
 606         r = lookup_block_device(dev_path, &dev);
 607         if (r < 0)
 608                 return;
 609
 610         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 611         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 612         if (r < 0)
 613                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 614                               "Failed to set blkio.weight_device: %m");
 615 }
 616
 617 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 618         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 619         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 620         CGroupIOLimitType type;
 621         dev_t dev;
 622         unsigned n = 0;
 623         int r;
 624
 625         r = lookup_block_device(dev_path, &dev);
 626         if (r < 0)
 627                 return 0;
 628
 629         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 630                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 631                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 632                         n++;
 633                 } else {
 634                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 635                 }
 636         }
 637
 638         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 639                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 640                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 641         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 642         if (r < 0)
 643                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 644                               "Failed to set io.max: %m");
 645         return n;
 646 }
 647
 648 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 649         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 650         dev_t dev;
 651         unsigned n = 0;
 652         int r;
 653
 654         r = lookup_block_device(dev_path, &dev);
 655         if (r < 0)
 656                 return 0;
 657
 658         if (rbps != CGROUP_LIMIT_MAX)
 659                 n++;
 660         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 661         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 662         if (r < 0)
 663                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 664                               "Failed to set blkio.throttle.read_bps_device: %m");
 665
 666         if (wbps != CGROUP_LIMIT_MAX)
 667                 n++;
 668         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 669         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 670         if (r < 0)
 671                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 672                               "Failed to set blkio.throttle.write_bps_device: %m");
 673
 674         return n;
 675 }
 676
 677 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 678         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
 679 }
 680
 681 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 682         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 683         int r;
 684
 685         if (v != CGROUP_LIMIT_MAX)
 686                 xsprintf(buf, "%" PRIu64 "\n", v);
 687
 688         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 689         if (r < 0)
 690                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 691                               "Failed to set %s: %m", file);
 692 }
 693
 694 static void cgroup_apply_firewall(Unit *u) {
 695         int r;
 696
 697         assert(u);
 698
 699         if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
 700                                     * not recursive we don't ever touch the bpf on them */
 701                 return;
 702
 703         r = bpf_firewall_compile(u);
 704         if (r < 0)
 705                 return;
 706
 707         (void) bpf_firewall_install(u);
 708         return;
 709 }
 710
 711 static void cgroup_context_apply(
 712                 Unit *u,
 713                 CGroupMask apply_mask,
 714                 bool apply_bpf,
 715                 ManagerState state) {
 716
 717         const char *path;
 718         CGroupContext *c;
 719         bool is_root;
 720         int r;
 721
 722         assert(u);
 723
 724         /* Nothing to do? Exit early! */
 725         if (apply_mask == 0 && !apply_bpf)
 726                 return;
 727
 728         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
 729         is_root = unit_has_root_cgroup(u);
 730
 731         assert_se(c = unit_get_cgroup_context(u));
 732         assert_se(path = u->cgroup_path);
 733
 734         if (is_root) /* Make sure we don't try to display messages with an empty path. */
 735                 path = "/";
 736
 737         /* We generally ignore errors caused by read-only mounted
 738          * cgroup trees (assuming we are running in a container then),
 739          * and missing cgroups, i.e. EROFS and ENOENT. */
 740
 741         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
 742                 bool has_weight, has_shares;
 743
 744                 has_weight = cgroup_context_has_cpu_weight(c);
 745                 has_shares = cgroup_context_has_cpu_shares(c);
 746
 747                 if (cg_all_unified() > 0) {
 748                         uint64_t weight;
 749
 750                         if (has_weight)
 751                                 weight = cgroup_context_cpu_weight(c, state);
 752                         else if (has_shares) {
 753                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
 754
 755                                 weight = cgroup_cpu_shares_to_weight(shares);
 756
 757                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
 758                                                   shares, weight, path);
 759                         } else
 760                                 weight = CGROUP_WEIGHT_DEFAULT;
 761
 762                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
 763                 } else {
 764                         uint64_t shares;
 765
 766                         if (has_weight) {
 767                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
 768
 769                                 shares = cgroup_cpu_weight_to_shares(weight);
 770
 771                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
 772                                                   weight, shares, path);
 773                         } else if (has_shares)
 774                                 shares = cgroup_context_cpu_shares(c, state);
 775                         else
 776                                 shares = CGROUP_CPU_SHARES_DEFAULT;
 777
 778                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
 779                 }
 780         }
 781
 782         if (apply_mask & CGROUP_MASK_IO) {
 783                 bool has_io = cgroup_context_has_io_config(c);
 784                 bool has_blockio = cgroup_context_has_blockio_config(c);
 785
 786                 if (!is_root) {
 787                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 788                         uint64_t weight;
 789
 790                         if (has_io)
 791                                 weight = cgroup_context_io_weight(c, state);
 792                         else if (has_blockio) {
 793                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 794
 795                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 796
 797                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 798                                                   blkio_weight, weight);
 799                         } else
 800                                 weight = CGROUP_WEIGHT_DEFAULT;
 801
 802                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 803                         r = cg_set_attribute("io", path, "io.weight", buf);
 804                         if (r < 0)
 805                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 806                                               "Failed to set io.weight: %m");
 807
 808                         if (has_io) {
 809                                 CGroupIODeviceWeight *w;
 810
 811                                 /* FIXME: no way to reset this list */
 812                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 813                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 814                         } else if (has_blockio) {
 815                                 CGroupBlockIODeviceWeight *w;
 816
 817                                 /* FIXME: no way to reset this list */
 818                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 819                                         weight = cgroup_weight_blkio_to_io(w->weight);
 820
 821                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 822                                                           w->weight, weight, w->path);
 823
 824                                         cgroup_apply_io_device_weight(u, w->path, weight);
 825                                 }
 826                         }
 827                 }
 828
 829                 /* Apply limits and free ones without config. */
 830                 if (has_io) {
 831                         CGroupIODeviceLimit *l, *next;
 832
 833                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 834                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 835                                         cgroup_context_free_io_device_limit(c, l);
 836                         }
 837                 } else if (has_blockio) {
 838                         CGroupBlockIODeviceBandwidth *b, *next;
 839
 840                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 841                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 842                                 CGroupIOLimitType type;
 843
 844                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 845                                         limits[type] = cgroup_io_limit_defaults[type];
 846
 847                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 848                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 849
 850                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 851                                                   b->rbps, b->wbps, b->path);
 852
 853                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 854                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 855                         }
 856                 }
 857         }
 858
 859         if (apply_mask & CGROUP_MASK_BLKIO) {
 860                 bool has_io = cgroup_context_has_io_config(c);
 861                 bool has_blockio = cgroup_context_has_blockio_config(c);
 862
 863                 if (!is_root) {
 864                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 865                         uint64_t weight;
 866
 867                         if (has_io) {
 868                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 869
 870                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 871
 872                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 873                                                   io_weight, weight);
 874                         } else if (has_blockio)
 875                                 weight = cgroup_context_blkio_weight(c, state);
 876                         else
 877                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 878
 879                         xsprintf(buf, "%" PRIu64 "\n", weight);
 880                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 881                         if (r < 0)
 882                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 883                                               "Failed to set blkio.weight: %m");
 884
 885                         if (has_io) {
 886                                 CGroupIODeviceWeight *w;
 887
 888                                 /* FIXME: no way to reset this list */
 889                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 890                                         weight = cgroup_weight_io_to_blkio(w->weight);
 891
 892                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 893                                                           w->weight, weight, w->path);
 894
 895                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 896                                 }
 897                         } else if (has_blockio) {
 898                                 CGroupBlockIODeviceWeight *w;
 899
 900                                 /* FIXME: no way to reset this list */
 901                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 902                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 903                         }
 904                 }
 905
 906                 /* Apply limits and free ones without config. */
 907                 if (has_io) {
 908                         CGroupIODeviceLimit *l, *next;
 909
 910                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 911                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 912                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 913
 914                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 915                                         cgroup_context_free_io_device_limit(c, l);
 916                         }
 917                 } else if (has_blockio) {
 918                         CGroupBlockIODeviceBandwidth *b, *next;
 919
 920                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
 921                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 922                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 923                 }
 924         }
 925
 926         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
 927                 if (cg_all_unified() > 0) {
 928                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 929
 930                         if (cgroup_context_has_unified_memory_config(c)) {
 931                                 max = c->memory_max;
 932                                 swap_max = c->memory_swap_max;
 933                         } else {
 934                                 max = c->memory_limit;
 935
 936                                 if (max != CGROUP_LIMIT_MAX)
 937                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 938                         }
 939
 940                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 941                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 942                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 943                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
 944                 } else {
 945                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 946                         uint64_t val;
 947
 948                         if (cgroup_context_has_unified_memory_config(c)) {
 949                                 val = c->memory_max;
 950                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
 951                         } else
 952                                 val = c->memory_limit;
 953
 954                         if (val == CGROUP_LIMIT_MAX)
 955                                 strncpy(buf, "-1\n", sizeof(buf));
 956                         else
 957                                 xsprintf(buf, "%" PRIu64 "\n", val);
 958
 959                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 960                         if (r < 0)
 961                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 962                                               "Failed to set memory.limit_in_bytes: %m");
 963                 }
 964         }
 965
 966         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
 967                 CGroupDeviceAllow *a;
 968
 969                 /* Changing the devices list of a populated cgroup
 970                  * might result in EINVAL, hence ignore EINVAL
 971                  * here. */
 972
 973                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 974                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 975                 else
 976                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 977                 if (r < 0)
 978                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 979                                       "Failed to reset devices.list: %m");
 980
 981                 if (c->device_policy == CGROUP_CLOSED ||
 982                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 983                         static const char auto_devices[] =
 984                                 "/dev/null\0" "rwm\0"
 985                                 "/dev/zero\0" "rwm\0"
 986                                 "/dev/full\0" "rwm\0"
 987                                 "/dev/random\0" "rwm\0"
 988                                 "/dev/urandom\0" "rwm\0"
 989                                 "/dev/tty\0" "rwm\0"
 990                                 "/dev/ptmx\0" "rwm\0"
 991                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 992                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
 993                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
 994
 995                         const char *x, *y;
 996
 997                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 998                                 whitelist_device(path, x, y);
 999
1000                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1001                         whitelist_major(path, "pts", 'c', "rw");
1002                 }
1003
1004                 LIST_FOREACH(device_allow, a, c->device_allow) {
1005                         char acc[4], *val;
1006                         unsigned k = 0;
1007
1008                         if (a->r)
1009                                 acc[k++] = 'r';
1010                         if (a->w)
1011                                 acc[k++] = 'w';
1012                         if (a->m)
1013                                 acc[k++] = 'm';
1014
1015                         if (k == 0)
1016                                 continue;
1017
1018                         acc[k++] = 0;
1019
1020                         if (path_startswith(a->path, "/dev/"))
1021                                 whitelist_device(path, a->path, acc);
1022                         else if ((val = startswith(a->path, "block-")))
1023                                 whitelist_major(path, val, 'b', acc);
1024                         else if ((val = startswith(a->path, "char-")))
1025                                 whitelist_major(path, val, 'c', acc);
1026                         else
1027                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1028                 }
1029         }
1030
1031         if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1032
1033                 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1034                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1035
1036                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1037                         r = cg_set_attribute("pids", path, "pids.max", buf);
1038                 } else
1039                         r = cg_set_attribute("pids", path, "pids.max", "max");
1040
1041                 if (r < 0)
1042                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1043                                       "Failed to set pids.max: %m");
1044         }
1045
1046         if (apply_bpf)
1047                 cgroup_apply_firewall(u);
1048 }
1049
1050 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1051         CGroupMask mask = 0;
1052
1053         /* Figure out which controllers we need */
1054
1055         if (c->cpu_accounting ||
1056             cgroup_context_has_cpu_weight(c) ||
1057             cgroup_context_has_cpu_shares(c) ||
1058             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1059                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1060
1061         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1062                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1063
1064         if (c->memory_accounting ||
1065             c->memory_limit != CGROUP_LIMIT_MAX ||
1066             cgroup_context_has_unified_memory_config(c))
1067                 mask |= CGROUP_MASK_MEMORY;
1068
1069         if (c->device_allow ||
1070             c->device_policy != CGROUP_AUTO)
1071                 mask |= CGROUP_MASK_DEVICES;
1072
1073         if (c->tasks_accounting ||
1074             c->tasks_max != CGROUP_LIMIT_MAX)
1075                 mask |= CGROUP_MASK_PIDS;
1076
1077         return mask;
1078 }
1079
1080 CGroupMask unit_get_own_mask(Unit *u) {
1081         CGroupContext *c;
1082
1083         /* Returns the mask of controllers the unit needs for itself */
1084
1085         c = unit_get_cgroup_context(u);
1086         if (!c)
1087                 return 0;
1088
1089         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1090 }
1091
1092 CGroupMask unit_get_delegate_mask(Unit *u) {
1093         CGroupContext *c;
1094
1095         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1096          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1097          *
1098          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1099
1100         if (u->type == UNIT_SLICE)
1101                 return 0;
1102
1103         c = unit_get_cgroup_context(u);
1104         if (!c)
1105                 return 0;
1106
1107         if (!c->delegate)
1108                 return 0;
1109
1110         if (cg_all_unified() <= 0) {
1111                 ExecContext *e;
1112
1113                 e = unit_get_exec_context(u);
1114                 if (e && !exec_context_maintains_privileges(e))
1115                         return 0;
1116         }
1117
1118         return c->delegate_controllers;
1119 }
1120
1121 CGroupMask unit_get_members_mask(Unit *u) {
1122         assert(u);
1123
1124         /* Returns the mask of controllers all of the unit's children require, merged */
1125
1126         if (u->cgroup_members_mask_valid)
1127                 return u->cgroup_members_mask;
1128
1129         u->cgroup_members_mask = 0;
1130
1131         if (u->type == UNIT_SLICE) {
1132                 void *v;
1133                 Unit *member;
1134                 Iterator i;
1135
1136                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1137
1138                         if (member == u)
1139                                 continue;
1140
1141                         if (UNIT_DEREF(member->slice) != u)
1142                                 continue;
1143
1144                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1145                 }
1146         }
1147
1148         u->cgroup_members_mask_valid = true;
1149         return u->cgroup_members_mask;
1150 }
1151
1152 CGroupMask unit_get_siblings_mask(Unit *u) {
1153         assert(u);
1154
1155         /* Returns the mask of controllers all of the unit's siblings
1156          * require, i.e. the members mask of the unit's parent slice
1157          * if there is one. */
1158
1159         if (UNIT_ISSET(u->slice))
1160                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1161
1162         return unit_get_subtree_mask(u); /* we are the top-level slice */
1163 }
1164
1165 CGroupMask unit_get_subtree_mask(Unit *u) {
1166
1167         /* Returns the mask of this subtree, meaning of the group
1168          * itself and its children. */
1169
1170         return unit_get_own_mask(u) | unit_get_members_mask(u);
1171 }
1172
1173 CGroupMask unit_get_target_mask(Unit *u) {
1174         CGroupMask mask;
1175
1176         /* This returns the cgroup mask of all controllers to enable
1177          * for a specific cgroup, i.e. everything it needs itself,
1178          * plus all that its children need, plus all that its siblings
1179          * need. This is primarily useful on the legacy cgroup
1180          * hierarchy, where we need to duplicate each cgroup in each
1181          * hierarchy that shall be enabled for it. */
1182
1183         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1184         mask &= u->manager->cgroup_supported;
1185
1186         return mask;
1187 }
1188
1189 CGroupMask unit_get_enable_mask(Unit *u) {
1190         CGroupMask mask;
1191
1192         /* This returns the cgroup mask of all controllers to enable
1193          * for the children of a specific cgroup. This is primarily
1194          * useful for the unified cgroup hierarchy, where each cgroup
1195          * controls which controllers are enabled for its children. */
1196
1197         mask = unit_get_members_mask(u);
1198         mask &= u->manager->cgroup_supported;
1199
1200         return mask;
1201 }
1202
1203 bool unit_get_needs_bpf(Unit *u) {
1204         CGroupContext *c;
1205         Unit *p;
1206         assert(u);
1207
1208         /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1209          * moment. */
1210         if (u->type == UNIT_SLICE)
1211                 return false;
1212
1213         c = unit_get_cgroup_context(u);
1214         if (!c)
1215                 return false;
1216
1217         if (c->ip_accounting ||
1218             c->ip_address_allow ||
1219             c->ip_address_deny)
1220                 return true;
1221
1222         /* If any parent slice has an IP access list defined, it applies too */
1223         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1224                 c = unit_get_cgroup_context(p);
1225                 if (!c)
1226                         return false;
1227
1228                 if (c->ip_address_allow ||
1229                     c->ip_address_deny)
1230                         return true;
1231         }
1232
1233         return false;
1234 }
1235
1236 /* Recurse from a unit up through its containing slices, propagating
1237  * mask bits upward. A unit is also member of itself. */
1238 void unit_update_cgroup_members_masks(Unit *u) {
1239         CGroupMask m;
1240         bool more;
1241
1242         assert(u);
1243
1244         /* Calculate subtree mask */
1245         m = unit_get_subtree_mask(u);
1246
1247         /* See if anything changed from the previous invocation. If
1248          * not, we're done. */
1249         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1250                 return;
1251
1252         more =
1253                 u->cgroup_subtree_mask_valid &&
1254                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1255                 ((~m & u->cgroup_subtree_mask) == 0);
1256
1257         u->cgroup_subtree_mask = m;
1258         u->cgroup_subtree_mask_valid = true;
1259
1260         if (UNIT_ISSET(u->slice)) {
1261                 Unit *s = UNIT_DEREF(u->slice);
1262
1263                 if (more)
1264                         /* There's more set now than before. We
1265                          * propagate the new mask to the parent's mask
1266                          * (not caring if it actually was valid or
1267                          * not). */
1268
1269                         s->cgroup_members_mask |= m;
1270
1271                 else
1272                         /* There's less set now than before (or we
1273                          * don't know), we need to recalculate
1274                          * everything, so let's invalidate the
1275                          * parent's members mask */
1276
1277                         s->cgroup_members_mask_valid = false;
1278
1279                 /* And now make sure that this change also hits our
1280                  * grandparents */
1281                 unit_update_cgroup_members_masks(s);
1282         }
1283 }
1284
1285 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1286         Unit *u = userdata;
1287
1288         assert(mask != 0);
1289         assert(u);
1290
1291         while (u) {
1292                 if (u->cgroup_path &&
1293                     u->cgroup_realized &&
1294                     (u->cgroup_realized_mask & mask) == mask)
1295                         return u->cgroup_path;
1296
1297                 u = UNIT_DEREF(u->slice);
1298         }
1299
1300         return NULL;
1301 }
1302
1303 char *unit_default_cgroup_path(Unit *u) {
1304         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1305         int r;
1306
1307         assert(u);
1308
1309         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1310                 return strdup(u->manager->cgroup_root);
1311
1312         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1313                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1314                 if (r < 0)
1315                         return NULL;
1316         }
1317
1318         escaped = cg_escape(u->id);
1319         if (!escaped)
1320                 return NULL;
1321
1322         if (slice)
1323                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1324                                escaped);
1325         else
1326                 return strjoin(u->manager->cgroup_root, "/", escaped);
1327 }
1328
1329 int unit_set_cgroup_path(Unit *u, const char *path) {
1330         _cleanup_free_ char *p = NULL;
1331         int r;
1332
1333         assert(u);
1334
1335         if (path) {
1336                 p = strdup(path);
1337                 if (!p)
1338                         return -ENOMEM;
1339         } else
1340                 p = NULL;
1341
1342         if (streq_ptr(u->cgroup_path, p))
1343                 return 0;
1344
1345         if (p) {
1346                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1347                 if (r < 0)
1348                         return r;
1349         }
1350
1351         unit_release_cgroup(u);
1352
1353         u->cgroup_path = p;
1354         p = NULL;
1355
1356         return 1;
1357 }
1358
1359 int unit_watch_cgroup(Unit *u) {
1360         _cleanup_free_ char *events = NULL;
1361         int r;
1362
1363         assert(u);
1364
1365         if (!u->cgroup_path)
1366                 return 0;
1367
1368         if (u->cgroup_inotify_wd >= 0)
1369                 return 0;
1370
1371         /* Only applies to the unified hierarchy */
1372         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1373         if (r < 0)
1374                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1375         if (r == 0)
1376                 return 0;
1377
1378         /* Don't watch the root slice, it's pointless. */
1379         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1380                 return 0;
1381
1382         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1383         if (r < 0)
1384                 return log_oom();
1385
1386         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1387         if (r < 0)
1388                 return log_oom();
1389
1390         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1391         if (u->cgroup_inotify_wd < 0) {
1392
1393                 if (errno == ENOENT) /* If the directory is already
1394                                       * gone we don't need to track
1395                                       * it, so this is not an error */
1396                         return 0;
1397
1398                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1399         }
1400
1401         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1402         if (r < 0)
1403                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1404
1405         return 0;
1406 }
1407
1408 int unit_pick_cgroup_path(Unit *u) {
1409         _cleanup_free_ char *path = NULL;
1410         int r;
1411
1412         assert(u);
1413
1414         if (u->cgroup_path)
1415                 return 0;
1416
1417         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1418                 return -EINVAL;
1419
1420         path = unit_default_cgroup_path(u);
1421         if (!path)
1422                 return log_oom();
1423
1424         r = unit_set_cgroup_path(u, path);
1425         if (r == -EEXIST)
1426                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1427         if (r < 0)
1428                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1429
1430         return 0;
1431 }
1432
1433 static int unit_create_cgroup(
1434                 Unit *u,
1435                 CGroupMask target_mask,
1436                 CGroupMask enable_mask,
1437                 bool needs_bpf) {
1438
1439         CGroupContext *c;
1440         int r;
1441
1442         assert(u);
1443
1444         c = unit_get_cgroup_context(u);
1445         if (!c)
1446                 return 0;
1447
1448         /* Figure out our cgroup path */
1449         r = unit_pick_cgroup_path(u);
1450         if (r < 0)
1451                 return r;
1452
1453         /* First, create our own group */
1454         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1455         if (r < 0)
1456                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1457
1458         /* Start watching it */
1459         (void) unit_watch_cgroup(u);
1460
1461         /* Enable all controllers we need */
1462         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1463         if (r < 0)
1464                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1465
1466         /* Keep track that this is now realized */
1467         u->cgroup_realized = true;
1468         u->cgroup_realized_mask = target_mask;
1469         u->cgroup_enabled_mask = enable_mask;
1470         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1471
1472         if (u->type != UNIT_SLICE && !c->delegate) {
1473
1474                 /* Then, possibly move things over, but not if
1475                  * subgroups may contain processes, which is the case
1476                  * for slice and delegation units. */
1477                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1478                 if (r < 0)
1479                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1480         }
1481
1482         return 0;
1483 }
1484
1485 int unit_attach_pids_to_cgroup(Unit *u) {
1486         int r;
1487         assert(u);
1488
1489         r = unit_realize_cgroup(u);
1490         if (r < 0)
1491                 return r;
1492
1493         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1494         if (r < 0)
1495                 return r;
1496
1497         return 0;
1498 }
1499
1500 static void cgroup_xattr_apply(Unit *u) {
1501         char ids[SD_ID128_STRING_MAX];
1502         int r;
1503
1504         assert(u);
1505
1506         if (!MANAGER_IS_SYSTEM(u->manager))
1507                 return;
1508
1509         if (sd_id128_is_null(u->invocation_id))
1510                 return;
1511
1512         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1513                          "trusted.invocation_id",
1514                          sd_id128_to_string(u->invocation_id, ids), 32,
1515                          0);
1516         if (r < 0)
1517                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1518 }
1519
1520 static bool unit_has_mask_realized(
1521                 Unit *u,
1522                 CGroupMask target_mask,
1523                 CGroupMask enable_mask,
1524                 bool needs_bpf) {
1525
1526         assert(u);
1527
1528         return u->cgroup_realized &&
1529                 u->cgroup_realized_mask == target_mask &&
1530                 u->cgroup_enabled_mask == enable_mask &&
1531                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1532                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1533 }
1534
1535 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1536         assert(u);
1537
1538         if (u->in_cgroup_realize_queue)
1539                 return;
1540
1541         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1542         u->in_cgroup_realize_queue = true;
1543 }
1544
1545 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1546         assert(u);
1547
1548         if (!u->in_cgroup_realize_queue)
1549                 return;
1550
1551         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1552         u->in_cgroup_realize_queue = false;
1553 }
1554
1555
1556 /* Check if necessary controllers and attributes for a unit are in place.
1557  *
1558  * If so, do nothing.
1559  * If not, create paths, move processes over, and set attributes.
1560  *
1561  * Returns 0 on success and < 0 on failure. */
1562 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1563         CGroupMask target_mask, enable_mask;
1564         bool needs_bpf, apply_bpf;
1565         int r;
1566
1567         assert(u);
1568
1569         unit_remove_from_cgroup_realize_queue(u);
1570
1571         target_mask = unit_get_target_mask(u);
1572         enable_mask = unit_get_enable_mask(u);
1573         needs_bpf = unit_get_needs_bpf(u);
1574
1575         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1576                 return 0;
1577
1578         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1579          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1580          * this will trickle down properly to cgroupfs. */
1581         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1582
1583         /* First, realize parents */
1584         if (UNIT_ISSET(u->slice)) {
1585                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1586                 if (r < 0)
1587                         return r;
1588         }
1589
1590         /* And then do the real work */
1591         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1592         if (r < 0)
1593                 return r;
1594
1595         /* Finally, apply the necessary attributes. */
1596         cgroup_context_apply(u, target_mask, apply_bpf, state);
1597         cgroup_xattr_apply(u);
1598
1599         return 0;
1600 }
1601
1602 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1603         ManagerState state;
1604         unsigned n = 0;
1605         Unit *i;
1606         int r;
1607
1608         assert(m);
1609
1610         state = manager_state(m);
1611
1612         while ((i = m->cgroup_realize_queue)) {
1613                 assert(i->in_cgroup_realize_queue);
1614
1615                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1616                         /* Maybe things changed, and the unit is not actually active anymore? */
1617                         unit_remove_from_cgroup_realize_queue(i);
1618                         continue;
1619                 }
1620
1621                 r = unit_realize_cgroup_now(i, state);
1622                 if (r < 0)
1623                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1624
1625                 n++;
1626         }
1627
1628         return n;
1629 }
1630
1631 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1632         Unit *slice;
1633
1634         /* This adds the siblings of the specified unit and the
1635          * siblings of all parent units to the cgroup queue. (But
1636          * neither the specified unit itself nor the parents.) */
1637
1638         while ((slice = UNIT_DEREF(u->slice))) {
1639                 Iterator i;
1640                 Unit *m;
1641                 void *v;
1642
1643                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1644                         if (m == u)
1645                                 continue;
1646
1647                         /* Skip units that have a dependency on the slice
1648                          * but aren't actually in it. */
1649                         if (UNIT_DEREF(m->slice) != slice)
1650                                 continue;
1651
1652                         /* No point in doing cgroup application for units
1653                          * without active processes. */
1654                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1655                                 continue;
1656
1657                         /* If the unit doesn't need any new controllers
1658                          * and has current ones realized, it doesn't need
1659                          * any changes. */
1660                         if (unit_has_mask_realized(m,
1661                                                    unit_get_target_mask(m),
1662                                                    unit_get_enable_mask(m),
1663                                                    unit_get_needs_bpf(m)))
1664                                 continue;
1665
1666                         unit_add_to_cgroup_realize_queue(m);
1667                 }
1668
1669                 u = slice;
1670         }
1671 }
1672
1673 int unit_realize_cgroup(Unit *u) {
1674         assert(u);
1675
1676         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1677                 return 0;
1678
1679         /* So, here's the deal: when realizing the cgroups for this
1680          * unit, we need to first create all parents, but there's more
1681          * actually: for the weight-based controllers we also need to
1682          * make sure that all our siblings (i.e. units that are in the
1683          * same slice as we are) have cgroups, too. Otherwise, things
1684          * would become very uneven as each of their processes would
1685          * get as much resources as all our group together. This call
1686          * will synchronously create the parent cgroups, but will
1687          * defer work on the siblings to the next event loop
1688          * iteration. */
1689
1690         /* Add all sibling slices to the cgroup queue. */
1691         unit_add_siblings_to_cgroup_realize_queue(u);
1692
1693         /* And realize this one now (and apply the values) */
1694         return unit_realize_cgroup_now(u, manager_state(u->manager));
1695 }
1696
1697 void unit_release_cgroup(Unit *u) {
1698         assert(u);
1699
1700         /* Forgets all cgroup details for this cgroup */
1701
1702         if (u->cgroup_path) {
1703                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1704                 u->cgroup_path = mfree(u->cgroup_path);
1705         }
1706
1707         if (u->cgroup_inotify_wd >= 0) {
1708                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1709                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1710
1711                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1712                 u->cgroup_inotify_wd = -1;
1713         }
1714 }
1715
1716 void unit_prune_cgroup(Unit *u) {
1717         int r;
1718         bool is_root_slice;
1719
1720         assert(u);
1721
1722         /* Removes the cgroup, if empty and possible, and stops watching it. */
1723
1724         if (!u->cgroup_path)
1725                 return;
1726
1727         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1728
1729         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1730
1731         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1732         if (r < 0) {
1733                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1734                 return;
1735         }
1736
1737         if (is_root_slice)
1738                 return;
1739
1740         unit_release_cgroup(u);
1741
1742         u->cgroup_realized = false;
1743         u->cgroup_realized_mask = 0;
1744         u->cgroup_enabled_mask = 0;
1745 }
1746
1747 int unit_search_main_pid(Unit *u, pid_t *ret) {
1748         _cleanup_fclose_ FILE *f = NULL;
1749         pid_t pid = 0, npid, mypid;
1750         int r;
1751
1752         assert(u);
1753         assert(ret);
1754
1755         if (!u->cgroup_path)
1756                 return -ENXIO;
1757
1758         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1759         if (r < 0)
1760                 return r;
1761
1762         mypid = getpid_cached();
1763         while (cg_read_pid(f, &npid) > 0)  {
1764                 pid_t ppid;
1765
1766                 if (npid == pid)
1767                         continue;
1768
1769                 /* Ignore processes that aren't our kids */
1770                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1771                         continue;
1772
1773                 if (pid != 0)
1774                         /* Dang, there's more than one daemonized PID
1775                         in this group, so we don't know what process
1776                         is the main process. */
1777
1778                         return -ENODATA;
1779
1780                 pid = npid;
1781         }
1782
1783         *ret = pid;
1784         return 0;
1785 }
1786
1787 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1788         _cleanup_closedir_ DIR *d = NULL;
1789         _cleanup_fclose_ FILE *f = NULL;
1790         int ret = 0, r;
1791
1792         assert(u);
1793         assert(path);
1794
1795         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1796         if (r < 0)
1797                 ret = r;
1798         else {
1799                 pid_t pid;
1800
1801                 while ((r = cg_read_pid(f, &pid)) > 0) {
1802                         r = unit_watch_pid(u, pid);
1803                         if (r < 0 && ret >= 0)
1804                                 ret = r;
1805                 }
1806
1807                 if (r < 0 && ret >= 0)
1808                         ret = r;
1809         }
1810
1811         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1812         if (r < 0) {
1813                 if (ret >= 0)
1814                         ret = r;
1815         } else {
1816                 char *fn;
1817
1818                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1819                         _cleanup_free_ char *p = NULL;
1820
1821                         p = strjoin(path, "/", fn);
1822                         free(fn);
1823
1824                         if (!p)
1825                                 return -ENOMEM;
1826
1827                         r = unit_watch_pids_in_path(u, p);
1828                         if (r < 0 && ret >= 0)
1829                                 ret = r;
1830                 }
1831
1832                 if (r < 0 && ret >= 0)
1833                         ret = r;
1834         }
1835
1836         return ret;
1837 }
1838
1839 int unit_watch_all_pids(Unit *u) {
1840         int r;
1841
1842         assert(u);
1843
1844         /* Adds all PIDs from our cgroup to the set of PIDs we
1845          * watch. This is a fallback logic for cases where we do not
1846          * get reliable cgroup empty notifications: we try to use
1847          * SIGCHLD as replacement. */
1848
1849         if (!u->cgroup_path)
1850                 return -ENOENT;
1851
1852         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1853         if (r < 0)
1854                 return r;
1855         if (r > 0) /* On unified we can use proper notifications */
1856                 return 0;
1857
1858         return unit_watch_pids_in_path(u, u->cgroup_path);
1859 }
1860
1861 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1862         Manager *m = userdata;
1863         Unit *u;
1864         int r;
1865
1866         assert(s);
1867         assert(m);
1868
1869         u = m->cgroup_empty_queue;
1870         if (!u)
1871                 return 0;
1872
1873         assert(u->in_cgroup_empty_queue);
1874         u->in_cgroup_empty_queue = false;
1875         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1876
1877         if (m->cgroup_empty_queue) {
1878                 /* More stuff queued, let's make sure we remain enabled */
1879                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1880                 if (r < 0)
1881                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1882         }
1883
1884         unit_add_to_gc_queue(u);
1885
1886         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1887                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1888
1889         return 0;
1890 }
1891
1892 void unit_add_to_cgroup_empty_queue(Unit *u) {
1893         int r;
1894
1895         assert(u);
1896
1897         /* Note that there are four different ways how cgroup empty events reach us:
1898          *
1899          * 1. On the unified hierarchy we get an inotify event on the cgroup
1900          *
1901          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1902          *
1903          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1904          *
1905          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1906          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1907          *
1908          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1909          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1910          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1911          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1912          * case for scope units). */
1913
1914         if (u->in_cgroup_empty_queue)
1915                 return;
1916
1917         /* Let's verify that the cgroup is really empty */
1918         if (!u->cgroup_path)
1919                 return;
1920         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1921         if (r < 0) {
1922                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1923                 return;
1924         }
1925         if (r == 0)
1926                 return;
1927
1928         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1929         u->in_cgroup_empty_queue = true;
1930
1931         /* Trigger the defer event */
1932         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1933         if (r < 0)
1934                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1935 }
1936
1937 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1938         Manager *m = userdata;
1939
1940         assert(s);
1941         assert(fd >= 0);
1942         assert(m);
1943
1944         for (;;) {
1945                 union inotify_event_buffer buffer;
1946                 struct inotify_event *e;
1947                 ssize_t l;
1948
1949                 l = read(fd, &buffer, sizeof(buffer));
1950                 if (l < 0) {
1951                         if (IN_SET(errno, EINTR, EAGAIN))
1952                                 return 0;
1953
1954                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1955                 }
1956
1957                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1958                         Unit *u;
1959
1960                         if (e->wd < 0)
1961                                 /* Queue overflow has no watch descriptor */
1962                                 continue;
1963
1964                         if (e->mask & IN_IGNORED)
1965                                 /* The watch was just removed */
1966                                 continue;
1967
1968                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1969                         if (!u) /* Not that inotify might deliver
1970                                  * events for a watch even after it
1971                                  * was removed, because it was queued
1972                                  * before the removal. Let's ignore
1973                                  * this here safely. */
1974                                 continue;
1975
1976                         unit_add_to_cgroup_empty_queue(u);
1977                 }
1978         }
1979 }
1980 #endif // 0
1981
/* Sets up the cgroup state of the manager.
 *
 * In the elogind build (i.e. with the "#if 0" regions disabled) this:
 *   - stores the cgroup path of PID 1 in m->cgroup_root, with trailing slashes removed,
 *   - logs which kind of hierarchy (unified, hybrid, legacy) is in use,
 *   - creates the control group selected as scope_path and attaches to it via cg_create_and_attach(),
 *   - opens the cgroup file system path and keeps the fd in m->pin_cgroupfs_fd,
 *   - sets memory.use_hierarchy when not fully unified and not in a test run,
 *   - stores the mask of supported controllers in m->cgroup_supported and logs it.
 *
 * Returns 0 on success, or a negative error as soon as any mandatory step fails. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
#if 0 /// UNNEEDED by elogind
        char *e;
#endif // 0

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
#if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
#else
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* Note that this logs the root as returned above, i.e. before trailing slashes are removed */
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        /* all_unified is kept around, it is consulted again in step 7 below */
        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

#if 0 /// elogind is not init, and does not install the agent here.
        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty. */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else if (r == 0)
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        /* Note:
         * This method is in core, and normally called by systemd
         * being init. As elogind is never init, we can not install
         * our agent here. We do so when mounting our cgroup file
         * system, so only if elogind is its own tiny controller.
         * Further, elogind is not meant to run in systemd init scope. */
        /* The group to create and attach to depends on who owns the hierarchy: the empty path for a system
         * manager, the root itself if that already is "/elogind", and an "elogind" group below the root in
         * all other cases. */
        if (MANAGER_IS_SYSTEM(m))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        /* This check covers the cg_create_and_attach() call of whichever branch above was compiled in */
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);

#if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
#endif // 0

        /* 6. And pin it, so that it cannot be unmounted */
        /* A previously held pin fd is closed first, the new one replaces it */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        /* Best effort only: the result of setting the attribute is deliberately ignored */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
2156
2157 void manager_shutdown_cgroup(Manager *m, bool delete) {
2158         assert(m);
2159
2160         /* We can't really delete the group, since we are in it. But
2161          * let's trim it. */
2162         if (delete && m->cgroup_root)
2163                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2164
2165 #if 0 /// elogind is not init
2166         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2167
2168         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2169
2170         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2171         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2172 #endif // 0
2173
2174         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2175
2176         m->cgroup_root = mfree(m->cgroup_root);
2177 }
2178
2179 #if 0 /// UNNEEDED by elogind
2180 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2181         char *p;
2182         Unit *u;
2183
2184         assert(m);
2185         assert(cgroup);
2186
2187         u = hashmap_get(m->cgroup_unit, cgroup);
2188         if (u)
2189                 return u;
2190
2191         p = strdupa(cgroup);
2192         for (;;) {
2193                 char *e;
2194
2195                 e = strrchr(p, '/');
2196                 if (!e || e == p)
2197                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2198
2199                 *e = 0;
2200
2201                 u = hashmap_get(m->cgroup_unit, p);
2202                 if (u)
2203                         return u;
2204         }
2205 }
2206
2207 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2208         _cleanup_free_ char *cgroup = NULL;
2209         int r;
2210
2211         assert(m);
2212
2213         if (pid <= 0)
2214                 return NULL;
2215
2216         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2217         if (r < 0)
2218                 return NULL;
2219
2220         return manager_get_unit_by_cgroup(m, cgroup);
2221 }
2222
2223 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2224         Unit *u;
2225
2226         assert(m);
2227
2228         if (pid <= 0)
2229                 return NULL;
2230
2231         if (pid == 1)
2232                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2233
2234         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2235         if (u)
2236                 return u;
2237
2238         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2239         if (u)
2240                 return u;
2241
2242         return manager_get_unit_by_pid_cgroup(m, pid);
2243 }
2244 #endif // 0
2245
2246 #if 0 /// elogind must substitute this with its own variant
2247 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2248         Unit *u;
2249
2250         assert(m);
2251         assert(cgroup);
2252
2253         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2254          * or from the --system instance */
2255
2256         log_debug("Got cgroup empty notification for: %s", cgroup);
2257
2258         u = manager_get_unit_by_cgroup(m, cgroup);
2259         if (!u)
2260                 return 0;
2261
2262         unit_add_to_cgroup_empty_queue(u);
2263         return 1;
2264 }
2265 #else
2266 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2267         Session *s;
2268
2269         assert(m);
2270         assert(cgroup);
2271
2272         log_debug("Got cgroup empty notification for: %s", cgroup);
2273
2274         s = hashmap_get(m->sessions, cgroup);
2275
2276         if (s) {
2277                 session_finalize(s);
2278                 session_free(s);
2279         } else
2280                 log_warning("Session not found: %s", cgroup);
2281
2282         return 0;
2283 }
2284 #endif // 0
2285 #if 0 /// UNNEEDED by elogind
2286 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2287         _cleanup_free_ char *v = NULL;
2288         int r;
2289
2290         assert(u);
2291         assert(ret);
2292
2293         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2294                 return -ENODATA;
2295
2296         if (!u->cgroup_path)
2297                 return -ENODATA;
2298
2299         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2300                 return -ENODATA;
2301
2302         r = cg_all_unified();
2303         if (r < 0)
2304                 return r;
2305         if (r > 0)
2306                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2307         else
2308                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2309         if (r == -ENOENT)
2310                 return -ENODATA;
2311         if (r < 0)
2312                 return r;
2313
2314         return safe_atou64(v, ret);
2315 }
2316
2317 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2318         _cleanup_free_ char *v = NULL;
2319         int r;
2320
2321         assert(u);
2322         assert(ret);
2323
2324         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2325                 return -ENODATA;
2326
2327         if (!u->cgroup_path)
2328                 return -ENODATA;
2329
2330         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2331                 return -ENODATA;
2332
2333         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2334         if (r == -ENOENT)
2335                 return -ENODATA;
2336         if (r < 0)
2337                 return r;
2338
2339         return safe_atou64(v, ret);
2340 }
2341
2342 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2343         _cleanup_free_ char *v = NULL;
2344         uint64_t ns;
2345         int r;
2346
2347         assert(u);
2348         assert(ret);
2349
2350         if (!u->cgroup_path)
2351                 return -ENODATA;
2352
2353         r = cg_all_unified();
2354         if (r < 0)
2355                 return r;
2356         if (r > 0) {
2357                 const char *keys[] = { "usage_usec", NULL };
2358                 _cleanup_free_ char *val = NULL;
2359                 uint64_t us;
2360
2361                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2362                         return -ENODATA;
2363
2364                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2365                 if (r < 0)
2366                         return r;
2367
2368                 r = safe_atou64(val, &us);
2369                 if (r < 0)
2370                         return r;
2371
2372                 ns = us * NSEC_PER_USEC;
2373         } else {
2374                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2375                         return -ENODATA;
2376
2377                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2378                 if (r == -ENOENT)
2379                         return -ENODATA;
2380                 if (r < 0)
2381                         return r;
2382
2383                 r = safe_atou64(v, &ns);
2384                 if (r < 0)
2385                         return r;
2386         }
2387
2388         *ret = ns;
2389         return 0;
2390 }
2391
2392 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2393         nsec_t ns;
2394         int r;
2395
2396         assert(u);
2397
2398         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2399          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2400          * call this function with a NULL return value. */
2401
2402         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2403                 return -ENODATA;
2404
2405         r = unit_get_cpu_usage_raw(u, &ns);
2406         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2407                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2408                  * cached value. */
2409
2410                 if (ret)
2411                         *ret = u->cpu_usage_last;
2412                 return 0;
2413         }
2414         if (r < 0)
2415                 return r;
2416
2417         if (ns > u->cpu_usage_base)
2418                 ns -= u->cpu_usage_base;
2419         else
2420                 ns = 0;
2421
2422         u->cpu_usage_last = ns;
2423         if (ret)
2424                 *ret = ns;
2425
2426         return 0;
2427 }
2428
2429 int unit_get_ip_accounting(
2430                 Unit *u,
2431                 CGroupIPAccountingMetric metric,
2432                 uint64_t *ret) {
2433
2434         uint64_t value;
2435         int fd, r;
2436
2437         assert(u);
2438         assert(metric >= 0);
2439         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2440         assert(ret);
2441
2442         /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2443          * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2444          * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2445          * filters. */
2446         if (u->type == UNIT_SLICE)
2447                 return -ENODATA;
2448
2449         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2450                 return -ENODATA;
2451
2452         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2453                 u->ip_accounting_ingress_map_fd :
2454                 u->ip_accounting_egress_map_fd;
2455         if (fd < 0)
2456                 return -ENODATA;
2457
2458         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2459                 r = bpf_firewall_read_accounting(fd, &value, NULL);
2460         else
2461                 r = bpf_firewall_read_accounting(fd, NULL, &value);
2462         if (r < 0)
2463                 return r;
2464
2465         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2466          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2467          * ip_accounting_extra[] field, and add them in here transparently. */
2468
2469         *ret = value + u->ip_accounting_extra[metric];
2470
2471         return r;
2472 }
2473
2474 int unit_reset_cpu_accounting(Unit *u) {
2475         nsec_t ns;
2476         int r;
2477
2478         assert(u);
2479
2480         u->cpu_usage_last = NSEC_INFINITY;
2481
2482         r = unit_get_cpu_usage_raw(u, &ns);
2483         if (r < 0) {
2484                 u->cpu_usage_base = 0;
2485                 return r;
2486         }
2487
2488         u->cpu_usage_base = ns;
2489         return 0;
2490 }
2491
2492 int unit_reset_ip_accounting(Unit *u) {
2493         int r = 0, q = 0;
2494
2495         assert(u);
2496
2497         if (u->ip_accounting_ingress_map_fd >= 0)
2498                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2499
2500         if (u->ip_accounting_egress_map_fd >= 0)
2501                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2502
2503         zero(u->ip_accounting_extra);
2504
2505         return r < 0 ? r : q;
2506 }
2507
2508 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2509         assert(u);
2510
2511         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2512                 return;
2513
2514         if (m == 0)
2515                 return;
2516
2517         /* always invalidate compat pairs together */
2518         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2519                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2520
2521         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2522                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2523
2524         if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2525                 return;
2526
2527         u->cgroup_realized_mask &= ~m;
2528         unit_add_to_cgroup_realize_queue(u);
2529 }
2530
2531 void unit_invalidate_cgroup_bpf(Unit *u) {
2532         assert(u);
2533
2534         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2535                 return;
2536
2537         if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2538                 return;
2539
2540         u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2541         unit_add_to_cgroup_realize_queue(u);
2542
2543         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2544          * list of our children includes our own. */
2545         if (u->type == UNIT_SLICE) {
2546                 Unit *member;
2547                 Iterator i;
2548                 void *v;
2549
2550                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2551                         if (member == u)
2552                                 continue;
2553
2554                         if (UNIT_DEREF(member->slice) != u)
2555                                 continue;
2556
2557                         unit_invalidate_cgroup_bpf(member);
2558                 }
2559         }
2560 }
2561
2562 void manager_invalidate_startup_units(Manager *m) {
2563         Iterator i;
2564         Unit *u;
2565
2566         assert(m);
2567
2568         SET_FOREACH(u, m->startup_units, i)
2569                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2570 }
2571
2572 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2573         [CGROUP_AUTO] = "auto",
2574         [CGROUP_CLOSED] = "closed",
2575         [CGROUP_STRICT] = "strict",
2576 };
2577
2578 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2579 #endif // 0