src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "alloc-util.h"
  26 #include "cgroup-util.h"
  27 #include "cgroup.h"
  28 #include "fd-util.h"
  29 #include "fileio.h"
  30 #include "fs-util.h"
  31 #include "parse-util.h"
  32 #include "path-util.h"
  33 #include "process-util.h"
  34 //#include "special.h"
  35 #include "string-table.h"
  36 #include "string-util.h"
  37
  38 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  39
  40 #if 0 /// UNNEEDED by elogind
  41 void cgroup_context_init(CGroupContext *c) {
  42         assert(c);
  43
  44         /* Initialize everything to the kernel defaults, assuming the
  45          * structure is preinitialized to 0 */
  46
  47         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  48         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  49         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  50
  51         c->memory_limit = (uint64_t) -1;
  52
  53         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  54         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  55
  56         c->tasks_max = (uint64_t) -1;
  57
  58         c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
  59 }
  60
  61 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  62         assert(c);
  63         assert(a);
  64
  65         LIST_REMOVE(device_allow, c->device_allow, a);
  66         free(a->path);
  67         free(a);
  68 }
  69
  70 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  71         assert(c);
  72         assert(w);
  73
  74         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  75         free(w->path);
  76         free(w);
  77 }
  78
  79 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  80         assert(c);
  81         assert(b);
  82
  83         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  84         free(b->path);
  85         free(b);
  86 }
  87
  88 void cgroup_context_done(CGroupContext *c) {
  89         assert(c);
  90
  91         while (c->blockio_device_weights)
  92                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  93
  94         while (c->blockio_device_bandwidths)
  95                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  96
  97         while (c->device_allow)
  98                 cgroup_context_free_device_allow(c, c->device_allow);
  99 }
 100
 101 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 102         CGroupBlockIODeviceBandwidth *b;
 103         CGroupBlockIODeviceWeight *w;
 104         CGroupDeviceAllow *a;
 105         char u[FORMAT_TIMESPAN_MAX];
 106
 107         assert(c);
 108         assert(f);
 109
 110         prefix = strempty(prefix);
 111
 112         fprintf(f,
 113                 "%sCPUAccounting=%s\n"
 114                 "%sBlockIOAccounting=%s\n"
 115                 "%sMemoryAccounting=%s\n"
 116                 "%sTasksAccounting=%s\n"
 117                 "%sCPUShares=%" PRIu64 "\n"
 118                 "%sStartupCPUShares=%" PRIu64 "\n"
 119                 "%sCPUQuotaPerSecSec=%s\n"
 120                 "%sBlockIOWeight=%" PRIu64 "\n"
 121                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 122                 "%sMemoryLimit=%" PRIu64 "\n"
 123                 "%sTasksMax=%" PRIu64 "\n"
 124                 "%sDevicePolicy=%s\n"
 125                 "%sDelegate=%s\n",
 126                 prefix, yes_no(c->cpu_accounting),
 127                 prefix, yes_no(c->blockio_accounting),
 128                 prefix, yes_no(c->memory_accounting),
 129                 prefix, yes_no(c->tasks_accounting),
 130                 prefix, c->cpu_shares,
 131                 prefix, c->startup_cpu_shares,
 132                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 133                 prefix, c->blockio_weight,
 134                 prefix, c->startup_blockio_weight,
 135                 prefix, c->memory_limit,
 136                 prefix, c->tasks_max,
 137                 prefix, cgroup_device_policy_to_string(c->device_policy),
 138                 prefix, yes_no(c->delegate));
 139
 140         LIST_FOREACH(device_allow, a, c->device_allow)
 141                 fprintf(f,
 142                         "%sDeviceAllow=%s %s%s%s\n",
 143                         prefix,
 144                         a->path,
 145                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 146
 147         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 148                 fprintf(f,
 149                         "%sBlockIODeviceWeight=%s %" PRIu64,
 150                         prefix,
 151                         w->path,
 152                         w->weight);
 153
 154         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 155                 char buf[FORMAT_BYTES_MAX];
 156
 157                 fprintf(f,
 158                         "%s%s=%s %s\n",
 159                         prefix,
 160                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 161                         b->path,
 162                         format_bytes(buf, sizeof(buf), b->bandwidth));
 163         }
 164 }
 165
 166 static int lookup_blkio_device(const char *p, dev_t *dev) {
 167         struct stat st;
 168         int r;
 169
 170         assert(p);
 171         assert(dev);
 172
 173         r = stat(p, &st);
 174         if (r < 0)
 175                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 176
 177         if (S_ISBLK(st.st_mode))
 178                 *dev = st.st_rdev;
 179         else if (major(st.st_dev) != 0) {
 180                 /* If this is not a device node then find the block
 181                  * device this file is stored on */
 182                 *dev = st.st_dev;
 183
 184                 /* If this is a partition, try to get the originating
 185                  * block device */
 186                 block_get_whole_disk(*dev, dev);
 187         } else {
 188                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 189                 return -ENODEV;
 190         }
 191
 192         return 0;
 193 }
 194
 195 static int whitelist_device(const char *path, const char *node, const char *acc) {
 196         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 197         struct stat st;
 198         int r;
 199
 200         assert(path);
 201         assert(acc);
 202
 203         if (stat(node, &st) < 0) {
 204                 log_warning("Couldn't stat device %s", node);
 205                 return -errno;
 206         }
 207
 208         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 209                 log_warning("%s is not a device.", node);
 210                 return -ENODEV;
 211         }
 212
 213         sprintf(buf,
 214                 "%c %u:%u %s",
 215                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 216                 major(st.st_rdev), minor(st.st_rdev),
 217                 acc);
 218
 219         r = cg_set_attribute("devices", path, "devices.allow", buf);
 220         if (r < 0)
 221                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 222                                "Failed to set devices.allow on %s: %m", path);
 223
 224         return r;
 225 }
 226
 227 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 228         _cleanup_fclose_ FILE *f = NULL;
 229         char line[LINE_MAX];
 230         bool good = false;
 231         int r;
 232
 233         assert(path);
 234         assert(acc);
 235         assert(type == 'b' || type == 'c');
 236
 237         f = fopen("/proc/devices", "re");
 238         if (!f)
 239                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 240
 241         FOREACH_LINE(line, f, goto fail) {
 242                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 243                 unsigned maj;
 244
 245                 truncate_nl(line);
 246
 247                 if (type == 'c' && streq(line, "Character devices:")) {
 248                         good = true;
 249                         continue;
 250                 }
 251
 252                 if (type == 'b' && streq(line, "Block devices:")) {
 253                         good = true;
 254                         continue;
 255                 }
 256
 257                 if (isempty(line)) {
 258                         good = false;
 259                         continue;
 260                 }
 261
 262                 if (!good)
 263                         continue;
 264
 265                 p = strstrip(line);
 266
 267                 w = strpbrk(p, WHITESPACE);
 268                 if (!w)
 269                         continue;
 270                 *w = 0;
 271
 272                 r = safe_atou(p, &maj);
 273                 if (r < 0)
 274                         continue;
 275                 if (maj <= 0)
 276                         continue;
 277
 278                 w++;
 279                 w += strspn(w, WHITESPACE);
 280
 281                 if (fnmatch(name, w, 0) != 0)
 282                         continue;
 283
 284                 sprintf(buf,
 285                         "%c %u:* %s",
 286                         type,
 287                         maj,
 288                         acc);
 289
 290                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 291                 if (r < 0)
 292                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 293                                        "Failed to set devices.allow on %s: %m", path);
 294         }
 295
 296         return 0;
 297
 298 fail:
 299         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 300         return -errno;
 301 }
 302
/* Writes the resource-control settings held in a CGroupContext to the
 * attribute files of the cgroup at 'path'.
 *
 *   c        - settings to apply
 *   mask     - controllers to touch; sections whose bit is not set are
 *              skipped, and a mask of 0 makes the whole call a no-op
 *   path     - cgroup path; an empty path or "/" is treated as the root
 *              cgroup
 *   netclass - value written to net_cls.classid
 *   state    - while MANAGER_STARTING or MANAGER_INITIALIZING the
 *              startup_* CPU shares / block IO weight are used if set
 *
 * Nothing is returned: a failed write is only logged and the remaining
 * attributes are still attempted. */
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* Write failures with ENOENT, EROFS or EACCES (and, for the
         * devices controller, EINVAL) are logged at debug level only;
         * anything else is logged as a warning. EROFS is what read-only
         * mounted cgroup trees produce (assuming we are running in a
         * container then) and ENOENT what missing cgroups produce. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                /* Precedence: startup shares (only during startup), then
                 * the regular shares, then the default. */
                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                /* The CFS period is always set to the fixed
                 * CGROUP_CPU_QUOTA_PERIOD_USEC. */
                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                /* The quota is configured per second of wall clock time;
                 * scale it down to the period set above. USEC_INFINITY
                 * means no quota, which is written as "-1". */
                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                /* Large enough for either a bare weight or a
                 * "major:minor value" line. */
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                /* Weights are skipped for the root cgroup; the bandwidth
                 * limits further down are applied there as well. */
                if (!is_root) {
                        /* Same precedence as for the CPU shares above */
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                /* Entries whose device cannot be resolved
                                 * are skipped; the lookup logs by itself. */
                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        /* Read and write limits go to separate attributes */
                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                /* The attribute name and the "no limit" spelling depend on
                 * cg_unified(): memory.limit_in_bytes / "-1" when it
                 * returns <= 0, memory.max / "max" otherwise. */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                /* Start from "deny all" if anything is restricted, and
                 * from "allow all" otherwise. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                /* With the "closed" policy, or the "auto" policy plus an
                 * explicit allow list, a fixed set of standard device
                 * nodes is whitelisted on top. */
                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* NUL-separated pairs of device node and access string */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the three flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        /* No access requested at all: nothing to whitelist */
                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* "/dev/..." names a device node, while "block-<pattern>"
                         * and "char-<pattern>" name a driver looked up in
                         * /proc/devices. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                /* (uint64_t) -1 means no limit, which is written as "max" */
                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        /* Unlike most sections above, this one is applied to the root
         * cgroup as well. */
        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}
 513
 514 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 515         CGroupMask mask = 0;
 516
 517         /* Figure out which controllers we need */
 518
 519         if (c->cpu_accounting ||
 520             c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 521             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 522             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 523                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 524
 525         if (c->blockio_accounting ||
 526             c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 527             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 528             c->blockio_device_weights ||
 529             c->blockio_device_bandwidths)
 530                 mask |= CGROUP_MASK_BLKIO;
 531
 532         if (c->memory_accounting ||
 533             c->memory_limit != (uint64_t) -1)
 534                 mask |= CGROUP_MASK_MEMORY;
 535
 536         if (c->device_allow ||
 537             c->device_policy != CGROUP_AUTO)
 538                 mask |= CGROUP_MASK_DEVICES;
 539
 540         if (c->tasks_accounting ||
 541             c->tasks_max != (uint64_t) -1)
 542                 mask |= CGROUP_MASK_PIDS;
 543
 544         if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
 545                 mask |= CGROUP_MASK_NET_CLS;
 546
 547         return mask;
 548 }
 549
 550 CGroupMask unit_get_own_mask(Unit *u) {
 551         CGroupContext *c;
 552
 553         /* Returns the mask of controllers the unit needs for itself */
 554
 555         c = unit_get_cgroup_context(u);
 556         if (!c)
 557                 return 0;
 558
 559         /* If delegation is turned on, then turn on all cgroups,
 560          * unless we are on the legacy hierarchy and the process we
 561          * fork into it is known to drop privileges, and hence
 562          * shouldn't get access to the controllers.
 563          *
 564          * Note that on the unified hierarchy it is safe to delegate
 565          * controllers to unprivileged services. */
 566
 567         if (c->delegate) {
 568                 ExecContext *e;
 569
 570                 e = unit_get_exec_context(u);
 571                 if (!e ||
 572                     exec_context_maintains_privileges(e) ||
 573                     cg_unified() > 0)
 574                         return _CGROUP_MASK_ALL;
 575         }
 576
 577         return cgroup_context_get_mask(c);
 578 }
 579
 580 CGroupMask unit_get_members_mask(Unit *u) {
 581         assert(u);
 582
 583         /* Returns the mask of controllers all of the unit's children
 584          * require, merged */
 585
 586         if (u->cgroup_members_mask_valid)
 587                 return u->cgroup_members_mask;
 588
 589         u->cgroup_members_mask = 0;
 590
 591         if (u->type == UNIT_SLICE) {
 592                 Unit *member;
 593                 Iterator i;
 594
 595                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 596
 597                         if (member == u)
 598                                 continue;
 599
 600                         if (UNIT_DEREF(member->slice) != u)
 601                                 continue;
 602
 603                         u->cgroup_members_mask |=
 604                                 unit_get_own_mask(member) |
 605                                 unit_get_members_mask(member);
 606                 }
 607         }
 608
 609         u->cgroup_members_mask_valid = true;
 610         return u->cgroup_members_mask;
 611 }
 612
 613 CGroupMask unit_get_siblings_mask(Unit *u) {
 614         assert(u);
 615
 616         /* Returns the mask of controllers all of the unit's siblings
 617          * require, i.e. the members mask of the unit's parent slice
 618          * if there is one. */
 619
 620         if (UNIT_ISSET(u->slice))
 621                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 622
 623         return unit_get_own_mask(u) | unit_get_members_mask(u);
 624 }
 625
 626 CGroupMask unit_get_subtree_mask(Unit *u) {
 627
 628         /* Returns the mask of this subtree, meaning of the group
 629          * itself and its children. */
 630
 631         return unit_get_own_mask(u) | unit_get_members_mask(u);
 632 }
 633
 634 CGroupMask unit_get_target_mask(Unit *u) {
 635         CGroupMask mask;
 636
 637         /* This returns the cgroup mask of all controllers to enable
 638          * for a specific cgroup, i.e. everything it needs itself,
 639          * plus all that its children need, plus all that its siblings
 640          * need. This is primarily useful on the legacy cgroup
 641          * hierarchy, where we need to duplicate each cgroup in each
 642          * hierarchy that shall be enabled for it. */
 643
 644         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 645         mask &= u->manager->cgroup_supported;
 646
 647         return mask;
 648 }
 649
 650 CGroupMask unit_get_enable_mask(Unit *u) {
 651         CGroupMask mask;
 652
 653         /* This returns the cgroup mask of all controllers to enable
 654          * for the children of a specific cgroup. This is primarily
 655          * useful for the unified cgroup hierarchy, where each cgroup
 656          * controls which controllers are enabled for its children. */
 657
 658         mask = unit_get_members_mask(u);
 659         mask &= u->manager->cgroup_supported;
 660
 661         return mask;
 662 }
 663
 664 /* Recurse from a unit up through its containing slices, propagating
 665  * mask bits upward. A unit is also member of itself. */
 666 void unit_update_cgroup_members_masks(Unit *u) {
 667         CGroupMask m;
 668         bool more;
 669
 670         assert(u);
 671
 672         /* Calculate subtree mask */
 673         m = unit_get_subtree_mask(u);
 674
 675         /* See if anything changed from the previous invocation. If
 676          * not, we're done. */
 677         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 678                 return;
 679
 680         more =
 681                 u->cgroup_subtree_mask_valid &&
 682                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 683                 ((~m & u->cgroup_subtree_mask) == 0);
 684
 685         u->cgroup_subtree_mask = m;
 686         u->cgroup_subtree_mask_valid = true;
 687
 688         if (UNIT_ISSET(u->slice)) {
 689                 Unit *s = UNIT_DEREF(u->slice);
 690
 691                 if (more)
 692                         /* There's more set now than before. We
 693                          * propagate the new mask to the parent's mask
 694                          * (not caring if it actually was valid or
 695                          * not). */
 696
 697                         s->cgroup_members_mask |= m;
 698
 699                 else
 700                         /* There's less set now than before (or we
 701                          * don't know), we need to recalculate
 702                          * everything, so let's invalidate the
 703                          * parent's members mask */
 704
 705                         s->cgroup_members_mask_valid = false;
 706
 707                 /* And now make sure that this change also hits our
 708                  * grandparents */
 709                 unit_update_cgroup_members_masks(s);
 710         }
 711 }
 712
 713 static const char *migrate_callback(CGroupMask mask, void *userdata) {
 714         Unit *u = userdata;
 715
 716         assert(mask != 0);
 717         assert(u);
 718
 719         while (u) {
 720                 if (u->cgroup_path &&
 721                     u->cgroup_realized &&
 722                     (u->cgroup_realized_mask & mask) == mask)
 723                         return u->cgroup_path;
 724
 725                 u = UNIT_DEREF(u->slice);
 726         }
 727
 728         return NULL;
 729 }
 730
 731 char *unit_default_cgroup_path(Unit *u) {
 732         _cleanup_free_ char *escaped = NULL, *slice = NULL;
 733         int r;
 734
 735         assert(u);
 736
 737         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 738                 return strdup(u->manager->cgroup_root);
 739
 740         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
 741                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
 742                 if (r < 0)
 743                         return NULL;
 744         }
 745
 746         escaped = cg_escape(u->id);
 747         if (!escaped)
 748                 return NULL;
 749
 750         if (slice)
 751                 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
 752         else
 753                 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
 754 }
 755
 756 int unit_set_cgroup_path(Unit *u, const char *path) {
 757         _cleanup_free_ char *p = NULL;
 758         int r;
 759
 760         assert(u);
 761
 762         if (path) {
 763                 p = strdup(path);
 764                 if (!p)
 765                         return -ENOMEM;
 766         } else
 767                 p = NULL;
 768
 769         if (streq_ptr(u->cgroup_path, p))
 770                 return 0;
 771
 772         if (p) {
 773                 r = hashmap_put(u->manager->cgroup_unit, p, u);
 774                 if (r < 0)
 775                         return r;
 776         }
 777
 778         unit_release_cgroup(u);
 779
 780         u->cgroup_path = p;
 781         p = NULL;
 782
 783         return 1;
 784 }
 785
 786 int unit_watch_cgroup(Unit *u) {
 787         _cleanup_free_ char *populated = NULL;
 788         int r;
 789
 790         assert(u);
 791
 792         if (!u->cgroup_path)
 793                 return 0;
 794
 795         if (u->cgroup_inotify_wd >= 0)
 796                 return 0;
 797
 798         /* Only applies to the unified hierarchy */
 799         r = cg_unified();
 800         if (r < 0)
 801                 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
 802         if (r == 0)
 803                 return 0;
 804
 805         /* Don't watch the root slice, it's pointless. */
 806         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 807                 return 0;
 808
 809         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
 810         if (r < 0)
 811                 return log_oom();
 812
 813         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
 814         if (r < 0)
 815                 return log_oom();
 816
 817         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
 818         if (u->cgroup_inotify_wd < 0) {
 819
 820                 if (errno == ENOENT) /* If the directory is already
 821                                       * gone we don't need to track
 822                                       * it, so this is not an error */
 823                         return 0;
 824
 825                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
 826         }
 827
 828         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
 829         if (r < 0)
 830                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
 831
 832         return 0;
 833 }
 834
 835 static int unit_create_cgroup(
 836                 Unit *u,
 837                 CGroupMask target_mask,
 838                 CGroupMask enable_mask) {
 839
 840         CGroupContext *c;
 841         int r;
 842
 843         assert(u);
 844
 845         c = unit_get_cgroup_context(u);
 846         if (!c)
 847                 return 0;
 848
 849         if (!u->cgroup_path) {
 850                 _cleanup_free_ char *path = NULL;
 851
 852                 path = unit_default_cgroup_path(u);
 853                 if (!path)
 854                         return log_oom();
 855
 856                 r = unit_set_cgroup_path(u, path);
 857                 if (r == -EEXIST)
 858                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
 859                 if (r < 0)
 860                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
 861         }
 862
 863         /* First, create our own group */
 864         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
 865         if (r < 0)
 866                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
 867
 868         /* Start watching it */
 869         (void) unit_watch_cgroup(u);
 870
 871         /* Enable all controllers we need */
 872         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
 873         if (r < 0)
 874                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
 875
 876         /* Keep track that this is now realized */
 877         u->cgroup_realized = true;
 878         u->cgroup_realized_mask = target_mask;
 879
 880         if (u->type != UNIT_SLICE && !c->delegate) {
 881
 882                 /* Then, possibly move things over, but not if
 883                  * subgroups may contain processes, which is the case
 884                  * for slice and delegation units. */
 885                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 886                 if (r < 0)
 887                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
 888         }
 889
 890         return 0;
 891 }
 892
 893 int unit_attach_pids_to_cgroup(Unit *u) {
 894         int r;
 895         assert(u);
 896
 897         r = unit_realize_cgroup(u);
 898         if (r < 0)
 899                 return r;
 900
 901         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
 902         if (r < 0)
 903                 return r;
 904
 905         return 0;
 906 }
 907
 908 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
 909         assert(u);
 910
 911         return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
 912 }
 913
 914 static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
 915
 916         uint32_t start, i;
 917         Manager *m;
 918
 919         assert(u);
 920
 921         m = u->manager;
 922
 923         i = start = m->cgroup_netclass_registry_last;
 924
 925         do {
 926                 i++;
 927
 928                 if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
 929                         m->cgroup_netclass_registry_last = i;
 930                         *ret = i;
 931                         return 0;
 932                 }
 933
 934                 if (i == UINT32_MAX)
 935                         i = CGROUP_NETCLASS_FIXED_MAX;
 936
 937         } while (i != start);
 938
 939         return -ENOBUFS;
 940 }
 941
 942 int unit_add_to_netclass_cgroup(Unit *u) {
 943
 944         CGroupContext *cc;
 945         Unit *first;
 946         void *key;
 947         int r;
 948
 949         assert(u);
 950
 951         cc = unit_get_cgroup_context(u);
 952         if (!cc)
 953                 return 0;
 954
 955         switch (cc->netclass_type) {
 956         case CGROUP_NETCLASS_TYPE_NONE:
 957                 return 0;
 958
 959         case CGROUP_NETCLASS_TYPE_FIXED:
 960                 u->cgroup_netclass_id = cc->netclass_id;
 961                 break;
 962
 963         case CGROUP_NETCLASS_TYPE_AUTO:
 964                 /* Allocate a new ID in case it was requested and not done yet */
 965                 if (u->cgroup_netclass_id == 0) {
 966                         r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
 967                         if (r < 0)
 968                                 return r;
 969
 970                         log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
 971                 }
 972
 973                 break;
 974         }
 975
 976         r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
 977         if (r < 0)
 978                 return r;
 979
 980         key = UINT32_TO_PTR(u->cgroup_netclass_id);
 981         first = hashmap_get(u->manager->cgroup_netclass_registry, key);
 982
 983         if (first) {
 984                 LIST_PREPEND(cgroup_netclass, first, u);
 985                 return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
 986         }
 987
 988         return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
 989 }
 990
 991 int unit_remove_from_netclass_cgroup(Unit *u) {
 992
 993         Unit *head;
 994         void *key;
 995
 996         assert(u);
 997
 998         key = UINT32_TO_PTR(u->cgroup_netclass_id);
 999
1000         LIST_FIND_HEAD(cgroup_netclass, u, head);
1001         LIST_REMOVE(cgroup_netclass, head, u);
1002
1003         if (head)
1004                 return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);
1005
1006         hashmap_remove(u->manager->cgroup_netclass_registry, key);
1007
1008         return 0;
1009 }
1010
1011 /* Check if necessary controllers and attributes for a unit are in place.
1012  *
1013  * If so, do nothing.
1014  * If not, create paths, move processes over, and set attributes.
1015  *
1016  * Returns 0 on success and < 0 on failure. */
1017 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1018         CGroupMask target_mask, enable_mask;
1019         int r;
1020
1021         assert(u);
1022
1023         if (u->in_cgroup_queue) {
1024                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
1025                 u->in_cgroup_queue = false;
1026         }
1027
1028         target_mask = unit_get_target_mask(u);
1029         if (unit_has_mask_realized(u, target_mask))
1030                 return 0;
1031
1032         /* First, realize parents */
1033         if (UNIT_ISSET(u->slice)) {
1034                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1035                 if (r < 0)
1036                         return r;
1037         }
1038
1039         /* And then do the real work */
1040         enable_mask = unit_get_enable_mask(u);
1041         r = unit_create_cgroup(u, target_mask, enable_mask);
1042         if (r < 0)
1043                 return r;
1044
1045         /* Finally, apply the necessary attributes. */
1046         cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);
1047
1048         return 0;
1049 }
1050
1051 static void unit_add_to_cgroup_queue(Unit *u) {
1052
1053         if (u->in_cgroup_queue)
1054                 return;
1055
1056         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1057         u->in_cgroup_queue = true;
1058 }
1059
1060 unsigned manager_dispatch_cgroup_queue(Manager *m) {
1061         ManagerState state;
1062         unsigned n = 0;
1063         Unit *i;
1064         int r;
1065
1066         state = manager_state(m);
1067
1068         while ((i = m->cgroup_queue)) {
1069                 assert(i->in_cgroup_queue);
1070
1071                 r = unit_realize_cgroup_now(i, state);
1072                 if (r < 0)
1073                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1074
1075                 n++;
1076         }
1077
1078         return n;
1079 }
1080
1081 static void unit_queue_siblings(Unit *u) {
1082         Unit *slice;
1083
1084         /* This adds the siblings of the specified unit and the
1085          * siblings of all parent units to the cgroup queue. (But
1086          * neither the specified unit itself nor the parents.) */
1087
1088         while ((slice = UNIT_DEREF(u->slice))) {
1089                 Iterator i;
1090                 Unit *m;
1091
1092                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1093                         if (m == u)
1094                                 continue;
1095
1096                         /* Skip units that have a dependency on the slice
1097                          * but aren't actually in it. */
1098                         if (UNIT_DEREF(m->slice) != slice)
1099                                 continue;
1100
1101                         /* No point in doing cgroup application for units
1102                          * without active processes. */
1103                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1104                                 continue;
1105
1106                         /* If the unit doesn't need any new controllers
1107                          * and has current ones realized, it doesn't need
1108                          * any changes. */
1109                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
1110                                 continue;
1111
1112                         unit_add_to_cgroup_queue(m);
1113                 }
1114
1115                 u = slice;
1116         }
1117 }
1118
1119 int unit_realize_cgroup(Unit *u) {
1120         assert(u);
1121
1122         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1123                 return 0;
1124
1125         /* So, here's the deal: when realizing the cgroups for this
1126          * unit, we need to first create all parents, but there's more
1127          * actually: for the weight-based controllers we also need to
1128          * make sure that all our siblings (i.e. units that are in the
1129          * same slice as we are) have cgroups, too. Otherwise, things
1130          * would become very uneven as each of their processes would
1131          * get as much resources as all our group together. This call
1132          * will synchronously create the parent cgroups, but will
1133          * defer work on the siblings to the next event loop
1134          * iteration. */
1135
1136         /* Add all sibling slices to the cgroup queue. */
1137         unit_queue_siblings(u);
1138
1139         /* And realize this one now (and apply the values) */
1140         return unit_realize_cgroup_now(u, manager_state(u->manager));
1141 }
1142
1143 void unit_release_cgroup(Unit *u) {
1144         assert(u);
1145
1146         /* Forgets all cgroup details for this cgroup */
1147
1148         if (u->cgroup_path) {
1149                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1150                 u->cgroup_path = mfree(u->cgroup_path);
1151         }
1152
1153         if (u->cgroup_inotify_wd >= 0) {
1154                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1155                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1156
1157                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1158                 u->cgroup_inotify_wd = -1;
1159         }
1160 }
1161
1162 void unit_prune_cgroup(Unit *u) {
1163         int r;
1164         bool is_root_slice;
1165
1166         assert(u);
1167
1168         /* Removes the cgroup, if empty and possible, and stops watching it. */
1169
1170         if (!u->cgroup_path)
1171                 return;
1172
1173         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1174
1175         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1176         if (r < 0) {
1177                 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1178                 return;
1179         }
1180
1181         if (is_root_slice)
1182                 return;
1183
1184         unit_release_cgroup(u);
1185
1186         u->cgroup_realized = false;
1187         u->cgroup_realized_mask = 0;
1188 }
1189
1190 int unit_search_main_pid(Unit *u, pid_t *ret) {
1191         _cleanup_fclose_ FILE *f = NULL;
1192         pid_t pid = 0, npid, mypid;
1193         int r;
1194
1195         assert(u);
1196         assert(ret);
1197
1198         if (!u->cgroup_path)
1199                 return -ENXIO;
1200
1201         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1202         if (r < 0)
1203                 return r;
1204
1205         mypid = getpid();
1206         while (cg_read_pid(f, &npid) > 0)  {
1207                 pid_t ppid;
1208
1209                 if (npid == pid)
1210                         continue;
1211
1212                 /* Ignore processes that aren't our kids */
1213                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1214                         continue;
1215
1216                 if (pid != 0)
1217                         /* Dang, there's more than one daemonized PID
1218                         in this group, so we don't know what process
1219                         is the main process. */
1220
1221                         return -ENODATA;
1222
1223                 pid = npid;
1224         }
1225
1226         *ret = pid;
1227         return 0;
1228 }
1229
1230 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1231         _cleanup_closedir_ DIR *d = NULL;
1232         _cleanup_fclose_ FILE *f = NULL;
1233         int ret = 0, r;
1234
1235         assert(u);
1236         assert(path);
1237
1238         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1239         if (r < 0)
1240                 ret = r;
1241         else {
1242                 pid_t pid;
1243
1244                 while ((r = cg_read_pid(f, &pid)) > 0) {
1245                         r = unit_watch_pid(u, pid);
1246                         if (r < 0 && ret >= 0)
1247                                 ret = r;
1248                 }
1249
1250                 if (r < 0 && ret >= 0)
1251                         ret = r;
1252         }
1253
1254         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1255         if (r < 0) {
1256                 if (ret >= 0)
1257                         ret = r;
1258         } else {
1259                 char *fn;
1260
1261                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1262                         _cleanup_free_ char *p = NULL;
1263
1264                         p = strjoin(path, "/", fn, NULL);
1265                         free(fn);
1266
1267                         if (!p)
1268                                 return -ENOMEM;
1269
1270                         r = unit_watch_pids_in_path(u, p);
1271                         if (r < 0 && ret >= 0)
1272                                 ret = r;
1273                 }
1274
1275                 if (r < 0 && ret >= 0)
1276                         ret = r;
1277         }
1278
1279         return ret;
1280 }
1281
1282 int unit_watch_all_pids(Unit *u) {
1283         assert(u);
1284
1285         /* Adds all PIDs from our cgroup to the set of PIDs we
1286          * watch. This is a fallback logic for cases where we do not
1287          * get reliable cgroup empty notifications: we try to use
1288          * SIGCHLD as replacement. */
1289
1290         if (!u->cgroup_path)
1291                 return -ENOENT;
1292
1293         if (cg_unified() > 0) /* On unified we can use proper notifications */
1294                 return 0;
1295
1296         return unit_watch_pids_in_path(u, u->cgroup_path);
1297 }
1298
1299 int unit_notify_cgroup_empty(Unit *u) {
1300         int r;
1301
1302         assert(u);
1303
1304         if (!u->cgroup_path)
1305                 return 0;
1306
1307         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1308         if (r <= 0)
1309                 return r;
1310
1311         unit_add_to_gc_queue(u);
1312
1313         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1314                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1315
1316         return 0;
1317 }
1318
1319 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1320         Manager *m = userdata;
1321
1322         assert(s);
1323         assert(fd >= 0);
1324         assert(m);
1325
1326         for (;;) {
1327                 union inotify_event_buffer buffer;
1328                 struct inotify_event *e;
1329                 ssize_t l;
1330
1331                 l = read(fd, &buffer, sizeof(buffer));
1332                 if (l < 0) {
1333                         if (errno == EINTR || errno == EAGAIN)
1334                                 return 0;
1335
1336                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1337                 }
1338
1339                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1340                         Unit *u;
1341
1342                         if (e->wd < 0)
1343                                 /* Queue overflow has no watch descriptor */
1344                                 continue;
1345
1346                         if (e->mask & IN_IGNORED)
1347                                 /* The watch was just removed */
1348                                 continue;
1349
1350                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1351                         if (!u) /* Not that inotify might deliver
1352                                  * events for a watch even after it
1353                                  * was removed, because it was queued
1354                                  * before the removal. Let's ignore
1355                                  * this here safely. */
1356                                 continue;
1357
1358                         (void) unit_notify_cgroup_empty(u);
1359                 }
1360         }
1361 }
1362 #endif // 0
1363
1364 int manager_setup_cgroup(Manager *m) {
1365         _cleanup_free_ char *path = NULL;
1366         CGroupController c;
1367         int r, unified;
1368         char *e;
1369
1370         assert(m);
1371
1372         /* 1. Determine hierarchy */
1373         m->cgroup_root = mfree(m->cgroup_root);
1374         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1375         if (r < 0)
1376                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1377
1378 #if 0 /// elogind does not support systemd scopes and slices
1379         /* Chop off the init scope, if we are already located in it */
1380         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1381
1382         /* LEGACY: Also chop off the system slice if we are in
1383          * it. This is to support live upgrades from older systemd
1384          * versions where PID 1 was moved there. Also see
1385          * cg_get_root_path(). */
1386         if (!e && m->running_as == MANAGER_SYSTEM) {
1387                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1388                 if (!e)
1389                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1390         }
1391         if (e)
1392                 *e = 0;
1393 #endif // 0
1394
1395         /* And make sure to store away the root value without trailing
1396          * slash, even for the root dir, so that we can easily prepend
1397          * it everywhere. */
1398         while ((e = endswith(m->cgroup_root, "/")))
1399                 *e = 0;
1400         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
1401                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
1402
1403         /* 2. Show data */
1404         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1405         if (r < 0)
1406                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1407
1408         unified = cg_unified();
1409         if (unified < 0)
1410                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1411         if (unified > 0)
1412                 log_debug("Unified cgroup hierarchy is located at %s.", path);
1413         else
1414                 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1415
1416         if (!m->test_run) {
1417                 const char *scope_path;
1418
1419                 /* 3. Install agent */
1420                 if (unified) {
1421
1422                         /* In the unified hierarchy we can can get
1423                          * cgroup empty notifications via inotify. */
1424
1425 #if 0 /// elogind does not support the unified hierarchy, yet.
1426                         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1427                         safe_close(m->cgroup_inotify_fd);
1428
1429                         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1430                         if (m->cgroup_inotify_fd < 0)
1431                                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1432
1433                         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1434                         if (r < 0)
1435                                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1436
1437                         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1438                         if (r < 0)
1439                                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1440
1441                         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1442
1443 #else
1444                         return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
1445 #endif // 0
1446                 } else if (m->running_as == MANAGER_SYSTEM) {
1447
1448                         /* On the legacy hierarchy we only get
1449                          * notifications via cgroup agents. (Which
1450                          * isn't really reliable, since it does not
1451                          * generate events when control groups with
1452                          * children run empty. */
1453
1454                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
1455                         if (r < 0)
1456                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1457                         else if (r > 0)
1458                                 log_debug("Installed release agent.");
1459                         else if (r == 0)
1460                                 log_debug("Release agent already installed.");
1461                 }
1462
1463 #if 0 /// elogind is not meant to run in systemd init scope
1464                 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1465                 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1466                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1467 #else
1468                 if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
1469                         // we are our own cgroup controller
1470                         scope_path = strjoina("");
1471                 else if (streq(m->cgroup_root, "/elogind"))
1472                         // root already is our cgroup
1473                         scope_path = strjoina(m->cgroup_root);
1474                 else
1475                         // we have to create our own group
1476                         scope_path = strjoina(m->cgroup_root, "/elogind");
1477                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1478 #endif // 0
1479                 if (r < 0)
1480                         return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1481                 log_debug_elogind("Created control group \"%s\"", scope_path);
1482
1483                 /* also, move all other userspace processes remaining
1484                  * in the root cgroup into that scope. */
1485                 if (!streq(m->cgroup_root, scope_path)) {
1486                         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1487                         if (r < 0)
1488                                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1489                 }
1490
1491                 /* 5. And pin it, so that it cannot be unmounted */
1492                 safe_close(m->pin_cgroupfs_fd);
1493                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1494                 if (m->pin_cgroupfs_fd < 0)
1495                         return log_error_errno(errno, "Failed to open pin file: %m");
1496
1497                 /* 6.  Always enable hierarchical support if it exists... */
1498                 if (!unified)
1499                         (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1500         }
1501
1502         /* 7. Figure out which controllers are supported */
1503         r = cg_mask_supported(&m->cgroup_supported);
1504         if (r < 0)
1505                 return log_error_errno(r, "Failed to determine supported controllers: %m");
1506
1507         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1508                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
1509
1510         return 0;
1511 }
1512
1513 void manager_shutdown_cgroup(Manager *m, bool delete) {
1514         assert(m);
1515
1516         /* We can't really delete the group, since we are in it. But
1517          * let's trim it. */
1518         if (delete && m->cgroup_root)
1519                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1520
1521 #if 0 /// elogind does not support the unified hierarchy, yet.
1522         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1523
1524         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1525         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1526 #endif // 0
1527
1528         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1529
1530         m->cgroup_root = mfree(m->cgroup_root);
1531 }
1532
1533 #if 0 /// UNNEEDED by elogind
1534 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1535         char *p;
1536         Unit *u;
1537
1538         assert(m);
1539         assert(cgroup);
1540
1541         u = hashmap_get(m->cgroup_unit, cgroup);
1542         if (u)
1543                 return u;
1544
1545         p = strdupa(cgroup);
1546         for (;;) {
1547                 char *e;
1548
1549                 e = strrchr(p, '/');
1550                 if (!e || e == p)
1551                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1552
1553                 *e = 0;
1554
1555                 u = hashmap_get(m->cgroup_unit, p);
1556                 if (u)
1557                         return u;
1558         }
1559 }
1560
1561 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1562         _cleanup_free_ char *cgroup = NULL;
1563         int r;
1564
1565         assert(m);
1566
1567         if (pid <= 0)
1568                 return NULL;
1569
1570         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1571         if (r < 0)
1572                 return NULL;
1573
1574         return manager_get_unit_by_cgroup(m, cgroup);
1575 }
1576
1577 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1578         Unit *u;
1579
1580         assert(m);
1581
1582         if (pid <= 0)
1583                 return NULL;
1584
1585         if (pid == 1)
1586                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1587
1588         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1589         if (u)
1590                 return u;
1591
1592         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1593         if (u)
1594                 return u;
1595
1596         return manager_get_unit_by_pid_cgroup(m, pid);
1597 }
1598
1599 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1600         Unit *u;
1601
1602         assert(m);
1603         assert(cgroup);
1604
1605         u = manager_get_unit_by_cgroup(m, cgroup);
1606         if (!u)
1607                 return 0;
1608
1609         return unit_notify_cgroup_empty(u);
1610 }
1611
1612 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1613         _cleanup_free_ char *v = NULL;
1614         int r;
1615
1616         assert(u);
1617         assert(ret);
1618
1619         if (!u->cgroup_path)
1620                 return -ENODATA;
1621
1622         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1623                 return -ENODATA;
1624
1625         if (cg_unified() <= 0)
1626                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1627         else
1628                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1629         if (r == -ENOENT)
1630                 return -ENODATA;
1631         if (r < 0)
1632                 return r;
1633
1634         return safe_atou64(v, ret);
1635 }
1636
1637 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1638         _cleanup_free_ char *v = NULL;
1639         int r;
1640
1641         assert(u);
1642         assert(ret);
1643
1644         if (!u->cgroup_path)
1645                 return -ENODATA;
1646
1647         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1648                 return -ENODATA;
1649
1650         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1651         if (r == -ENOENT)
1652                 return -ENODATA;
1653         if (r < 0)
1654                 return r;
1655
1656         return safe_atou64(v, ret);
1657 }
1658
1659 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1660         _cleanup_free_ char *v = NULL;
1661         uint64_t ns;
1662         int r;
1663
1664         assert(u);
1665         assert(ret);
1666
1667         if (!u->cgroup_path)
1668                 return -ENODATA;
1669
1670         if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1671                 return -ENODATA;
1672
1673         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1674         if (r == -ENOENT)
1675                 return -ENODATA;
1676         if (r < 0)
1677                 return r;
1678
1679         r = safe_atou64(v, &ns);
1680         if (r < 0)
1681                 return r;
1682
1683         *ret = ns;
1684         return 0;
1685 }
1686
1687 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1688         nsec_t ns;
1689         int r;
1690
1691         r = unit_get_cpu_usage_raw(u, &ns);
1692         if (r < 0)
1693                 return r;
1694
1695         if (ns > u->cpuacct_usage_base)
1696                 ns -= u->cpuacct_usage_base;
1697         else
1698                 ns = 0;
1699
1700         *ret = ns;
1701         return 0;
1702 }
1703
1704 int unit_reset_cpu_usage(Unit *u) {
1705         nsec_t ns;
1706         int r;
1707
1708         assert(u);
1709
1710         r = unit_get_cpu_usage_raw(u, &ns);
1711         if (r < 0) {
1712                 u->cpuacct_usage_base = 0;
1713                 return r;
1714         }
1715
1716         u->cpuacct_usage_base = ns;
1717         return 0;
1718 }
1719
1720 bool unit_cgroup_delegate(Unit *u) {
1721         CGroupContext *c;
1722
1723         assert(u);
1724
1725         c = unit_get_cgroup_context(u);
1726         if (!c)
1727                 return false;
1728
1729         return c->delegate;
1730 }
1731
1732 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1733         assert(u);
1734
1735         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1736                 return;
1737
1738         if (m == 0)
1739                 return;
1740
1741         if ((u->cgroup_realized_mask & m) == 0)
1742                 return;
1743
1744         u->cgroup_realized_mask &= ~m;
1745         unit_add_to_cgroup_queue(u);
1746 }
1747
1748 void manager_invalidate_startup_units(Manager *m) {
1749         Iterator i;
1750         Unit *u;
1751
1752         assert(m);
1753
1754         SET_FOREACH(u, m->startup_units, i)
1755                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1756 }
1757
1758 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1759         [CGROUP_AUTO] = "auto",
1760         [CGROUP_CLOSED] = "closed",
1761         [CGROUP_STRICT] = "strict",
1762 };
1763
1764 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
1765 #endif // 0