/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
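/* Illustration of how this period is used (example values, not taken
 * from any caller in this file): cgroup_context_apply() below writes
 * this period to cpu.cfs_period_us and scales CPUQuotaPerSecUsec down
 * to a single period for cpu.cfs_quota_us:
 *
 *     quota_us = cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC;
 *
 * e.g. a quota of 50% (cpu_quota_per_sec_usec == 500000) yields
 * 500000 * 100000 / 1000000 == 50000, i.e. 50ms of CPU time per
 * 100ms period. */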
#if 0 /// UNNEEDED by elogind
void cgroup_context_init(CGroupContext *c) {

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
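/* For orientation, the dump above produces output roughly like this
 * (illustrative values only, not taken from a real unit):
 *
 *     CPUAccounting=yes
 *     CPUShares=1024
 *     CPUQuotaPerSecSec=500ms
 *     MemoryLimit=536870912
 *     DeviceAllow=/dev/null rwm
 *     BlockIOReadBandwidth=/dev/sda 5.0M
 */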
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
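        /* Sizing note (added for clarity): the buffer must hold the
         * devices.allow payload "c MAJ:MIN rwm" in the worst case,
         * i.e. one type char plus a space (2), two decimal-printed
         * dev_t halves (DECIMAL_STR_MAX(dev_t)*2), the ':' separator
         * plus a space (2), and up to "rwm" plus NUL (4). */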
        struct stat st;
        int r;

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* Track whether we are inside the section matching
                 * the requested device type; entries outside it are
                 * skipped. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }
                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }
                if (isempty(line)) {
                        good = false;
                        continue;
                }
                if (!good)
                        continue;

                /* Parse "MAJOR NAME", then match NAME against the pattern */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0 || maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf, "%c %u:* %s", type, maj, acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
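/* For reference, /proc/devices looks like this (excerpt; contents
 * vary by system):
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * so whitelist_major(path, "pts", 'c', "rw") would match major 136
 * here and write "c 136:* rw" to devices.allow. */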
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }
        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                        c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set blkio.weight on %s: %m", path);

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                        dev_t dev;

                        r = lookup_blkio_device(w->path, &dev);
                        if (r < 0)
                                continue;

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight_device on %s: %m", path);
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }
        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);
                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }
        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
        if ((mask & CGROUP_MASK_PIDS) && !is_root) {
                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
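/* Example (illustrative, not from the original source): a context
 * with only MemoryLimit= and TasksMax= set yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, so only the "memory" and
 * "pids" controllers need to be realized for the unit's cgroup. */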
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}
CGroupMask unit_get_members_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
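/* Illustration with a hypothetical unit tree: if a.slice contains
 * b.service, which needs the cpu controller, and c.service, which
 * needs the memory controller, then unit_get_members_mask(a.slice)
 * is the union of both, while each service's own mask remains just
 * what its own context requires. */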
CGroupMask unit_get_siblings_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
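/* Example (hypothetical names): for a unit "foo.service" placed in
 * "bar.slice" with a cgroup root of "", cg_slice_to_path() maps the
 * slice name to "bar.slice" and the result is "/bar.slice/foo.service".
 * Unit names that would clash with kernel attribute files are escaped
 * by cg_escape(), e.g. by prefixing an underscore. */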
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);

        return 0;
}
static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_watch_all_pids(Unit *u) {

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
        int r;

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        const char *scope_path;

        /* 3. Install agent */
        if (unified) {

                /* In the unified hierarchy we can get
                 * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0

        } else if (m->running_as == MANAGER_SYSTEM) {

                /* On the legacy hierarchy we only get
                 * notifications via cgroup agents. (Which
                 * isn't really reliable, since it does not
                 * generate events when control groups with
                 * children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }
#if 0 /// elogind is not meant to run in systemd init scope
        /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);

        /* also, move all other userspace processes remaining
         * in the root cgroup into that scope. */
        if (!streq(m->cgroup_root, scope_path)) {
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");
        /* 6. Always enable hierarchical support if it exists... */
        if (!unified)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
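/* Note on the loop above: CGroupController values are indices, not
 * bit masks, so they have to be converted via
 * CGROUP_CONTROLLER_TO_MASK() (essentially 1U << c) before being
 * tested against cgroup_supported; testing "m->cgroup_supported & c"
 * directly would misreport which controllers are available. */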
void manager_shutdown_cgroup(Manager *m, bool delete) {

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
#if 0 /// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}
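/* Usage pattern (sketch): unit_reset_cpu_usage() below stores the
 * current raw cpuacct counter as cpuacct_usage_base, so a later
 * unit_get_cpu_usage() reports only the CPU time consumed since the
 * reset, e.g. since the unit was last started. */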
int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);