/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
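
/* Example: with this fixed 100ms period, CPUQuota=20% is stored as
 * cpu_quota_per_sec_usec == 200ms, and cgroup_context_apply() below
 * writes cpu.cfs_quota_us = 200000 * 100000 / 1000000 = 20000, i.e.
 * 20ms of CPU time per 100ms period. */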

// UNNEEDED by elogind

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
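
/* Example: called on a regular file that lives on /dev/sda3, st_dev
 * identifies the sda3 partition; block_get_whole_disk() then reduces
 * that to the whole disk, so weights/bandwidths apply to sda. */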

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
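
/* Example: whitelist_major(path, "pts", 'c', "rw") finds the line
 * "136 pts" (on a typical kernel) in the "Character devices:" section
 * of /proc/devices and writes "c 136:* rw" to devices.allow, covering
 * all pseudo-terminal minors at once. */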

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);
                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
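
/* Example of the "more" test above: going from CPU to CPU|MEMORY sets a
 * new bit and clears none, so the mask can simply be OR-ed into the parent;
 * going from CPU|MEMORY to CPU clears a bit, so the parent's members mask
 * must be invalidated and recalculated from scratch. */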

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);

        return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
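
/* Example: for u->id "foo.service" in "bar.slice" with an empty
 * cgroup_root, cg_slice_to_path() yields "bar.slice" and the result is
 * "/bar.slice/foo.service"; characters that are unsafe in cgroup names
 * are escaped by cg_escape() first. */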

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
        uint32_t start, i;
        Manager *m;

        assert(u);
        assert(ret);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
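
/* The search above wraps at UINT32_MAX back to CGROUP_NETCLASS_FIXED_MAX,
 * so dynamically allocated class IDs never collide with the lower range
 * reserved for fixed, user-specified IDs; -ENOBUFS signals that every
 * dynamic ID is already taken. */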

int unit_add_to_netclass_cgroup(Unit *u) {
        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {
        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                        return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0
                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

#if 0 /// elogind is not meant to run in systemd init scope
                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
                if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                        // we are our own cgroup controller
                        scope_path = strjoina("");
                else if (streq(m->cgroup_root, "/elogind"))
                        // root already is our cgroup
                        scope_path = strjoina(m->cgroup_root);
                else
                        // we have to create our own group
                        scope_path = strjoina(m->cgroup_root, "/elogind");
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
                log_debug_elogind("Created control group \"%s\"", scope_path);

                /* also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                if (!streq(m->cgroup_root, scope_path)) {
                        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                        if (r < 0)
                                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
                }

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

/// UNNEEDED by elogind

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
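
/* Example: for "/system.slice/foo.service/helper" the loop above tries the
 * full path first, then "/system.slice/foo.service", then "/system.slice",
 * and finally falls back to the root slice, so processes in sub-cgroups are
 * still attributed to the unit owning the closest parent cgroup. */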

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
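
/* unit_reset_cpu_usage() stores the current raw counter as the new base,
 * so a subsequent unit_get_cpu_usage() reports e.g. 5s - 5s = 0 right
 * after the reset and only the time consumed since then afterwards. */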

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);