1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include "alloc-util.h"
26 #include "cgroup-util.h"
31 #include "parse-util.h"
32 #include "path-util.h"
33 #include "process-util.h"
34 //#include "special.h"
35 #include "string-table.h"
36 #include "string-util.h"
38 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
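/* Illustration (example values, not from this file): the CFS period is fixed to the
 * 100ms defined above, and the per-second quota is rescaled to that period before it
 * is written to cpu.cfs_quota_us further down. For instance, a per-second quota of
 * 500ms (50%) would be written as 500000 * 100000 / 1000000 = 50000 microseconds per
 * 100000 microsecond period. */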
40 /// UNNEEDED by elogind
42 void cgroup_context_init(CGroupContext *c) {
45 /* Initialize everything to the kernel defaults, assuming the
46 * structure is preinitialized to 0 */
48 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
49 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
50 c->cpu_quota_per_sec_usec = USEC_INFINITY;
52 c->memory_limit = (uint64_t) -1;
54 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
55 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
57 c->tasks_max = (uint64_t) -1;
59 c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
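/* Note: the "invalid"/"infinity" values assigned above act as "unset" markers; only
 * fields that deviate from them are turned into controller requirements in
 * cgroup_context_get_mask() and actually written out in cgroup_context_apply(). */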
62 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
66 LIST_REMOVE(device_allow, c->device_allow, a);
71 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
75 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
80 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
84 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
89 void cgroup_context_done(CGroupContext *c) {
92 while (c->blockio_device_weights)
93 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
95 while (c->blockio_device_bandwidths)
96 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
98 while (c->device_allow)
99 cgroup_context_free_device_allow(c, c->device_allow);
102 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
103 CGroupBlockIODeviceBandwidth *b;
104 CGroupBlockIODeviceWeight *w;
105 CGroupDeviceAllow *a;
106 char u[FORMAT_TIMESPAN_MAX];
111 prefix = strempty(prefix);
114 "%sCPUAccounting=%s\n"
115 "%sBlockIOAccounting=%s\n"
116 "%sMemoryAccounting=%s\n"
117 "%sTasksAccounting=%s\n"
118 "%sCPUShares=%" PRIu64 "\n"
119 "%sStartupCPUShares=%" PRIu64 "\n"
120 "%sCPUQuotaPerSecSec=%s\n"
121 "%sBlockIOWeight=%" PRIu64 "\n"
122 "%sStartupBlockIOWeight=%" PRIu64 "\n"
123 "%sMemoryLimit=%" PRIu64 "\n"
124 "%sTasksMax=%" PRIu64 "\n"
125 "%sDevicePolicy=%s\n"
127 prefix, yes_no(c->cpu_accounting),
128 prefix, yes_no(c->blockio_accounting),
129 prefix, yes_no(c->memory_accounting),
130 prefix, yes_no(c->tasks_accounting),
131 prefix, c->cpu_shares,
132 prefix, c->startup_cpu_shares,
133 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
134 prefix, c->blockio_weight,
135 prefix, c->startup_blockio_weight,
136 prefix, c->memory_limit,
137 prefix, c->tasks_max,
138 prefix, cgroup_device_policy_to_string(c->device_policy),
139 prefix, yes_no(c->delegate));
141 LIST_FOREACH(device_allow, a, c->device_allow)
143 "%sDeviceAllow=%s %s%s%s\n",
146 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
148 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
150 "%sBlockIODeviceWeight=%s %" PRIu64,
155 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
156 char buf[FORMAT_BYTES_MAX];
161 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
163 format_bytes(buf, sizeof(buf), b->bandwidth));
167 static int lookup_blkio_device(const char *p, dev_t *dev) {
176 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
178 if (S_ISBLK(st.st_mode))
180 else if (major(st.st_dev) != 0) {
181 /* If this is not a device node then find the block
182 * device this file is stored on */
185 /* If this is a partition, try to get the originating block device */
187 block_get_whole_disk(*dev, dev);
189 log_warning("%s is not a block device and the file system's block device cannot be determined or is not local.", p);
196 static int whitelist_device(const char *path, const char *node, const char *acc) {
197 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
204 if (stat(node, &st) < 0) {
205 log_warning_errno(errno, "Couldn't stat device %s: %m", node);
209 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
210 log_warning("%s is not a device.", node);
216 S_ISCHR(st.st_mode) ? 'c' : 'b',
217 major(st.st_rdev), minor(st.st_rdev),
220 r = cg_set_attribute("devices", path, "devices.allow", buf);
222 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
223 "Failed to set devices.allow on %s: %m", path);
228 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
229 _cleanup_fclose_ FILE *f = NULL;
236 assert(type == 'b' || type == 'c');
238 f = fopen("/proc/devices", "re");
240 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
242 FOREACH_LINE(line, f, goto fail) {
243 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
248 if (type == 'c' && streq(line, "Character devices:")) {
253 if (type == 'b' && streq(line, "Block devices:")) {
268 w = strpbrk(p, WHITESPACE);
273 r = safe_atou(p, &maj);
280 w += strspn(w, WHITESPACE);
282 if (fnmatch(name, w, 0) != 0)
291 r = cg_set_attribute("devices", path, "devices.allow", buf);
293 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
294 "Failed to set devices.allow on %s: %m", path);
300 log_warning_errno(errno, "Failed to read /proc/devices: %m");
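/* Sketch of the matching done above (the exact write is an assumption, since the
 * sprintf() is not shown here): the "Character devices:"/"Block devices:" sections of
 * /proc/devices are scanned, each "MAJOR name" entry is matched against `name` with
 * fnmatch(), and for every matching major a wildcard entry such as
 *   c 136:* rw
 * (136 being the pts major) is then written to devices.allow. */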
304 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
314 /* Some cgroup attributes are not supported on the root cgroup,
315 * hence silently ignore */
316 is_root = isempty(path) || path_equal(path, "/");
318 /* Make sure we don't try to display messages with an empty path. */
321 /* We generally ignore errors caused by read-only mounted
322 * cgroup trees (assuming we are running in a container then),
323 * and missing cgroups, i.e. EROFS and ENOENT. */
325 if ((mask & CGROUP_MASK_CPU) && !is_root) {
326 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
328 sprintf(buf, "%" PRIu64 "\n",
329 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
330 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
331 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
333 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
334 "Failed to set cpu.shares on %s: %m", path);
336 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
337 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
339 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
340 "Failed to set cpu.cfs_period_us on %s: %m", path);
342 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
343 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
344 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
346 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
348 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
349 "Failed to set cpu.cfs_quota_us on %s: %m", path);
352 if (mask & CGROUP_MASK_BLKIO) {
353 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
354 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
355 CGroupBlockIODeviceWeight *w;
356 CGroupBlockIODeviceBandwidth *b;
359 sprintf(buf, "%" PRIu64 "\n",
360 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
361 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
362 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
364 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
365 "Failed to set blkio.weight on %s: %m", path);
367 /* FIXME: no way to reset this list */
368 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
371 r = lookup_blkio_device(w->path, &dev);
375 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
376 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
378 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
379 "Failed to set blkio.weight_device on %s: %m", path);
383 /* FIXME: no way to reset this list */
384 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
388 r = lookup_blkio_device(b->path, &dev);
392 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
394 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
395 r = cg_set_attribute("blkio", path, a, buf);
397 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
398 "Failed to set %s on %s: %m", a, path);
402 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
403 if (c->memory_limit != (uint64_t) -1) {
404 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
406 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
408 if (cg_unified() <= 0)
409 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
411 r = cg_set_attribute("memory", path, "memory.max", buf);
414 if (cg_unified() <= 0)
415 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
417 r = cg_set_attribute("memory", path, "memory.max", "max");
421 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
422 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
425 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
426 CGroupDeviceAllow *a;
428 /* Changing the devices list of a populated cgroup
429 * might result in EINVAL, hence ignore EINVAL here. */
432 if (c->device_allow || c->device_policy != CGROUP_AUTO)
433 r = cg_set_attribute("devices", path, "devices.deny", "a");
435 r = cg_set_attribute("devices", path, "devices.allow", "a");
437 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
438 "Failed to reset devices.list on %s: %m", path);
440 if (c->device_policy == CGROUP_CLOSED ||
441 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
442 static const char auto_devices[] =
443 "/dev/null\0" "rwm\0"
444 "/dev/zero\0" "rwm\0"
445 "/dev/full\0" "rwm\0"
446 "/dev/random\0" "rwm\0"
447 "/dev/urandom\0" "rwm\0"
449 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
453 NULSTR_FOREACH_PAIR(x, y, auto_devices)
454 whitelist_device(path, x, y);
456 whitelist_major(path, "pts", 'c', "rw");
457 whitelist_major(path, "kdbus", 'c', "rw");
458 whitelist_major(path, "kdbus/*", 'c', "rw");
461 LIST_FOREACH(device_allow, a, c->device_allow) {
477 if (startswith(a->path, "/dev/"))
478 whitelist_device(path, a->path, acc);
479 else if (startswith(a->path, "block-"))
480 whitelist_major(path, a->path + 6, 'b', acc);
481 else if (startswith(a->path, "char-"))
482 whitelist_major(path, a->path + 5, 'c', acc);
484 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
488 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
490 if (c->tasks_max != (uint64_t) -1) {
491 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
493 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
494 r = cg_set_attribute("pids", path, "pids.max", buf);
496 r = cg_set_attribute("pids", path, "pids.max", "max");
499 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
500 "Failed to set pids.max on %s: %m", path);
503 if (mask & CGROUP_MASK_NET_CLS) {
504 char buf[DECIMAL_STR_MAX(uint32_t)];
506 sprintf(buf, "%" PRIu32, netclass);
508 r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
510 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
511 "Failed to set net_cls.classid on %s: %m", path);
515 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
518 /* Figure out which controllers we need */
520 if (c->cpu_accounting ||
521 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
522 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
523 c->cpu_quota_per_sec_usec != USEC_INFINITY)
524 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
526 if (c->blockio_accounting ||
527 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
528 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
529 c->blockio_device_weights ||
530 c->blockio_device_bandwidths)
531 mask |= CGROUP_MASK_BLKIO;
533 if (c->memory_accounting ||
534 c->memory_limit != (uint64_t) -1)
535 mask |= CGROUP_MASK_MEMORY;
537 if (c->device_allow ||
538 c->device_policy != CGROUP_AUTO)
539 mask |= CGROUP_MASK_DEVICES;
541 if (c->tasks_accounting ||
542 c->tasks_max != (uint64_t) -1)
543 mask |= CGROUP_MASK_PIDS;
545 if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
546 mask |= CGROUP_MASK_NET_CLS;
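/* In other words: a controller bit is requested only if accounting is enabled for it
 * or one of its settings differs from the "unset" defaults initialized in
 * cgroup_context_init(). */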
551 CGroupMask unit_get_own_mask(Unit *u) {
554 /* Returns the mask of controllers the unit needs for itself */
556 c = unit_get_cgroup_context(u);
560 /* If delegation is turned on, then turn on all cgroups,
561 * unless we are on the legacy hierarchy and the process we
562 * fork into it is known to drop privileges, and hence
563 * shouldn't get access to the controllers.
565 * Note that on the unified hierarchy it is safe to delegate
566 * controllers to unprivileged services. */
571 e = unit_get_exec_context(u);
573 exec_context_maintains_privileges(e) ||
575 return _CGROUP_MASK_ALL;
578 return cgroup_context_get_mask(c);
581 CGroupMask unit_get_members_mask(Unit *u) {
584 /* Returns the mask of controllers that all of the unit's children require, taken together. */
587 if (u->cgroup_members_mask_valid)
588 return u->cgroup_members_mask;
590 u->cgroup_members_mask = 0;
592 if (u->type == UNIT_SLICE) {
596 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
601 if (UNIT_DEREF(member->slice) != u)
604 u->cgroup_members_mask |=
605 unit_get_own_mask(member) |
606 unit_get_members_mask(member);
610 u->cgroup_members_mask_valid = true;
611 return u->cgroup_members_mask;
614 CGroupMask unit_get_siblings_mask(Unit *u) {
617 /* Returns the mask of controllers all of the unit's siblings
618 * require, i.e. the members mask of the unit's parent slice
619 * if there is one. */
621 if (UNIT_ISSET(u->slice))
622 return unit_get_members_mask(UNIT_DEREF(u->slice));
624 return unit_get_own_mask(u) | unit_get_members_mask(u);
627 CGroupMask unit_get_subtree_mask(Unit *u) {
629 /* Returns the mask of this subtree, meaning of the group
630 * itself and its children. */
632 return unit_get_own_mask(u) | unit_get_members_mask(u);
635 CGroupMask unit_get_target_mask(Unit *u) {
638 /* This returns the cgroup mask of all controllers to enable
639 * for a specific cgroup, i.e. everything it needs itself,
640 * plus all that its children need, plus all that its siblings
641 * need. This is primarily useful on the legacy cgroup
642 * hierarchy, where we need to duplicate each cgroup in each
643 * hierarchy that shall be enabled for it. */
645 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
646 mask &= u->manager->cgroup_supported;
651 CGroupMask unit_get_enable_mask(Unit *u) {
654 /* This returns the cgroup mask of all controllers to enable
655 * for the children of a specific cgroup. This is primarily
656 * useful for the unified cgroup hierarchy, where each cgroup
657 * controls which controllers are enabled for its children. */
659 mask = unit_get_members_mask(u);
660 mask &= u->manager->cgroup_supported;
665 /* Recurse from a unit up through its containing slices, propagating
666 * mask bits upward. A unit is also a member of itself. */
667 void unit_update_cgroup_members_masks(Unit *u) {
673 /* Calculate subtree mask */
674 m = unit_get_subtree_mask(u);
676 /* See if anything changed from the previous invocation. If
677 * not, we're done. */
678 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
682 u->cgroup_subtree_mask_valid &&
683 ((m & ~u->cgroup_subtree_mask) != 0) &&
684 ((~m & u->cgroup_subtree_mask) == 0);
686 u->cgroup_subtree_mask = m;
687 u->cgroup_subtree_mask_valid = true;
689 if (UNIT_ISSET(u->slice)) {
690 Unit *s = UNIT_DEREF(u->slice);
693 /* There's more set now than before. We
694 * propagate the new mask to the parent's mask
695 * (not caring if it actually was valid or not). */
698 s->cgroup_members_mask |= m;
701 /* There's less set now than before (or we
702 * don't know), we need to recalculate
703 * everything, so let's invalidate the
704 * parent's members mask */
706 s->cgroup_members_mask_valid = false;
708 /* And now make sure that this change also hits our grandparents. */
710 unit_update_cgroup_members_masks(s);
714 static const char *migrate_callback(CGroupMask mask, void *userdata) {
721 if (u->cgroup_path &&
722 u->cgroup_realized &&
723 (u->cgroup_realized_mask & mask) == mask)
724 return u->cgroup_path;
726 u = UNIT_DEREF(u->slice);
732 char *unit_default_cgroup_path(Unit *u) {
733 _cleanup_free_ char *escaped = NULL, *slice = NULL;
738 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
739 return strdup(u->manager->cgroup_root);
741 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
742 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
747 escaped = cg_escape(u->id);
752 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
754 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
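/* Illustrative result (example names, not from this file): for a unit
 * "session-1.scope" placed in slice "user-1000.slice", cg_slice_to_path() expands the
 * slice name to "user.slice/user-1000.slice", so the returned path would be roughly
 * "<cgroup_root>/user.slice/user-1000.slice/session-1.scope". */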
757 int unit_set_cgroup_path(Unit *u, const char *path) {
758 _cleanup_free_ char *p = NULL;
770 if (streq_ptr(u->cgroup_path, p))
774 r = hashmap_put(u->manager->cgroup_unit, p, u);
779 unit_release_cgroup(u);
787 int unit_watch_cgroup(Unit *u) {
788 _cleanup_free_ char *populated = NULL;
796 if (u->cgroup_inotify_wd >= 0)
799 /* Only applies to the unified hierarchy */
802 return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
806 /* Don't watch the root slice, it's pointless. */
807 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
810 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
814 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
818 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
819 if (u->cgroup_inotify_wd < 0) {
821 if (errno == ENOENT) /* If the directory is already
822 * gone we don't need to track
823 * it, so this is not an error */
826 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
829 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
831 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
836 static int unit_create_cgroup(
838 CGroupMask target_mask,
839 CGroupMask enable_mask) {
846 c = unit_get_cgroup_context(u);
850 if (!u->cgroup_path) {
851 _cleanup_free_ char *path = NULL;
853 path = unit_default_cgroup_path(u);
857 r = unit_set_cgroup_path(u, path);
859 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
861 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
864 /* First, create our own group */
865 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
867 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
869 /* Start watching it */
870 (void) unit_watch_cgroup(u);
872 /* Enable all controllers we need */
873 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
875 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
877 /* Keep track that this is now realized */
878 u->cgroup_realized = true;
879 u->cgroup_realized_mask = target_mask;
881 if (u->type != UNIT_SLICE && !c->delegate) {
883 /* Then, possibly move things over, but not if
884 * subgroups may contain processes, which is the case
885 * for slice and delegation units. */
886 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
888 log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
894 int unit_attach_pids_to_cgroup(Unit *u) {
898 r = unit_realize_cgroup(u);
902 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
909 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
912 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
915 static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
924 i = start = m->cgroup_netclass_registry_last;
929 if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
930 m->cgroup_netclass_registry_last = i;
936 i = CGROUP_NETCLASS_FIXED_MAX;
938 } while (i != start);
943 int unit_add_to_netclass_cgroup(Unit *u) {
952 cc = unit_get_cgroup_context(u);
956 switch (cc->netclass_type) {
957 case CGROUP_NETCLASS_TYPE_NONE:
960 case CGROUP_NETCLASS_TYPE_FIXED:
961 u->cgroup_netclass_id = cc->netclass_id;
964 case CGROUP_NETCLASS_TYPE_AUTO:
965 /* Allocate a new ID in case it was requested and not done yet */
966 if (u->cgroup_netclass_id == 0) {
967 r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
971 log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
977 r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
981 key = UINT32_TO_PTR(u->cgroup_netclass_id);
982 first = hashmap_get(u->manager->cgroup_netclass_registry, key);
985 LIST_PREPEND(cgroup_netclass, first, u);
986 return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
989 return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
992 int unit_remove_from_netclass_cgroup(Unit *u) {
999 key = UINT32_TO_PTR(u->cgroup_netclass_id);
1001 LIST_FIND_HEAD(cgroup_netclass, u, head);
1002 LIST_REMOVE(cgroup_netclass, head, u);
1005 return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);
1007 hashmap_remove(u->manager->cgroup_netclass_registry, key);
1012 /* Check if necessary controllers and attributes for a unit are in place.
1014 * If so, do nothing.
1015 * If not, create paths, move processes over, and set attributes.
1017 * Returns 0 on success and < 0 on failure. */
1018 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1019 CGroupMask target_mask, enable_mask;
1024 if (u->in_cgroup_queue) {
1025 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
1026 u->in_cgroup_queue = false;
1029 target_mask = unit_get_target_mask(u);
1030 if (unit_has_mask_realized(u, target_mask))
1033 /* First, realize parents */
1034 if (UNIT_ISSET(u->slice)) {
1035 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1040 /* And then do the real work */
1041 enable_mask = unit_get_enable_mask(u);
1042 r = unit_create_cgroup(u, target_mask, enable_mask);
1046 /* Finally, apply the necessary attributes. */
1047 cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);
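/* Summary of the realization sequence above: drop the unit from the queue, bail out
 * early if the needed controller mask is already realized, realize all parent slices
 * first, then create/enable this unit's own cgroup and finally apply the attribute
 * values for the computed target mask. */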
1052 static void unit_add_to_cgroup_queue(Unit *u) {
1054 if (u->in_cgroup_queue)
1057 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1058 u->in_cgroup_queue = true;
1061 unsigned manager_dispatch_cgroup_queue(Manager *m) {
1067 state = manager_state(m);
1069 while ((i = m->cgroup_queue)) {
1070 assert(i->in_cgroup_queue);
1072 r = unit_realize_cgroup_now(i, state);
1074 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1082 static void unit_queue_siblings(Unit *u) {
1085 /* This adds the siblings of the specified unit and the
1086 * siblings of all parent units to the cgroup queue. (But
1087 * neither the specified unit itself nor the parents.) */
1089 while ((slice = UNIT_DEREF(u->slice))) {
1093 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1097 /* Skip units that have a dependency on the slice
1098 * but aren't actually in it. */
1099 if (UNIT_DEREF(m->slice) != slice)
1102 /* No point in doing cgroup application for units
1103 * without active processes. */
1104 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1107 /* If the unit doesn't need any new controllers
1108 * and has current ones realized, it doesn't need any changes. */
1110 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
1113 unit_add_to_cgroup_queue(m);
1120 int unit_realize_cgroup(Unit *u) {
1123 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1126 /* So, here's the deal: when realizing the cgroups for this
1127 * unit, we need to first create all parents, but there's more
1128 * actually: for the weight-based controllers we also need to
1129 * make sure that all our siblings (i.e. units that are in the
1130 * same slice as we are) have cgroups, too. Otherwise, things
1131 * would become very uneven as each of their processes would
1132 * get as much resources as all our group together. This call
1133 * will synchronously create the parent cgroups, but will
1134 * defer work on the siblings to the next event loop iteration. */
1137 /* Add all sibling slices to the cgroup queue. */
1138 unit_queue_siblings(u);
1140 /* And realize this one now (and apply the values) */
1141 return unit_realize_cgroup_now(u, manager_state(u->manager));
1144 void unit_release_cgroup(Unit *u) {
1147 /* Forgets all cgroup details for this unit */
1149 if (u->cgroup_path) {
1150 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1151 u->cgroup_path = mfree(u->cgroup_path);
1154 if (u->cgroup_inotify_wd >= 0) {
1155 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1156 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1158 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1159 u->cgroup_inotify_wd = -1;
1163 void unit_prune_cgroup(Unit *u) {
1169 /* Removes the cgroup, if empty and possible, and stops watching it. */
1171 if (!u->cgroup_path)
1174 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1176 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1178 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1185 unit_release_cgroup(u);
1187 u->cgroup_realized = false;
1188 u->cgroup_realized_mask = 0;
1191 int unit_search_main_pid(Unit *u, pid_t *ret) {
1192 _cleanup_fclose_ FILE *f = NULL;
1193 pid_t pid = 0, npid, mypid;
1199 if (!u->cgroup_path)
1202 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1207 while (cg_read_pid(f, &npid) > 0) {
1213 /* Ignore processes that aren't our kids */
1214 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1218 /* Dang, there's more than one daemonized PID
1219 in this group, so we don't know what process
1220 is the main process. */
1231 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1232 _cleanup_closedir_ DIR *d = NULL;
1233 _cleanup_fclose_ FILE *f = NULL;
1239 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1245 while ((r = cg_read_pid(f, &pid)) > 0) {
1246 r = unit_watch_pid(u, pid);
1247 if (r < 0 && ret >= 0)
1251 if (r < 0 && ret >= 0)
1255 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1262 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1263 _cleanup_free_ char *p = NULL;
1265 p = strjoin(path, "/", fn, NULL);
1271 r = unit_watch_pids_in_path(u, p);
1272 if (r < 0 && ret >= 0)
1276 if (r < 0 && ret >= 0)
1283 int unit_watch_all_pids(Unit *u) {
1286 /* Adds all PIDs from our cgroup to the set of PIDs we
1287 * watch. This is a fallback logic for cases where we do not
1288 * get reliable cgroup empty notifications: we try to use
1289 * SIGCHLD as replacement. */
1291 if (!u->cgroup_path)
1294 if (cg_unified() > 0) /* On unified we can use proper notifications */
1297 return unit_watch_pids_in_path(u, u->cgroup_path);
1300 int unit_notify_cgroup_empty(Unit *u) {
1305 if (!u->cgroup_path)
1308 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1312 unit_add_to_gc_queue(u);
1314 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1315 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1320 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1321 Manager *m = userdata;
1328 union inotify_event_buffer buffer;
1329 struct inotify_event *e;
1332 l = read(fd, &buffer, sizeof(buffer));
1334 if (errno == EINTR || errno == EAGAIN)
1337 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1340 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1344 /* Queue overflow has no watch descriptor */
1347 if (e->mask & IN_IGNORED)
1348 /* The watch was just removed */
1351 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1352 if (!u) /* Note that inotify might deliver
1353 * events for a watch even after it
1354 * was removed, because it was queued
1355 * before the removal. Let's ignore
1356 * this here safely. */
1359 (void) unit_notify_cgroup_empty(u);
1365 int manager_setup_cgroup(Manager *m) {
1366 _cleanup_free_ char *path = NULL;
1373 /* 1. Determine hierarchy */
1374 m->cgroup_root = mfree(m->cgroup_root);
1375 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1377 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1379 /// elogind does not support systemd scopes and slices
1381 /* Chop off the init scope, if we are already located in it */
1382 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1384 /* LEGACY: Also chop off the system slice if we are in
1385 * it. This is to support live upgrades from older systemd
1386 * versions where PID 1 was moved there. Also see
1387 * cg_get_root_path(). */
1388 if (!e && m->running_as == MANAGER_SYSTEM) {
1389 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1391 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1397 /* And make sure to store away the root value without trailing
1398 * slash, even for the root dir, so that we can easily prepend it everywhere. */
1400 while ((e = endswith(m->cgroup_root, "/")))
1402 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
1403 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
1406 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1408 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1410 unified = cg_unified();
1412 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1414 log_debug("Unified cgroup hierarchy is located at %s.", path);
1416 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1419 const char *scope_path;
1421 /* 3. Install agent */
1424 /* In the unified hierarchy we can get
1425 * cgroup empty notifications via inotify. */
1427 /// elogind does not support the unified hierarchy, yet.
1429 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1430 safe_close(m->cgroup_inotify_fd);
1432 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1433 if (m->cgroup_inotify_fd < 0)
1434 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1436 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1438 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1440 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1442 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1444 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1447 return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
1449 } else if (m->running_as == MANAGER_SYSTEM) {
1451 /* On the legacy hierarchy we only get
1452 * notifications via cgroup agents. (Which
1453 * isn't really reliable, since it does not
1454 * generate events when control groups with
1455 * children run empty.) */
1457 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
1459 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1461 log_debug("Installed release agent.");
1463 log_debug("Release agent already installed.");
1466 /// elogind is not meant to run in systemd init scope
1468 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1469 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1470 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1472 if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
1473 // we are our own cgroup controller
1474 scope_path = strjoina("");
1475 else if (streq(m->cgroup_root, "/elogind"))
1476 // root already is our cgroup
1477 scope_path = strjoina(m->cgroup_root);
1479 // we have to create our own group
1480 scope_path = strjoina(m->cgroup_root, "/elogind");
1481 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1484 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1485 log_debug_elogind("Created control group \"%s\"", scope_path);
1487 /* also, move all other userspace processes remaining
1488 * in the root cgroup into that scope. */
1489 if (!streq(m->cgroup_root, scope_path)) {
1490 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1492 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1495 /* 5. And pin it, so that it cannot be unmounted */
1496 safe_close(m->pin_cgroupfs_fd);
1497 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1498 if (m->pin_cgroupfs_fd < 0)
1499 return log_error_errno(errno, "Failed to open pin file: %m");
1501 /* 6. Always enable hierarchical support if it exists... */
1503 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1506 /* 7. Figure out which controllers are supported */
1507 r = cg_mask_supported(&m->cgroup_supported);
1509 return log_error_errno(r, "Failed to determine supported controllers: %m");
1511 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1512 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
1517 void manager_shutdown_cgroup(Manager *m, bool delete) {
1520 /* We can't really delete the group, since we are in it. But we can at least trim it. */
1522 if (delete && m->cgroup_root)
1523 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1525 /// elogind does not support the unified hierarchy, yet.
1527 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1529 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1530 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1533 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1535 m->cgroup_root = mfree(m->cgroup_root);
1538 /// UNNEEDED by elogind
1540 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1547 u = hashmap_get(m->cgroup_unit, cgroup);
1551 p = strdupa(cgroup);
1555 e = strrchr(p, '/');
1557 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1561 u = hashmap_get(m->cgroup_unit, p);
1567 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1568 _cleanup_free_ char *cgroup = NULL;
1576 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1580 return manager_get_unit_by_cgroup(m, cgroup);
1583 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1592 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1594 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1598 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1602 return manager_get_unit_by_pid_cgroup(m, pid);
1605 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1611 u = manager_get_unit_by_cgroup(m, cgroup);
1615 return unit_notify_cgroup_empty(u);
1618 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1619 _cleanup_free_ char *v = NULL;
1625 if (!u->cgroup_path)
1628 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1631 if (cg_unified() <= 0)
1632 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1634 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1640 return safe_atou64(v, ret);
1643 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1644 _cleanup_free_ char *v = NULL;
1650 if (!u->cgroup_path)
1653 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1656 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1662 return safe_atou64(v, ret);
1665 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1666 _cleanup_free_ char *v = NULL;
1673 if (!u->cgroup_path)
1676 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1679 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1685 r = safe_atou64(v, &ns);
1693 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1697 r = unit_get_cpu_usage_raw(u, &ns);
1701 if (ns > u->cpuacct_usage_base)
1702 ns -= u->cpuacct_usage_base;
1710 int unit_reset_cpu_usage(Unit *u) {
1716 r = unit_get_cpu_usage_raw(u, &ns);
1718 u->cpuacct_usage_base = 0;
1722 u->cpuacct_usage_base = ns;
1726 bool unit_cgroup_delegate(Unit *u) {
1731 c = unit_get_cgroup_context(u);
1738 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1741 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1747 if ((u->cgroup_realized_mask & m) == 0)
1750 u->cgroup_realized_mask &= ~m;
1751 unit_add_to_cgroup_queue(u);
1754 void manager_invalidate_startup_units(Manager *m) {
1760 SET_FOREACH(u, m->startup_units, i)
1761 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1764 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1765 [CGROUP_AUTO] = "auto",
1766 [CGROUP_CLOSED] = "closed",
1767 [CGROUP_STRICT] = "strict",
1770 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);