1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include "path-util.h"
27 #include "cgroup-util.h"
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Reset a CGroupContext to "unset" defaults. Throughout this file the
 * all-ones values ((unsigned long) -1, (uint64_t) -1, USEC_INFINITY) act
 * as sentinels meaning "not configured, use the kernel default". */
32 void cgroup_context_init(CGroupContext *c) {
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
/* Unlink one DeviceAllow= entry from the context's list. (Freeing of the
 * entry itself happens on lines elided from this view — verify.) */
47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
51 LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink one BlockIODeviceWeight= entry from the context's list. */
56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink one BlockIO{Read,Write}Bandwidth= entry from the context's list. */
65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release all per-device lists held by the context. Each helper pops the
 * list head, so these loops terminate when the lists are empty. */
74 void cgroup_context_done(CGroupContext *c) {
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
/* Dump a CGroupContext in unit-file-like "Key=Value" syntax to f, each
 * line prefixed with the caller-supplied prefix (NULL is treated as ""). */
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
/* scratch buffer for format_timespan() below */
91 char u[FORMAT_TIMESPAN_MAX];
96 prefix = strempty(prefix);
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n"
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
114 prefix, c->startup_cpu_shares,
115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
116 prefix, c->blockio_weight,
117 prefix, c->startup_blockio_weight,
118 prefix, c->memory_limit,
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
/* One line per DeviceAllow= entry; r/w/m flags concatenate to e.g. "rwm". */
122 LIST_FOREACH(device_allow, a, c->device_allow)
124 "%sDeviceAllow=%s %s%s%s\n",
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
/* NOTE(review): this format string has no trailing "\n", unlike every
 * other line emitted here — looks like a missing newline; confirm. */
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
131 "%sBlockIODeviceWeight=%s %lu",
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
144 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolve a path to the block device (dev_t) that blkio attributes should
 * target: the node itself if p is a block device, otherwise the whole disk
 * backing the file system p lives on. Returns < 0 on failure (error paths
 * elided from this view). */
148 static int lookup_blkio_device(const char *p, dev_t *dev) {
157 log_warning("Couldn't stat device %s: %m", p);
161 if (S_ISBLK(st.st_mode))
/* st_dev major 0 means a virtual/remote fs with no local backing device */
163 else if (major(st.st_dev) != 0) {
164 /* If this is not a device node then find the block
165 * device this file is stored on */
168 /* If this is a partition, try to get the originating
170 block_get_whole_disk(*dev, dev);
172 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Whitelist a single device node for the cgroup at 'path' by writing
 * "{c|b} MAJOR:MINOR acc" into devices.allow. 'acc' is the access string
 * ("r", "rw", "rwm", ...). */
179 static int whitelist_device(const char *path, const char *node, const char *acc) {
/* "c 4294967295:4294967295 rwm\0" worst case: type + two dev numbers + access */
180 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
187 if (stat(node, &st) < 0) {
/* NOTE(review): unlike lookup_blkio_device() this message omits ": %m",
 * so the errno cause is lost — likely should read "Couldn't stat device %s: %m". */
188 log_warning("Couldn't stat device %s", node);
192 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
193 log_warning("%s is not a device.", node);
199 S_ISCHR(st.st_mode) ? 'c' : 'b',
200 major(st.st_rdev), minor(st.st_rdev),
203 r = cg_set_attribute("devices", path, "devices.allow", buf);
/* -ENOENT (controller/attribute missing) is expected on some systems, so
 * only debug-log it; anything else is a real warning. */
205 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Whitelist every device whose driver name matches the fnmatch() pattern
 * 'name' (of the given type, 'c' or 'b') by scanning /proc/devices and
 * writing "{type} MAJOR:* acc" entries into devices.allow. */
210 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
211 _cleanup_fclose_ FILE *f = NULL;
218 assert(type == 'b' || type == 'c');
220 f = fopen("/proc/devices", "re");
222 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
226 FOREACH_LINE(line, f, goto fail) {
227 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
/* /proc/devices has a "Character devices:" section followed by a
 * "Block devices:" section; track which one we are in. */
232 if (type == 'c' && streq(line, "Character devices:")) {
237 if (type == 'b' && streq(line, "Block devices:")) {
/* Each entry line is "MAJOR name"; split on whitespace. */
252 w = strpbrk(p, WHITESPACE);
257 r = safe_atou(p, &maj);
264 w += strspn(w, WHITESPACE);
/* Shell-glob match of the driver name against the requested pattern */
266 if (fnmatch(name, w, 0) != 0)
275 r = cg_set_attribute("devices", path, "devices.allow", buf);
277 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
283 log_warning("Failed to read /proc/devices: %m");
/* Write the context's configuration into the kernel cgroup attributes at
 * 'path', for every controller selected in 'mask'. 'state' selects the
 * Startup* variants of CPU/BlockIO weights while the manager is starting
 * up. Failures are logged but never abort the remaining attributes. */
287 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
297 /* Some cgroup attributes are not support on the root cgroup,
298 * hence silently ignore */
299 is_root = isempty(path) || path_equal(path, "/");
301 if ((mask & CGROUP_CPU) && !is_root) {
302 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
/* StartupCPUShares= wins during startup; else CPUShares=; else kernel default 1024 */
304 sprintf(buf, "%lu\n",
305 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
306 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
307 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
309 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
/* Fix the CFS period so the per-second quota below scales predictably */
311 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
312 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
314 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
316 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
/* Scale the per-second quota down to the configured period length.
 * NOTE(review): the multiplication can overflow usec_t for very large
 * quota values before the division — confirm upstream handling. */
317 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
318 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
/* "-1" disables the CFS quota entirely */
320 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
322 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
325 if (mask & CGROUP_BLKIO) {
/* buf is sized for the largest of: weight, "MAJ:MIN weight", "MAJ:MIN bandwidth" */
326 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
327 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
328 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
329 CGroupBlockIODeviceWeight *w;
330 CGroupBlockIODeviceBandwidth *b;
/* Same startup/normal/default (1000) selection logic as cpu.shares above */
333 sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
334 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
335 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
337 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
339 /* FIXME: no way to reset this list */
340 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
343 r = lookup_blkio_device(w->path, &dev);
347 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
348 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
350 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
354 /* FIXME: no way to reset this list */
355 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
359 r = lookup_blkio_device(b->path, &dev);
363 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
365 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
366 r = cg_set_attribute("blkio", path, a, buf);
368 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
372 if (mask & CGROUP_MEMORY) {
373 if (c->memory_limit != (uint64_t) -1) {
374 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
376 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
377 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
/* "-1" removes any previously configured memory limit */
379 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
382 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
385 if ((mask & CGROUP_DEVICE) && !is_root) {
386 CGroupDeviceAllow *a;
/* Start from a clean slate: deny-all if any restriction is configured,
 * otherwise allow-all, then add back whitelisted entries below. */
388 if (c->device_allow || c->device_policy != CGROUP_AUTO)
389 r = cg_set_attribute("devices", path, "devices.deny", "a");
391 r = cg_set_attribute("devices", path, "devices.allow", "a");
393 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
395 if (c->device_policy == CGROUP_CLOSED ||
396 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
/* Standard pseudo-devices every restricted unit still needs */
397 static const char auto_devices[] =
398 "/dev/null\0" "rwm\0"
399 "/dev/zero\0" "rwm\0"
400 "/dev/full\0" "rwm\0"
401 "/dev/random\0" "rwm\0"
402 "/dev/urandom\0" "rwm\0"
404 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
408 NULSTR_FOREACH_PAIR(x, y, auto_devices)
409 whitelist_device(path, x, y);
411 whitelist_major(path, "pts", 'c', "rw");
412 whitelist_major(path, "kdbus", 'c', "rw");
413 whitelist_major(path, "kdbus/*", 'c', "rw");
/* DeviceAllow= entries: plain node paths, or "block-<name>"/"char-<name>"
 * driver-name globs resolved via /proc/devices. */
416 LIST_FOREACH(device_allow, a, c->device_allow) {
432 if (startswith(a->path, "/dev/"))
433 whitelist_device(path, a->path, acc);
434 else if (startswith(a->path, "block-"))
435 whitelist_major(path, a->path + 6, 'b', acc);
436 else if (startswith(a->path, "char-"))
437 whitelist_major(path, a->path + 5, 'c', acc);
439 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
/* Compute which cgroup controllers this context actually needs: a
 * controller bit is set iff accounting is on or any of its settings
 * deviates from the "unset" sentinel defaults. */
444 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
445 CGroupControllerMask mask = 0;
447 /* Figure out which controllers we need */
449 if (c->cpu_accounting ||
450 c->cpu_shares != (unsigned long) -1 ||
451 c->startup_cpu_shares != (unsigned long) -1 ||
452 c->cpu_quota_per_sec_usec != USEC_INFINITY)
453 mask |= CGROUP_CPUACCT | CGROUP_CPU;
455 if (c->blockio_accounting ||
456 c->blockio_weight != (unsigned long) -1 ||
457 c->startup_blockio_weight != (unsigned long) -1 ||
458 c->blockio_device_weights ||
459 c->blockio_device_bandwidths)
460 mask |= CGROUP_BLKIO;
462 if (c->memory_accounting ||
463 c->memory_limit != (uint64_t) -1)
464 mask |= CGROUP_MEMORY;
466 if (c->device_allow ||
467 c->device_policy != CGROUP_AUTO)
468 mask |= CGROUP_DEVICE;
/* Controller mask for one unit: its context's mask, or all controllers
 * when Delegate= is on and the unit's exec context keeps privileges. */
473 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
476 c = unit_get_cgroup_context(u);
480 /* If delegation is turned on, then turn on all cgroups,
481 * unless the process we fork into it is known to drop
482 * privileges anyway, and shouldn't get access to the
483 * controllers anyway. */
488 e = unit_get_exec_context(u);
489 if (!e || exec_context_maintains_privileges(e))
490 return _CGROUP_CONTROLLER_MASK_ALL;
493 return cgroup_context_get_mask(c);
/* Union of controller masks of all units contained in this slice
 * (recursively). Cached in u->cgroup_members_mask; invalidated via
 * cgroup_members_mask_valid by unit_update_cgroup_members_masks(). */
496 CGroupControllerMask unit_get_members_mask(Unit *u) {
499 if (u->cgroup_members_mask_valid)
500 return u->cgroup_members_mask;
502 u->cgroup_members_mask = 0;
504 if (u->type == UNIT_SLICE) {
/* Members of a slice are ordered after it, so iterate UNIT_BEFORE deps */
508 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
/* Skip units ordered against us that actually live in another slice */
513 if (UNIT_DEREF(member->slice) != u)
516 u->cgroup_members_mask |=
517 unit_get_cgroup_mask(member) |
518 unit_get_members_mask(member);
522 u->cgroup_members_mask_valid = true;
523 return u->cgroup_members_mask;
/* Mask covering this unit and all its siblings: the members mask of the
 * containing slice, or — for units without a slice — the unit's own mask
 * plus its members'. */
526 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
529 if (UNIT_ISSET(u->slice))
530 return unit_get_members_mask(UNIT_DEREF(u->slice));
532 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
/* Mask of controllers the unit's cgroup should actually be realized in:
 * own + members + siblings, clamped to what the kernel supports. */
535 CGroupControllerMask unit_get_target_mask(Unit *u) {
536 CGroupControllerMask mask;
538 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
539 mask &= u->manager->cgroup_supported;
544 /* Recurse from a unit up through its containing slices, propagating
545 * mask bits upward. A unit is also member of itself. */
546 void unit_update_cgroup_members_masks(Unit *u) {
547 CGroupControllerMask m;
552 /* Calculate subtree mask */
553 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
555 /* See if anything changed from the previous invocation. If
556 * not, we're done. */
557 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
/* "pure addition": bits were only added relative to the cached subtree
 * mask, never removed — allows the cheap |= propagation below. */
561 u->cgroup_subtree_mask_valid &&
562 ((m & ~u->cgroup_subtree_mask) != 0) &&
563 ((~m & u->cgroup_subtree_mask) == 0);
565 u->cgroup_subtree_mask = m;
566 u->cgroup_subtree_mask_valid = true;
568 if (UNIT_ISSET(u->slice)) {
569 Unit *s = UNIT_DEREF(u->slice);
572 /* There's more set now than before. We
573 * propagate the new mask to the parent's mask
574 * (not caring if it actually was valid or
577 s->cgroup_members_mask |= m;
580 /* There's less set now than before (or we
581 * don't know), we need to recalculate
582 * everything, so let's invalidate the
583 * parent's members mask */
585 s->cgroup_members_mask_valid = false;
587 /* And now make sure that this change also hits our
589 unit_update_cgroup_members_masks(s);
/* cg_migrate_everywhere() callback: walk up the slice chain from the unit
 * in 'userdata' and return the cgroup path of the nearest ancestor that is
 * realized for all controllers in 'mask' (processes of unrealized children
 * are parked there). */
593 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
600 if (u->cgroup_path &&
601 u->cgroup_realized &&
602 (u->cgroup_realized_mask & mask) == mask)
603 return u->cgroup_path;
605 u = UNIT_DEREF(u->slice);
/* Create the unit's cgroup in every hierarchy selected by 'mask', register
 * the path in the manager's cgroup_unit hashmap, mark the unit realized,
 * and migrate stray processes into the new group. */
611 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
612 _cleanup_free_ char *path = NULL;
617 path = unit_default_cgroup_path(u);
/* Register path -> unit so manager_get_unit_by_cgroup() can find us;
 * -EEXIST means another unit already claimed this cgroup path. */
621 r = hashmap_put(u->manager->cgroup_unit, path, u);
623 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
/* Ownership of 'path' moves to the unit (cleanup attr must not free it) */
627 u->cgroup_path = path;
631 /* First, create our own group */
632 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
634 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
638 /* Keep track that this is now realized */
639 u->cgroup_realized = true;
640 u->cgroup_realized_mask = mask;
642 /* Then, possibly move things over */
643 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
/* NOTE(review): message reads "migrate cgroup from to %s" — stray "from",
 * should be "Failed to migrate cgroup to %s". */
645 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
/* True if the unit's cgroup exists and was realized with exactly this
 * controller mask (a different mask requires re-realization). */
650 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
653 return u->cgroup_realized && u->cgroup_realized_mask == mask;
656 /* Check if necessary controllers and attributes for a unit are in place.
659 * If not, create paths, move processes over, and set attributes.
661 * Returns 0 on success and < 0 on failure. */
662 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
663 CGroupControllerMask mask;
/* We are realizing synchronously now, so drop any pending queued request */
668 if (u->in_cgroup_queue) {
669 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
670 u->in_cgroup_queue = false;
673 mask = unit_get_target_mask(u);
675 if (unit_has_mask_realized(u, mask))
678 /* First, realize parents */
679 if (UNIT_ISSET(u->slice)) {
680 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
685 /* And then do the real work */
686 r = unit_create_cgroups(u, mask);
690 /* Finally, apply the necessary attributes. */
691 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
/* Queue a unit for deferred cgroup realization by
 * manager_dispatch_cgroup_queue(); no-op if already queued. */
696 static void unit_add_to_cgroup_queue(Unit *u) {
698 if (u->in_cgroup_queue)
701 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
702 u->in_cgroup_queue = true;
/* Drain the deferred-realization queue. unit_realize_cgroup_now() removes
 * the unit from the queue itself, which is what advances this loop.
 * Returns the number of units processed (counting elided from this view). */
705 unsigned manager_dispatch_cgroup_queue(Manager *m) {
711 state = manager_state(m);
713 while ((i = m->cgroup_queue)) {
714 assert(i->in_cgroup_queue);
716 r = unit_realize_cgroup_now(i, state);
718 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
726 static void unit_queue_siblings(Unit *u) {
729 /* This adds the siblings of the specified unit and the
730 * siblings of all parent units to the cgroup queue. (But
731 * neither the specified unit itself nor the parents.) */
733 while ((slice = UNIT_DEREF(u->slice))) {
/* Slice members are ordered after the slice, hence UNIT_BEFORE */
737 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
741 /* Skip units that have a dependency on the slice
742 * but aren't actually in it. */
743 if (UNIT_DEREF(m->slice) != slice)
746 /* No point in doing cgroup application for units
747 * without active processes. */
748 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
751 /* If the unit doesn't need any new controllers
752 * and has current ones realized, it doesn't need
754 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
757 unit_add_to_cgroup_queue(m);
/* Public entry point: realize this unit's cgroup synchronously, and queue
 * all of its siblings (up the slice chain) for deferred realization so
 * weight-based controllers stay fairly balanced. */
764 int unit_realize_cgroup(Unit *u) {
769 c = unit_get_cgroup_context(u);
773 /* So, here's the deal: when realizing the cgroups for this
774 * unit, we need to first create all parents, but there's more
775 * actually: for the weight-based controllers we also need to
776 * make sure that all our siblings (i.e. units that are in the
777 * same slice as we are) have cgroups, too. Otherwise, things
778 * would become very uneven as each of their processes would
779 * get as much resources as all our group together. This call
780 * will synchronously create the parent cgroups, but will
781 * defer work on the siblings to the next event loop
784 /* Add all sibling slices to the cgroup queue. */
785 unit_queue_siblings(u);
787 /* And realize this one now (and apply the values) */
788 return unit_realize_cgroup_now(u, manager_state(u->manager));
/* Tear down the unit's cgroup: trim it in all hierarchies (keeping the
 * root slice's own group), deregister it from the manager's map and clear
 * the unit's realization state. */
791 void unit_destroy_cgroup(Unit *u) {
799 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
/* Debug only: a busy cgroup simply can't be removed yet, not an error */
801 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
803 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
805 free(u->cgroup_path);
806 u->cgroup_path = NULL;
807 u->cgroup_realized = false;
808 u->cgroup_realized_mask = 0;
/* Heuristically find the unit's main PID by scanning its cgroup for a
 * process whose parent is not us (i.e. a daemonized child). Returns 0 when
 * no unambiguous candidate exists. */
812 pid_t unit_search_main_pid(Unit *u) {
813 _cleanup_fclose_ FILE *f = NULL;
814 pid_t pid = 0, npid, mypid;
821 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
825 while (cg_read_pid(f, &npid) > 0) {
831 /* Ignore processes that aren't our kids */
832 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
836 /* Dang, there's more than one daemonized PID
837 in this group, so we don't know what process
838 is the main process. */
/* One-time cgroup setup for the manager: determine our cgroup root,
 * install the release agent (system instance), attach ourselves to the
 * root group, pin the cgroupfs mount, and probe supported controllers. */
849 int manager_setup_cgroup(Manager *m) {
850 _cleanup_free_ char *path = NULL;
855 /* 1. Determine hierarchy */
856 free(m->cgroup_root);
857 m->cgroup_root = NULL;
859 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
861 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
865 /* LEGACY: Already in /system.slice? If so, let's cut this
866 * off. This is to support live upgrades from older systemd
867 * versions where PID 1 was moved there. */
868 if (m->running_as == SYSTEMD_SYSTEM) {
871 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
/* Even older versions used a plain "/system" group */
873 e = endswith(m->cgroup_root, "/system");
878 /* And make sure to store away the root value without trailing
879 * slash, even for the root dir, so that we can easily prepend
881 if (streq(m->cgroup_root, "/"))
882 m->cgroup_root[0] = 0;
885 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
887 log_error("Cannot find cgroup mount point: %s", strerror(-r));
891 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
894 /* 3. Install agent */
895 if (m->running_as == SYSTEMD_SYSTEM) {
896 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
898 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
900 log_debug("Installed release agent.");
902 log_debug("Release agent already installed.");
905 /* 4. Make sure we are in the root cgroup */
906 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
908 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
912 /* 5. And pin it, so that it cannot be unmounted */
913 safe_close(m->pin_cgroupfs_fd);
915 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
916 if (m->pin_cgroupfs_fd < 0) {
917 log_error("Failed to open pin file: %m");
921 /* 6. Always enable hierarchial support if it exists... */
/* Best effort; return value deliberately ignored */
922 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
925 /* 7. Figure out which controllers are supported */
926 m->cgroup_supported = cg_mask_supported();
/* Shutdown counterpart of manager_setup_cgroup(): optionally trim our
 * cgroup tree (we can't delete the group we live in), release the pin fd
 * and free the stored root path. */
931 void manager_shutdown_cgroup(Manager *m, bool delete) {
934 /* We can't really delete the group, since we are in it. But
936 if (delete && m->cgroup_root)
937 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
939 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
941 free(m->cgroup_root);
942 m->cgroup_root = NULL;
/* Map a cgroup path to the owning Unit: exact hashmap lookup first, then
 * (on lines elided here) fall back to walking up parent paths 'p'. */
945 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
952 u = hashmap_get(m->cgroup_unit, cgroup);
966 u = hashmap_get(m->cgroup_unit, p);
/* Resolve a PID to the Unit owning its cgroup (NULL-return error handling
 * elided from this view). */
972 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
973 _cleanup_free_ char *cgroup = NULL;
981 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
985 return manager_get_unit_by_cgroup(m, cgroup);
/* Handle a cgroup-empty notification: find the owning unit, re-verify the
 * cgroup is empty recursively, forward the event to the unit type's
 * notify_cgroup_empty() hook and schedule a GC pass for the unit. */
988 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
995 u = manager_get_unit_by_cgroup(m, cgroup);
997 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
999 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1000 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1002 unit_add_to_gc_queue(u);
/* String names for CGroupDevicePolicy, consumed by the generated
 * cgroup_device_policy_{to,from}_string() pair below (used e.g. in
 * cgroup_context_dump() and unit-file parsing). */
1009 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1010 [CGROUP_AUTO] = "auto",
1011 [CGROUP_CLOSED] = "closed",
1012 [CGROUP_STRICT] = "strict",
1015 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);