1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include "path-util.h"
27 #include "cgroup-util.h"
/* Fixed CFS scheduling period used when applying CPUQuota=: 100ms in usec.
 * Written into cpu.cfs_period_us; the quota is scaled against it below. */
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Initialize a zeroed CGroupContext to kernel defaults. The "(type) -1"
 * values encode "unset"; the same sentinels are tested throughout this file
 * (see cgroup_context_get_mask / cgroup_context_apply).
 * NOTE(review): this listing is truncated (original line numbers skip); the
 * function's closing brace and any assert() are not visible here. */
32 void cgroup_context_init(CGroupContext *c) {
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
/* Unlink one DeviceAllow= entry from the context's list.
 * NOTE(review): the free() of the entry (and its path) is not visible in
 * this truncated listing — confirm against the full source. */
47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
51 LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink one BlockIODeviceWeight= entry from the context's list.
 * NOTE(review): freeing of the entry itself is not visible here (truncated). */
56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink one BlockIO{Read,Write}Bandwidth= entry from the context's list.
 * NOTE(review): freeing of the entry itself is not visible here (truncated). */
65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release all list-backed members of a CGroupContext by repeatedly freeing
 * the list head until each list is empty. Safe to call on an initialized
 * but otherwise unused context (all loops are no-ops on empty lists). */
74 void cgroup_context_done(CGroupContext *c) {
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
/* Dump a human-readable rendering of the context to f, each line prefixed
 * with `prefix` (NULL is normalized to "" via strempty()). Scalar settings
 * are printed first in one fprintf(), then one line per device-allow,
 * blkio-weight and blkio-bandwidth list entry.
 * NOTE(review): listing is truncated — the fprintf(f, ...) call heads and
 * several argument lines are missing; format strings below are the visible
 * remainder of those calls. */
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
91 char u[FORMAT_TIMESPAN_MAX];
96 prefix = strempty(prefix);
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n"
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
114 prefix, c->startup_cpu_shares,
115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
116 prefix, c->blockio_weight,
117 prefix, c->startup_blockio_weight,
118 prefix, c->memory_limit,
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
/* One line per DeviceAllow= entry: path plus its r/w/m access string. */
122 LIST_FOREACH(device_allow, a, c->device_allow)
124 "%sDeviceAllow=%s %s%s%s\n",
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
131 "%sBlockIODeviceWeight=%s %lu",
/* Bandwidth entries render as read or write depending on b->read. */
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
144 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolve path p to the dev_t of a block device usable for blkio throttling:
 * if p is a block device node, use it directly; otherwise, if p lives on a
 * real (major != 0) file system device, use that backing device, resolving
 * partitions to their whole disk via block_get_whole_disk(). Fails for
 * virtual/remote file systems (major 0).
 * NOTE(review): truncated — the stat() call, *dev assignments and return
 * statements are not visible in this listing. */
148 static int lookup_blkio_device(const char *p, dev_t *dev) {
157 log_warning("Couldn't stat device %s: %m", p);
161 if (S_ISBLK(st.st_mode))
163 else if (major(st.st_dev) != 0) {
164 /* If this is not a device node then find the block
165 * device this file is stored on */
168 /* If this is a partition, try to get the originating
170 block_get_whole_disk(*dev, dev);
172 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Add a single device node to the devices cgroup whitelist of `path`:
 * stat the node, format "<c|b> <major>:<minor> <acc>" into buf, and write it
 * to devices.allow. Non-device paths are rejected with a warning. A missing
 * devices.allow attribute (-ENOENT) is only logged at debug level.
 * NOTE(review): truncated — snprintf/format call head and returns are not
 * visible. The stat() warning on line 188 lacks the %m/errno detail that the
 * sibling lookup_blkio_device() warning carries — possibly intentional. */
179 static int whitelist_device(const char *path, const char *node, const char *acc) {
/* Buffer sized for "c|b" + two decimal dev_t components + separators + access string. */
180 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
187 if (stat(node, &st) < 0) {
188 log_warning("Couldn't stat device %s", node);
192 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
193 log_warning("%s is not a device.", node);
199 S_ISCHR(st.st_mode) ? 'c' : 'b',
200 major(st.st_rdev), minor(st.st_rdev),
203 r = cg_set_attribute("devices", path, "devices.allow", buf);
205 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Whitelist every device whose /proc/devices name matches the fnmatch
 * pattern `name` of the given type ('c' or 'b'): scan /proc/devices, and for
 * each matching major number write "<type> <maj>:* <acc>" to devices.allow
 * under cgroup `path`.
 * NOTE(review): truncated — section-state tracking between the "Character
 * devices:" / "Block devices:" headers, the buf formatting, and the returns
 * are not visible in this listing. */
210 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
211 _cleanup_fclose_ FILE *f = NULL;
218 assert(type == 'b' || type == 'c');
220 f = fopen("/proc/devices", "re");
222 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
/* Iterate /proc/devices line by line; read errors jump to the fail label. */
226 FOREACH_LINE(line, f, goto fail) {
227 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
232 if (type == 'c' && streq(line, "Character devices:")) {
237 if (type == 'b' && streq(line, "Block devices:")) {
/* Each entry line is "<major> <name>": split at whitespace, parse the major. */
252 w = strpbrk(p, WHITESPACE);
257 r = safe_atou(p, &maj);
264 w += strspn(w, WHITESPACE);
/* fnmatch(): `name` may be a glob such as "kdbus/*". */
266 if (fnmatch(name, w, 0) != 0)
275 r = cg_set_attribute("devices", path, "devices.allow", buf);
277 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
283 log_warning("Failed to read /proc/devices: %m");
/* Write the context's settings into the kernel cgroup attributes under
 * `path`, but only for the controllers selected in `mask`. `state` selects
 * the Startup* variants of shares/weights while the manager is starting.
 * CPU and device settings are skipped on the root cgroup (is_root), since
 * the kernel does not support them there; -ENOENT from cg_set_attribute is
 * demoted to debug logging throughout, as a missing attribute just means
 * the controller lacks that feature.
 * NOTE(review): truncated — asserts, several braces and `continue`/`return`
 * lines are not visible in this listing. */
287 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
297 /* Some cgroup attributes are not support on the root cgroup,
298 * hence silently ignore */
299 is_root = isempty(path) || path_equal(path, "/");
301 if ((mask & CGROUP_CPU) && !is_root) {
302 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
/* StartupCPUShares= wins during startup if set; else CPUShares=; else kernel default 1024. */
304 sprintf(buf, "%lu\n",
305 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
306 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
307 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
309 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
/* Always pin the CFS period to 100ms so the quota below scales predictably. */
311 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
312 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
314 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
/* Quota is per-second; rescale it to the 100ms period. Unset means -1 (no limit). */
316 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
317 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
318 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
320 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
322 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
325 if (mask & CGROUP_BLKIO) {
/* One buffer reused for the weight, per-device weight, and bandwidth strings. */
326 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
327 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
328 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
329 CGroupBlockIODeviceWeight *w;
330 CGroupBlockIODeviceBandwidth *b;
/* Same startup/normal/default cascade as cpu.shares; blkio default weight is 1000. */
333 sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
334 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
335 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
337 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
339 /* FIXME: no way to reset this list */
340 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
343 r = lookup_blkio_device(w->path, &dev);
347 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
348 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
350 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
354 /* FIXME: no way to reset this list */
355 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
359 r = lookup_blkio_device(b->path, &dev);
363 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
365 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
366 r = cg_set_attribute("blkio", path, a, buf);
368 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
372 if (mask & CGROUP_MEMORY) {
/* (uint64_t) -1 is the "unset" sentinel; write "-1" to lift any prior limit. */
373 if (c->memory_limit != (uint64_t) -1) {
374 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
376 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
377 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
379 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
382 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
385 if ((mask & CGROUP_DEVICE) && !is_root) {
386 CGroupDeviceAllow *a;
/* Restrictive policy (or an explicit allow list): deny everything first,
 * then re-add entries. Otherwise open the cgroup up completely. */
388 if (c->device_allow || c->device_policy != CGROUP_AUTO)
389 r = cg_set_attribute("devices", path, "devices.deny", "a");
391 r = cg_set_attribute("devices", path, "devices.allow", "a");
393 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
395 if (c->device_policy == CGROUP_CLOSED ||
396 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
/* Standard pseudo-devices every service is expected to be able to use. */
397 static const char auto_devices[] =
398 "/dev/null\0" "rwm\0"
399 "/dev/zero\0" "rwm\0"
400 "/dev/full\0" "rwm\0"
401 "/dev/random\0" "rwm\0"
402 "/dev/urandom\0" "rwm\0"
404 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
408 NULSTR_FOREACH_PAIR(x, y, auto_devices)
409 whitelist_device(path, x, y);
411 whitelist_major(path, "pts", 'c', "rw");
412 whitelist_major(path, "kdbus", 'c', "rw");
413 whitelist_major(path, "kdbus/*", 'c', "rw");
/* User-configured DeviceAllow= entries: node paths go straight through,
 * "block-"/"char-" prefixes whitelist by major via /proc/devices. */
416 LIST_FOREACH(device_allow, a, c->device_allow) {
432 if (startswith(a->path, "/dev/"))
433 whitelist_device(path, a->path, acc);
434 else if (startswith(a->path, "block-"))
435 whitelist_major(path, a->path + 6, 'b', acc);
436 else if (startswith(a->path, "char-"))
437 whitelist_major(path, a->path + 5, 'c', acc);
439 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
/* Compute which cgroup controllers this context actually needs: a controller
 * is required iff its accounting flag is on or any of its settings deviates
 * from the "unset" sentinels established in cgroup_context_init().
 * NOTE(review): truncated — the final `return mask;` is not visible here. */
444 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
445 CGroupControllerMask mask = 0;
447 /* Figure out which controllers we need */
449 if (c->cpu_accounting ||
450 c->cpu_shares != (unsigned long) -1 ||
451 c->startup_cpu_shares != (unsigned long) -1 ||
452 c->cpu_quota_per_sec_usec != USEC_INFINITY)
453 mask |= CGROUP_CPUACCT | CGROUP_CPU;
455 if (c->blockio_accounting ||
456 c->blockio_weight != (unsigned long) -1 ||
457 c->startup_blockio_weight != (unsigned long) -1 ||
458 c->blockio_device_weights ||
459 c->blockio_device_bandwidths)
460 mask |= CGROUP_BLKIO;
462 if (c->memory_accounting ||
463 c->memory_limit != (uint64_t) -1)
464 mask |= CGROUP_MEMORY;
466 if (c->device_allow ||
467 c->device_policy != CGROUP_AUTO)
468 mask |= CGROUP_DEVICE;
/* Controller mask for one unit's own cgroup context. With Delegate= on and
 * a privileged exec context (or none), all controllers are enabled so the
 * payload can manage sub-cgroups itself; otherwise this defers to
 * cgroup_context_get_mask().
 * NOTE(review): truncated — the delegate-flag check guarding the exec-context
 * lookup is not visible in this listing. */
473 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
476 c = unit_get_cgroup_context(u);
480 /* If delegation is turned on, then turn on all cgroups,
481 * unless the process we fork into it is known to drop
482 * privileges anyway, and shouldn't get access to the
483 * controllers anyway. */
488 e = unit_get_exec_context(u);
489 if (!e || exec_context_maintains_privileges(e))
490 return _CGROUP_CONTROLLER_MASK_ALL;
493 return cgroup_context_get_mask(c);
/* Union of controller masks needed by all members of a slice, cached in
 * u->cgroup_members_mask (guarded by cgroup_members_mask_valid). Only slices
 * have members; they are found among the unit's UNIT_BEFORE dependencies,
 * filtered to those whose slice reference actually points back at u.
 * Recurses so the mask covers the whole subtree. */
496 CGroupControllerMask unit_get_members_mask(Unit *u) {
499 if (u->cgroup_members_mask_valid)
500 return u->cgroup_members_mask;
502 u->cgroup_members_mask = 0;
504 if (u->type == UNIT_SLICE) {
508 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
/* Ordered-after units that are not actually in this slice don't count. */
513 if (UNIT_DEREF(member->slice) != u)
516 u->cgroup_members_mask |=
517 unit_get_cgroup_mask(member) |
518 unit_get_members_mask(member);
522 u->cgroup_members_mask_valid = true;
523 return u->cgroup_members_mask;
/* Mask covering this unit and all its siblings: if the unit is in a slice,
 * that is the slice's members mask (which includes u itself); a unit without
 * a slice has no siblings, so it is just its own mask plus its members'. */
526 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
529 if (UNIT_ISSET(u->slice))
530 return unit_get_members_mask(UNIT_DEREF(u->slice));
532 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
/* The mask actually to be realized for this unit: own + members + siblings,
 * intersected with what the running kernel supports.
 * NOTE(review): truncated — `return mask;` is not visible in this listing. */
535 CGroupControllerMask unit_get_target_mask(Unit *u) {
536 CGroupControllerMask mask;
538 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
539 mask &= u->manager->cgroup_supported;
544 /* Recurse from a unit up through its containing slices, propagating
545 * mask bits upward. A unit is also member of itself. */
/* If only bits were added since the last run, the parent's cached members
 * mask can be updated in place; if bits were removed (or the old value is
 * unknown), the parent's cache is invalidated so it gets recomputed lazily.
 * NOTE(review): truncated — the early return and the bits_added/removed
 * variable declarations and condition names are not fully visible here. */
546 void unit_update_cgroup_members_masks(Unit *u) {
547 CGroupControllerMask m;
552 /* Calculate subtree mask */
553 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
555 /* See if anything changed from the previous invocation. If
556 * not, we're done. */
557 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
/* True iff the old mask was valid, new bits appeared, and none disappeared. */
561 u->cgroup_subtree_mask_valid &&
562 ((m & ~u->cgroup_subtree_mask) != 0) &&
563 ((~m & u->cgroup_subtree_mask) == 0);
565 u->cgroup_subtree_mask = m;
566 u->cgroup_subtree_mask_valid = true;
568 if (UNIT_ISSET(u->slice)) {
569 Unit *s = UNIT_DEREF(u->slice);
572 /* There's more set now than before. We
573 * propagate the new mask to the parent's mask
574 * (not caring if it actually was valid or
577 s->cgroup_members_mask |= m;
580 /* There's less set now than before (or we
581 * don't know), we need to recalculate
582 * everything, so let's invalidate the
583 * parent's members mask */
585 s->cgroup_members_mask_valid = false;
587 /* And now make sure that this change also hits our
589 unit_update_cgroup_members_masks(s);
/* Callback for cg_migrate_everywhere(): given a unit (userdata), walk up the
 * slice chain and return the cgroup path of the nearest ancestor (or the
 * unit itself) that is realized for all controllers in `mask`. Processes in
 * unrealized hierarchies get parked there.
 * NOTE(review): truncated — the loop construct and the fallback return for
 * when no ancestor qualifies are not visible in this listing. */
593 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
600 if (u->cgroup_path &&
601 u->cgroup_realized &&
602 (u->cgroup_realized_mask & mask) == mask)
603 return u->cgroup_path;
605 u = UNIT_DEREF(u->slice);
/* Create the unit's cgroup in every requested hierarchy and migrate stray
 * processes into it. The default cgroup path is registered in the manager's
 * cgroup_unit hashmap first (so empty-notifications can be mapped back to
 * the unit); ownership of `path` transfers to u->cgroup_path on success.
 * A migration failure is only a warning — the cgroup itself exists.
 * NOTE(review): truncated — error returns after hashmap_put and the final
 * `return 0;` are not visible; note the garbled "from to" in the warning
 * message on line 643 (present in the original). */
611 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
612 _cleanup_free_ char *path = NULL;
617 path = unit_default_cgroup_path(u);
621 r = hashmap_put(u->manager->cgroup_unit, path, u);
623 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
627 u->cgroup_path = path;
631 /* First, create our own group */
632 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
634 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
636 /* Keep track that this is now realized */
637 u->cgroup_realized = true;
638 u->cgroup_realized_mask = mask;
640 /* Then, possibly move things over */
641 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
643 log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
/* True iff the unit's cgroup exists and was realized with exactly this
 * controller mask — an exact match, not a superset check, so controller
 * removal also triggers re-realization. */
648 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
651 return u->cgroup_realized && u->cgroup_realized_mask == mask;
654 /* Check if necessary controllers and attributes for a unit are in place.
657 * If not, create paths, move processes over, and set attributes.
659 * Returns 0 on success and < 0 on failure. */
/* Synchronous realization: dequeue the unit if queued, short-circuit when
 * already realized with the target mask, realize the parent slice first
 * (recursively), then create/migrate this unit's cgroups and apply the
 * context attributes.
 * NOTE(review): truncated — the early `return 0`, the error-propagating
 * returns after the recursive call and unit_create_cgroups(), and the final
 * return are not visible in this listing. */
660 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
661 CGroupControllerMask mask;
666 if (u->in_cgroup_queue) {
667 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
668 u->in_cgroup_queue = false;
671 mask = unit_get_target_mask(u);
673 if (unit_has_mask_realized(u, mask))
676 /* First, realize parents */
677 if (UNIT_ISSET(u->slice)) {
678 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
683 /* And then do the real work */
684 r = unit_create_cgroups(u, mask);
688 /* Finally, apply the necessary attributes. */
689 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
/* Queue a unit for deferred cgroup realization; idempotent — a unit already
 * in the queue is not added twice. Drained by manager_dispatch_cgroup_queue(). */
694 static void unit_add_to_cgroup_queue(Unit *u) {
696 if (u->in_cgroup_queue)
699 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
700 u->in_cgroup_queue = true;
/* Drain the manager's cgroup realization queue, realizing each queued unit
 * with the current manager state. unit_realize_cgroup_now() removes each
 * unit from the queue head as it runs, advancing the loop; failures are
 * logged but do not stop the drain.
 * NOTE(review): truncated — the processed-unit counter and its return are
 * not visible, though the `unsigned` return type implies one. */
703 unsigned manager_dispatch_cgroup_queue(Manager *m) {
709 state = manager_state(m);
711 while ((i = m->cgroup_queue)) {
712 assert(i->in_cgroup_queue);
714 r = unit_realize_cgroup_now(i, state);
716 log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
/* Walk up the slice chain and enqueue every sibling at every level for
 * deferred realization, skipping units that are merely ordered against the
 * slice, inactive/failed units, and units already realized with their
 * target mask. The unit itself and the slices are deliberately excluded. */
724 static void unit_queue_siblings(Unit *u) {
727 /* This adds the siblings of the specified unit and the
728 * siblings of all parent units to the cgroup queue. (But
729 * neither the specified unit itself nor the parents.) */
731 while ((slice = UNIT_DEREF(u->slice))) {
735 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
739 /* Skip units that have a dependency on the slice
740 * but aren't actually in it. */
741 if (UNIT_DEREF(m->slice) != slice)
744 /* No point in doing cgroup application for units
745 * without active processes. */
746 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
749 /* If the unit doesn't need any new controllers
750 * and has current ones realized, it doesn't need
752 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
755 unit_add_to_cgroup_queue(m);
/* Public entry point: realize this unit's cgroup now, and schedule all its
 * siblings (via the queue) so weight-based controllers stay fair — a sibling
 * without its own cgroup would otherwise compete with our whole group.
 * NOTE(review): truncated — the early return for units without a cgroup
 * context is not visible in this listing. */
762 int unit_realize_cgroup(Unit *u) {
767 c = unit_get_cgroup_context(u);
771 /* So, here's the deal: when realizing the cgroups for this
772 * unit, we need to first create all parents, but there's more
773 * actually: for the weight-based controllers we also need to
774 * make sure that all our siblings (i.e. units that are in the
775 * same slice as we are) have cgroups, too. Otherwise, things
776 * would become very uneven as each of their processes would
777 * get as much resources as all our group together. This call
778 * will synchronously create the parent cgroups, but will
779 * defer work on the siblings to the next event loop
782 /* Add all sibling slices to the cgroup queue. */
783 unit_queue_siblings(u);
785 /* And realize this one now (and apply the values) */
786 return unit_realize_cgroup_now(u, manager_state(u->manager));
/* Tear down the unit's cgroup: trim it everywhere (deleting it unless this
 * is the root slice, which we live in), drop the hashmap registration, free
 * the path, and reset the realization state. Trim failures are debug-only
 * since processes may still linger.
 * NOTE(review): truncated — the guard for a unit without a cgroup_path is
 * not visible in this listing. */
789 void unit_destroy_cgroup(Unit *u) {
797 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
799 log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
801 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
803 free(u->cgroup_path);
804 u->cgroup_path = NULL;
805 u->cgroup_realized = false;
806 u->cgroup_realized_mask = 0;
/* Heuristically find the unit's main PID by enumerating the processes in its
 * cgroup: only direct children of the manager (ppid == mypid) are candidates;
 * if more than one such daemonized child exists the result is ambiguous.
 * NOTE(review): truncated — the pid bookkeeping between iterations, the
 * ambiguity handling, and the final return are not visible in this listing. */
810 pid_t unit_search_main_pid(Unit *u) {
811 _cleanup_fclose_ FILE *f = NULL;
812 pid_t pid = 0, npid, mypid;
819 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
823 while (cg_read_pid(f, &npid) > 0) {
829 /* Ignore processes that aren't our kids */
830 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
834 /* Dang, there's more than one daemonized PID
835 in this group, so we don't know what process
836 is the main process. */
/* One-time cgroup setup for the manager: determine our root cgroup, strip a
 * legacy /system(.slice) suffix when running as the system instance, resolve
 * the fs path, install the release agent (system instance only), attach
 * ourselves to the root cgroup, pin the cgroupfs with an O_DIRECTORY fd so
 * it cannot be unmounted under us, enable hierarchical memory accounting
 * (best-effort), and detect supported controllers.
 * NOTE(review): truncated — the suffix-stripping assignment after the
 * endswith() probes, the agent-already-installed branch conditions, the
 * errno-based return after open(), and the final return are not visible. */
847 int manager_setup_cgroup(Manager *m) {
848 _cleanup_free_ char *path = NULL;
853 /* 1. Determine hierarchy */
854 free(m->cgroup_root);
855 m->cgroup_root = NULL;
857 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
859 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
861 /* LEGACY: Already in /system.slice? If so, let's cut this
862 * off. This is to support live upgrades from older systemd
863 * versions where PID 1 was moved there. */
864 if (m->running_as == SYSTEMD_SYSTEM) {
867 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
869 e = endswith(m->cgroup_root, "/system");
874 /* And make sure to store away the root value without trailing
875 * slash, even for the root dir, so that we can easily prepend
877 if (streq(m->cgroup_root, "/"))
878 m->cgroup_root[0] = 0;
881 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
883 return log_error_errno(r, "Cannot find cgroup mount point: %m");
885 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
888 /* 3. Install agent */
889 if (m->running_as == SYSTEMD_SYSTEM) {
890 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
892 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
894 log_debug("Installed release agent.");
896 log_debug("Release agent already installed.");
899 /* 4. Make sure we are in the root cgroup */
900 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
902 return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
904 /* 5. And pin it, so that it cannot be unmounted */
905 safe_close(m->pin_cgroupfs_fd);
907 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
908 if (m->pin_cgroupfs_fd < 0) {
909 log_error("Failed to open pin file: %m");
913 /* 6. Always enable hierarchial support if it exists... */
914 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
917 /* 7. Figure out which controllers are supported */
918 m->cgroup_supported = cg_mask_supported();
/* Undo manager_setup_cgroup(): optionally trim our root cgroup (it cannot
 * be fully deleted since we are inside it), release the cgroupfs pin fd,
 * and free the stored root path. */
923 void manager_shutdown_cgroup(Manager *m, bool delete) {
926 /* We can't really delete the group, since we are in it. But
928 if (delete && m->cgroup_root)
929 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
931 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
933 free(m->cgroup_root);
934 m->cgroup_root = NULL;
/* Map a cgroup path back to its Unit via the manager's cgroup_unit hashmap,
 * trying the exact path first and then (visible below as a second lookup on
 * `p`) an ancestor path.
 * NOTE(review): truncated — the parent-path derivation between the two
 * hashmap_get() calls and the returns are not visible in this listing. */
937 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
944 u = hashmap_get(m->cgroup_unit, cgroup);
958 u = hashmap_get(m->cgroup_unit, p);
/* Resolve a PID to its owning Unit: look up the process's cgroup path and
 * delegate to manager_get_unit_by_cgroup(). The cgroup string is freed
 * automatically via _cleanup_free_. */
964 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
965 _cleanup_free_ char *cgroup = NULL;
973 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
977 return manager_get_unit_by_cgroup(m, cgroup);
/* Handle a cgroup-empty notification: map the path to a unit, re-verify the
 * cgroup is recursively empty, invoke the unit type's notify_cgroup_empty
 * vtable hook if it has one, and queue the unit for garbage collection.
 * NOTE(review): truncated — the guards for an unknown unit and a non-empty
 * recheck, plus the returns, are not visible in this listing. */
980 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
987 u = manager_get_unit_by_cgroup(m, cgroup);
989 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
991 if (UNIT_VTABLE(u)->notify_cgroup_empty)
992 UNIT_VTABLE(u)->notify_cgroup_empty(u);
994 unit_add_to_gc_queue(u);
/* String names for CGroupDevicePolicy, used for parsing/printing
 * DevicePolicy= (see cgroup_context_dump and cgroup_context_apply). The
 * DEFINE_STRING_TABLE_LOOKUP macro generates the _to_string/_from_string
 * pair from this table. */
1001 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1002 [CGROUP_AUTO] = "auto",
1003 [CGROUP_CLOSED] = "closed",
1004 [CGROUP_STRICT] = "strict",
1007 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);