src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <dirent.h>
  22 #include <errno.h>
  23 #include <ftw.h>
  24 //#include <limits.h>
  25 #include <signal.h>
  26 //#include <stddef.h>
  27 #include <stdio_ext.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/stat.h>
  31 //#include <sys/statfs.h>
  32 #include <sys/types.h>
  33 #include <sys/xattr.h>
  34 #include <unistd.h>
  35
  36 #include "alloc-util.h"
  37 #include "cgroup-util.h"
  38 //#include "def.h"
  39 #include "dirent-util.h"
  40 #include "extract-word.h"
  41 #include "fd-util.h"
  42 #include "fileio.h"
  43 #include "format-util.h"
  44 #include "fs-util.h"
  45 //#include "log.h"
  46 #include "login-util.h"
  47 #include "macro.h"
  48 //#include "missing.h"
  49 #include "mkdir.h"
  50 #include "parse-util.h"
  51 #include "path-util.h"
  52 #include "proc-cmdline.h"
  53 #include "process-util.h"
  54 #include "set.h"
  55 //#include "special.h"
  56 #include "stat-util.h"
  57 #include "stdio-util.h"
  58 #include "string-table.h"
  59 #include "string-util.h"
  60 #include "strv.h"
  61 #include "unit-name.h"
  62 #include "user-util.h"
  63
  64 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
  65         _cleanup_free_ char *fs = NULL;
  66         FILE *f;
  67         int r;
  68
  69         assert(_f);
  70
  71         r = cg_get_path(controller, path, "cgroup.procs", &fs);
  72         if (r < 0)
  73                 return r;
  74
  75         f = fopen(fs, "re");
  76         if (!f)
  77                 return -errno;
  78
  79         *_f = f;
  80         return 0;
  81 }
  82
  83 int cg_read_pid(FILE *f, pid_t *_pid) {
  84         unsigned long ul;
  85
  86         /* Note that the cgroup.procs might contain duplicates! See
  87          * cgroups.txt for details. */
  88
  89         assert(f);
  90         assert(_pid);
  91
  92         errno = 0;
  93         if (fscanf(f, "%lu", &ul) != 1) {
  94
  95                 if (feof(f))
  96                         return 0;
  97
  98                 return errno > 0 ? -errno : -EIO;
  99         }
 100
 101         if (ul <= 0)
 102                 return -EIO;
 103
 104         *_pid = (pid_t) ul;
 105         return 1;
 106 }
 107
 108 int cg_read_event(
 109                 const char *controller,
 110                 const char *path,
 111                 const char *event,
 112                 char **val) {
 113
 114         _cleanup_free_ char *events = NULL, *content = NULL;
 115         char *p, *line;
 116         int r;
 117
 118         r = cg_get_path(controller, path, "cgroup.events", &events);
 119         if (r < 0)
 120                 return r;
 121
 122         r = read_full_file(events, &content, NULL);
 123         if (r < 0)
 124                 return r;
 125
 126         p = content;
 127         while ((line = strsep(&p, "\n"))) {
 128                 char *key;
 129
 130                 key = strsep(&line, " ");
 131                 if (!key || !line)
 132                         return -EINVAL;
 133
 134                 if (strcmp(key, event))
 135                         continue;
 136
 137                 *val = strdup(line);
 138                 return 0;
 139         }
 140
 141         return -ENOENT;
 142 }
 143
 144 #if 0 /// UNNEEDED by elogind
 145 bool cg_ns_supported(void) {
 146         static thread_local int enabled = -1;
 147
 148         if (enabled >= 0)
 149                 return enabled;
 150
 151         if (access("/proc/self/ns/cgroup", F_OK) == 0)
 152                 enabled = 1;
 153         else
 154                 enabled = 0;
 155
 156         return enabled;
 157 }
 158 #endif // 0
 159
 160 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
 161         _cleanup_free_ char *fs = NULL;
 162         int r;
 163         DIR *d;
 164
 165         assert(_d);
 166
 167         /* This is not recursive! */
 168
 169         r = cg_get_path(controller, path, NULL, &fs);
 170         if (r < 0)
 171                 return r;
 172
 173         d = opendir(fs);
 174         if (!d)
 175                 return -errno;
 176
 177         *_d = d;
 178         return 0;
 179 }
 180
 181 int cg_read_subgroup(DIR *d, char **fn) {
 182         struct dirent *de;
 183
 184         assert(d);
 185         assert(fn);
 186
 187         FOREACH_DIRENT_ALL(de, d, return -errno) {
 188                 char *b;
 189
 190                 if (de->d_type != DT_DIR)
 191                         continue;
 192
 193                 if (dot_or_dot_dot(de->d_name))
 194                         continue;
 195
 196                 b = strdup(de->d_name);
 197                 if (!b)
 198                         return -ENOMEM;
 199
 200                 *fn = b;
 201                 return 1;
 202         }
 203
 204         return 0;
 205 }
 206
 207 int cg_rmdir(const char *controller, const char *path) {
 208         _cleanup_free_ char *p = NULL;
 209         int r;
 210
 211         r = cg_get_path(controller, path, NULL, &p);
 212         if (r < 0)
 213                 return r;
 214
 215         r = rmdir(p);
 216         if (r < 0 && errno != ENOENT)
 217                 return -errno;
 218
 219         r = cg_hybrid_unified();
 220         if (r < 0)
 221                 return r;
 222         if (r == 0)
 223                 return 0;
 224
 225         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 226                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 227                 if (r < 0)
 228                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 229         }
 230
 231         return 0;
 232 }
 233
 234 int cg_kill(
 235                 const char *controller,
 236                 const char *path,
 237                 int sig,
 238                 CGroupFlags flags,
 239                 Set *s,
 240                 cg_kill_log_func_t log_kill,
 241                 void *userdata) {
 242
 243         _cleanup_set_free_ Set *allocated_set = NULL;
 244         bool done = false;
 245         int r, ret = 0;
 246         pid_t my_pid;
 247
 248         assert(sig >= 0);
 249
 250          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 251           * SIGCONT on SIGKILL. */
 252         if (IN_SET(sig, SIGCONT, SIGKILL))
 253                 flags &= ~CGROUP_SIGCONT;
 254
 255         /* This goes through the tasks list and kills them all. This
 256          * is repeated until no further processes are added to the
 257          * tasks list, to properly handle forking processes */
 258
 259         if (!s) {
 260                 s = allocated_set = set_new(NULL);
 261                 if (!s)
 262                         return -ENOMEM;
 263         }
 264
 265         my_pid = getpid_cached();
 266
 267         do {
 268                 _cleanup_fclose_ FILE *f = NULL;
 269                 pid_t pid = 0;
 270                 done = true;
 271
 272                 r = cg_enumerate_processes(controller, path, &f);
 273                 if (r < 0) {
 274                         if (ret >= 0 && r != -ENOENT)
 275                                 return r;
 276
 277                         return ret;
 278                 }
 279
 280                 while ((r = cg_read_pid(f, &pid)) > 0) {
 281
 282                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 283                                 continue;
 284
 285                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 286                                 continue;
 287
 288                         if (log_kill)
 289                                 log_kill(pid, sig, userdata);
 290
 291                         /* If we haven't killed this process yet, kill
 292                          * it */
 293                         if (kill(pid, sig) < 0) {
 294                                 if (ret >= 0 && errno != ESRCH)
 295                                         ret = -errno;
 296                         } else {
 297                                 if (flags & CGROUP_SIGCONT)
 298                                         (void) kill(pid, SIGCONT);
 299
 300                                 if (ret == 0)
 301                                         ret = 1;
 302                         }
 303
 304                         done = false;
 305
 306                         r = set_put(s, PID_TO_PTR(pid));
 307                         if (r < 0) {
 308                                 if (ret >= 0)
 309                                         return r;
 310
 311                                 return ret;
 312                         }
 313                 }
 314
 315                 if (r < 0) {
 316                         if (ret >= 0)
 317                                 return r;
 318
 319                         return ret;
 320                 }
 321
 322                 /* To avoid racing against processes which fork
 323                  * quicker than we can kill them we repeat this until
 324                  * no new pids need to be killed. */
 325
 326         } while (!done);
 327
 328         return ret;
 329 }
 330
 331 int cg_kill_recursive(
 332                 const char *controller,
 333                 const char *path,
 334                 int sig,
 335                 CGroupFlags flags,
 336                 Set *s,
 337                 cg_kill_log_func_t log_kill,
 338                 void *userdata) {
 339
 340         _cleanup_set_free_ Set *allocated_set = NULL;
 341         _cleanup_closedir_ DIR *d = NULL;
 342         int r, ret;
 343         char *fn;
 344
 345         assert(path);
 346         assert(sig >= 0);
 347
 348         if (!s) {
 349                 s = allocated_set = set_new(NULL);
 350                 if (!s)
 351                         return -ENOMEM;
 352         }
 353
 354         ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
 355
 356         r = cg_enumerate_subgroups(controller, path, &d);
 357         if (r < 0) {
 358                 if (ret >= 0 && r != -ENOENT)
 359                         return r;
 360
 361                 return ret;
 362         }
 363
 364         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 365                 _cleanup_free_ char *p = NULL;
 366
 367                 p = strjoin(path, "/", fn);
 368                 free(fn);
 369                 if (!p)
 370                         return -ENOMEM;
 371
 372                 r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
 373                 if (r != 0 && ret >= 0)
 374                         ret = r;
 375         }
 376         if (ret >= 0 && r < 0)
 377                 ret = r;
 378
 379         if (flags & CGROUP_REMOVE) {
 380                 r = cg_rmdir(controller, path);
 381                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 382                         return r;
 383         }
 384
 385         return ret;
 386 }
 387
 388 int cg_migrate(
 389                 const char *cfrom,
 390                 const char *pfrom,
 391                 const char *cto,
 392                 const char *pto,
 393                 CGroupFlags flags) {
 394
 395         bool done = false;
 396         _cleanup_set_free_ Set *s = NULL;
 397         int r, ret = 0;
 398         pid_t my_pid;
 399
 400         assert(cfrom);
 401         assert(pfrom);
 402         assert(cto);
 403         assert(pto);
 404
 405         s = set_new(NULL);
 406         if (!s)
 407                 return -ENOMEM;
 408
 409         my_pid = getpid_cached();
 410
 411         log_debug_elogind("Migrating \"%s\"/\"%s\" to \"%s\"/\"%s\" (%s)",
 412                           cfrom, pfrom, cto, pto,
 413                           (flags & CGROUP_IGNORE_SELF)
 414                           ? "ignoring self" : "watching self");
 415         do {
 416                 _cleanup_fclose_ FILE *f = NULL;
 417                 pid_t pid = 0;
 418                 done = true;
 419
 420                 r = cg_enumerate_processes(cfrom, pfrom, &f);
 421                 if (r < 0) {
 422                         if (ret >= 0 && r != -ENOENT)
 423                                 return r;
 424
 425                         return ret;
 426                 }
 427
 428                 while ((r = cg_read_pid(f, &pid)) > 0) {
 429
 430                         /* This might do weird stuff if we aren't a
 431                          * single-threaded program. However, we
 432                          * luckily know we are not */
 433                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 434                                 continue;
 435
 436                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 437                                 continue;
 438
 439                         /* Ignore kernel threads. Since they can only
 440                          * exist in the root cgroup, we only check for
 441                          * them there. */
 442                         if (cfrom &&
 443                             (isempty(pfrom) || path_equal(pfrom, "/")) &&
 444                             is_kernel_thread(pid) > 0)
 445                                 continue;
 446
 447                         r = cg_attach(cto, pto, pid);
 448                         if (r < 0) {
 449                                 if (ret >= 0 && r != -ESRCH)
 450                                         ret = r;
 451                         } else if (ret == 0)
 452                                 ret = 1;
 453
 454                         done = false;
 455
 456                         r = set_put(s, PID_TO_PTR(pid));
 457                         if (r < 0) {
 458                                 if (ret >= 0)
 459                                         return r;
 460
 461                                 return ret;
 462                         }
 463                 }
 464
 465                 if (r < 0) {
 466                         if (ret >= 0)
 467                                 return r;
 468
 469                         return ret;
 470                 }
 471         } while (!done);
 472
 473         return ret;
 474 }
 475
 476 int cg_migrate_recursive(
 477                 const char *cfrom,
 478                 const char *pfrom,
 479                 const char *cto,
 480                 const char *pto,
 481                 CGroupFlags flags) {
 482
 483         _cleanup_closedir_ DIR *d = NULL;
 484         int r, ret = 0;
 485         char *fn;
 486
 487         assert(cfrom);
 488         assert(pfrom);
 489         assert(cto);
 490         assert(pto);
 491
 492         ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
 493
 494         r = cg_enumerate_subgroups(cfrom, pfrom, &d);
 495         if (r < 0) {
 496                 if (ret >= 0 && r != -ENOENT)
 497                         return r;
 498
 499                 return ret;
 500         }
 501
 502         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 503                 _cleanup_free_ char *p = NULL;
 504
 505                 p = strjoin(pfrom, "/", fn);
 506                 free(fn);
 507                 if (!p)
 508                         return -ENOMEM;
 509
 510                 r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
 511                 if (r != 0 && ret >= 0)
 512                         ret = r;
 513         }
 514
 515         if (r < 0 && ret >= 0)
 516                 ret = r;
 517
 518         if (flags & CGROUP_REMOVE) {
 519                 r = cg_rmdir(cfrom, pfrom);
 520                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 521                         return r;
 522         }
 523
 524         return ret;
 525 }
 526
 527 int cg_migrate_recursive_fallback(
 528                 const char *cfrom,
 529                 const char *pfrom,
 530                 const char *cto,
 531                 const char *pto,
 532                 CGroupFlags flags) {
 533
 534         int r;
 535
 536         assert(cfrom);
 537         assert(pfrom);
 538         assert(cto);
 539         assert(pto);
 540
 541         r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
 542         if (r < 0) {
 543                 char prefix[strlen(pto) + 1];
 544
 545                 /* This didn't work? Then let's try all prefixes of the destination */
 546
 547                 PATH_FOREACH_PREFIX(prefix, pto) {
 548                         int q;
 549
 550                         q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
 551                         if (q >= 0)
 552                                 return q;
 553                 }
 554         }
 555
 556         return r;
 557 }
 558
 559 static const char *controller_to_dirname(const char *controller) {
 560         const char *e;
 561
 562         assert(controller);
 563
 564         /* Converts a controller name to the directory name below
 565          * /sys/fs/cgroup/ we want to mount it to. Effectively, this
 566          * just cuts off the name= prefixed used for named
 567          * hierarchies, if it is specified. */
 568
 569         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 570                 if (cg_hybrid_unified() > 0)
 571                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 572                 else
 573                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 574         }
 575
 576         e = startswith(controller, "name=");
 577         if (e)
 578                 return e;
 579
 580         return controller;
 581 }
 582
 583 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
 584         const char *dn;
 585         char *t = NULL;
 586
 587         assert(fs);
 588         assert(controller);
 589
 590         dn = controller_to_dirname(controller);
 591
 592         if (isempty(path) && isempty(suffix))
 593                 t = strappend("/sys/fs/cgroup/", dn);
 594         else if (isempty(path))
 595                 t = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
 596         else if (isempty(suffix))
 597                 t = strjoin("/sys/fs/cgroup/", dn, "/", path);
 598         else
 599                 t = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
 600         if (!t)
 601                 return -ENOMEM;
 602
 603         *fs = t;
 604         return 0;
 605 }
 606
 607 static int join_path_unified(const char *path, const char *suffix, char **fs) {
 608         char *t;
 609
 610         assert(fs);
 611
 612         if (isempty(path) && isempty(suffix))
 613                 t = strdup("/sys/fs/cgroup");
 614         else if (isempty(path))
 615                 t = strappend("/sys/fs/cgroup/", suffix);
 616         else if (isempty(suffix))
 617                 t = strappend("/sys/fs/cgroup/", path);
 618         else
 619                 t = strjoin("/sys/fs/cgroup/", path, "/", suffix);
 620         if (!t)
 621                 return -ENOMEM;
 622
 623         *fs = t;
 624         return 0;
 625 }
 626
 627 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
 628         int r;
 629
 630         assert(fs);
 631
 632         if (!controller) {
 633                 char *t;
 634
 635                 /* If no controller is specified, we return the path
 636                  * *below* the controllers, without any prefix. */
 637
 638                 if (!path && !suffix)
 639                         return -EINVAL;
 640
 641                 if (!suffix)
 642                         t = strdup(path);
 643                 else if (!path)
 644                         t = strdup(suffix);
 645                 else
 646                         t = strjoin(path, "/", suffix);
 647                 if (!t)
 648                         return -ENOMEM;
 649
 650                 *fs = path_kill_slashes(t);
 651                 return 0;
 652         }
 653
 654         if (!cg_controller_is_valid(controller))
 655                 return -EINVAL;
 656
 657         r = cg_all_unified();
 658         if (r < 0)
 659                 return r;
 660         if (r > 0)
 661                 r = join_path_unified(path, suffix, fs);
 662         else
 663                 r = join_path_legacy(controller, path, suffix, fs);
 664         if (r < 0)
 665                 return r;
 666
 667         path_kill_slashes(*fs);
 668         return 0;
 669 }
 670
 671 static int controller_is_accessible(const char *controller) {
 672         int r;
 673
 674         assert(controller);
 675
 676         /* Checks whether a specific controller is accessible,
 677          * i.e. its hierarchy mounted. In the unified hierarchy all
 678          * controllers are considered accessible, except for the named
 679          * hierarchies */
 680
 681         if (!cg_controller_is_valid(controller))
 682                 return -EINVAL;
 683
 684         r = cg_all_unified();
 685         if (r < 0)
 686                 return r;
 687         if (r > 0) {
 688                 /* We don't support named hierarchies if we are using
 689                  * the unified hierarchy. */
 690
 691                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 692                         return 0;
 693
 694                 if (startswith(controller, "name="))
 695                         return -EOPNOTSUPP;
 696
 697         } else {
 698                 const char *cc, *dn;
 699
 700                 dn = controller_to_dirname(controller);
 701                 cc = strjoina("/sys/fs/cgroup/", dn);
 702
 703                 if (laccess(cc, F_OK) < 0)
 704                         return -errno;
 705         }
 706
 707         return 0;
 708 }
 709
 710 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
 711         int r;
 712
 713         assert(controller);
 714         assert(fs);
 715
 716         /* Check if the specified controller is actually accessible */
 717         r = controller_is_accessible(controller);
 718         if (r < 0)
 719                 return r;
 720
 721         return cg_get_path(controller, path, suffix, fs);
 722 }
 723
 724 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
 725         assert(path);
 726         assert(sb);
 727         assert(ftwbuf);
 728
 729         if (typeflag != FTW_DP)
 730                 return 0;
 731
 732         if (ftwbuf->level < 1)
 733                 return 0;
 734
 735         (void) rmdir(path);
 736         return 0;
 737 }
 738
 739 int cg_trim(const char *controller, const char *path, bool delete_root) {
 740         _cleanup_free_ char *fs = NULL;
 741         int r = 0, q;
 742
 743         assert(path);
 744
 745         r = cg_get_path(controller, path, NULL, &fs);
 746         if (r < 0)
 747                 return r;
 748
 749         errno = 0;
 750         if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
 751                 if (errno == ENOENT)
 752                         r = 0;
 753                 else if (errno > 0)
 754                         r = -errno;
 755                 else
 756                         r = -EIO;
 757         }
 758
 759         if (delete_root) {
 760                 if (rmdir(fs) < 0 && errno != ENOENT)
 761                         return -errno;
 762         }
 763
 764         q = cg_hybrid_unified();
 765         if (q < 0)
 766                 return q;
 767         if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 768                 q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
 769                 if (q < 0)
 770                         log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
 771         }
 772
 773         return r;
 774 }
 775
 776 int cg_create(const char *controller, const char *path) {
 777         _cleanup_free_ char *fs = NULL;
 778         int r;
 779
 780         r = cg_get_path_and_check(controller, path, NULL, &fs);
 781         if (r < 0)
 782                 return r;
 783
 784         r = mkdir_parents(fs, 0755);
 785         if (r < 0)
 786                 return r;
 787
 788         r = mkdir_errno_wrapper(fs, 0755);
 789         if (r == -EEXIST)
 790                 return 0;
 791         if (r < 0)
 792                 return r;
 793
 794         r = cg_hybrid_unified();
 795         if (r < 0)
 796                 return r;
 797
 798         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 799                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 800                 if (r < 0)
 801                         log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
 802         }
 803
 804         return 1;
 805 }
 806
 807 int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
 808         int r, q;
 809
 810         assert(pid >= 0);
 811
 812         r = cg_create(controller, path);
 813         if (r < 0)
 814                 return r;
 815
 816         q = cg_attach(controller, path, pid);
 817         if (q < 0)
 818                 return q;
 819
 820         /* This does not remove the cgroup on failure */
 821         return r;
 822 }
 823
 824 int cg_attach(const char *controller, const char *path, pid_t pid) {
 825         _cleanup_free_ char *fs = NULL;
 826         char c[DECIMAL_STR_MAX(pid_t) + 2];
 827         int r;
 828
 829         assert(path);
 830         assert(pid >= 0);
 831
 832         r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
 833         if (r < 0)
 834                 return r;
 835
 836         if (pid == 0)
 837                 pid = getpid_cached();
 838
 839         xsprintf(c, PID_FMT "\n", pid);
 840
 841         r = write_string_file(fs, c, 0);
 842         if (r < 0)
 843                 return r;
 844
 845         r = cg_hybrid_unified();
 846         if (r < 0)
 847                 return r;
 848
 849         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 850                 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
 851                 if (r < 0)
 852                         log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
 853         }
 854
 855         return 0;
 856 }
 857
 858 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
 859         int r;
 860
 861         assert(controller);
 862         assert(path);
 863         assert(pid >= 0);
 864
 865         r = cg_attach(controller, path, pid);
 866         if (r < 0) {
 867                 char prefix[strlen(path) + 1];
 868
 869                 /* This didn't work? Then let's try all prefixes of
 870                  * the destination */
 871
 872                 PATH_FOREACH_PREFIX(prefix, path) {
 873                         int q;
 874
 875                         q = cg_attach(controller, prefix, pid);
 876                         if (q >= 0)
 877                                 return q;
 878                 }
 879         }
 880
 881         return r;
 882 }
 883
 884 #if 0 /// UNNEEDED by elogind
 885 int cg_set_access(
 886                 const char *controller,
 887                 const char *path,
 888                 uid_t uid,
 889                 gid_t gid) {
 890
 891         struct Attribute {
 892                 const char *name;
 893                 bool fatal;
 894         };
 895
 896         /* cgroupsv1, aka legacy/non-unified */
 897         static const struct Attribute legacy_attributes[] = {
 898                 { "cgroup.procs",           true  },
 899                 { "tasks",                  false },
 900                 { "cgroup.clone_children",  false },
 901                 {},
 902         };
 903
 904         /* cgroupsv2, aka unified */
 905         static const struct Attribute unified_attributes[] = {
 906                 { "cgroup.procs",           true  },
 907                 { "cgroup.subtree_control", true  },
 908                 { "cgroup.threads",         false },
 909                 {},
 910         };
 911
 912         static const struct Attribute* const attributes[] = {
 913                 [false] = legacy_attributes,
 914                 [true]  = unified_attributes,
 915         };
 916
 917         _cleanup_free_ char *fs = NULL;
 918         const struct Attribute *i;
 919         int r, unified;
 920
 921         assert(path);
 922
 923         if (uid == UID_INVALID && gid == GID_INVALID)
 924                 return 0;
 925
 926         unified = cg_unified_controller(controller);
 927         if (unified < 0)
 928                 return unified;
 929
 930         /* Configure access to the cgroup itself */
 931         r = cg_get_path(controller, path, NULL, &fs);
 932         if (r < 0)
 933                 return r;
 934
 935         r = chmod_and_chown(fs, 0755, uid, gid);
 936         if (r < 0)
 937                 return r;
 938
 939         /* Configure access to the cgroup's attributes */
 940         for (i = attributes[unified]; i->name; i++) {
 941                 fs = mfree(fs);
 942
 943                 r = cg_get_path(controller, path, i->name, &fs);
 944                 if (r < 0)
 945                         return r;
 946
 947                 r = chmod_and_chown(fs, 0644, uid, gid);
 948                 if (r < 0) {
 949                         if (i->fatal)
 950                                 return r;
 951
 952                         log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
 953                 }
 954         }
 955
 956         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 957                 r = cg_hybrid_unified();
 958                 if (r < 0)
 959                         return r;
 960                 if (r > 0) {
 961                         /* Always propagate access mode from unified to legacy controller */
 962                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
 963                         if (r < 0)
 964                                 log_debug_errno(r, "Failed to set access on compatibility elogind cgroup %s, ignoring: %m", path);
 965                 }
 966         }
 967
 968         return 0;
 969 }
 970
 971 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
 972         _cleanup_free_ char *fs = NULL;
 973         int r;
 974
 975         assert(path);
 976         assert(name);
 977         assert(value || size <= 0);
 978
 979         r = cg_get_path(controller, path, NULL, &fs);
 980         if (r < 0)
 981                 return r;
 982
 983         if (setxattr(fs, name, value, size, flags) < 0)
 984                 return -errno;
 985
 986         return 0;
 987 }
 988
 989 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
 990         _cleanup_free_ char *fs = NULL;
 991         ssize_t n;
 992         int r;
 993
 994         assert(path);
 995         assert(name);
 996
 997         r = cg_get_path(controller, path, NULL, &fs);
 998         if (r < 0)
 999                 return r;
1000
1001         n = getxattr(fs, name, value, size);
1002         if (n < 0)
1003                 return -errno;
1004
1005         return (int) n;
1006 }
1007 #endif // 0
1008
1009 int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
1010         _cleanup_fclose_ FILE *f = NULL;
1011         char line[LINE_MAX];
1012 #if 0 /// At elogind we do not want that (false alarm) "maybe uninitialized" warning
1013         const char *fs, *controller_str;
1014 #else
1015         const char *fs, *controller_str = NULL;
1016 #endif // 0
1017         size_t cs = 0;
1018         int unified;
1019
1020         assert(path);
1021         assert(pid >= 0);
1022
1023         if (controller) {
1024                 if (!cg_controller_is_valid(controller))
1025                         return -EINVAL;
1026         } else
1027                 controller = SYSTEMD_CGROUP_CONTROLLER;
1028
1029         unified = cg_unified_controller(controller);
1030         if (unified < 0)
1031                 return unified;
1032         if (unified == 0) {
1033                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
1034                         controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
1035                 else
1036                         controller_str = controller;
1037
1038                 cs = strlen(controller_str);
1039         }
1040
1041         fs = procfs_file_alloca(pid, "cgroup");
1042         log_debug_elogind("Searching for PID %u in \"%s\" (controller \"%s\")",
1043                           pid, fs, controller);
1044         f = fopen(fs, "re");
1045         if (!f)
1046                 return errno == ENOENT ? -ESRCH : -errno;
1047
1048         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
1049
1050         FOREACH_LINE(line, f, return -errno) {
1051                 char *e, *p;
1052
1053                 truncate_nl(line);
1054
1055                 if (unified) {
1056                         e = startswith(line, "0:");
1057                         if (!e)
1058                                 continue;
1059
1060                         e = strchr(e, ':');
1061                         if (!e)
1062                                 continue;
1063                 } else {
1064                         char *l;
1065                         size_t k;
1066                         const char *word, *state;
1067                         bool found = false;
1068
1069                         l = strchr(line, ':');
1070                         if (!l)
1071                                 continue;
1072
1073                         l++;
1074                         e = strchr(l, ':');
1075                         if (!e)
1076                                 continue;
1077
1078                         *e = 0;
1079                         FOREACH_WORD_SEPARATOR(word, k, l, ",", state) {
1080                                 if (k == cs && memcmp(word, controller_str, cs) == 0) {
1081                                         found = true;
1082                                         break;
1083                                 }
1084                         }
1085
1086                         if (!found)
1087                                 continue;
1088                 }
1089
1090                 log_debug_elogind("Found %s:%s", line, e+1);
1091                 p = strdup(e + 1);
1092                 if (!p)
1093                         return -ENOMEM;
1094
1095                 /* Truncate suffix indicating the process is a zombie */
1096                 e = endswith(p, " (deleted)");
1097                 if (e)
1098                         *e = 0;
1099
1100                 *path = p;
1101                 return 0;
1102         }
1103
1104         return -ENODATA;
1105 }
1106
1107 #if 0 /// UNNEEDED by elogind
1108 int cg_install_release_agent(const char *controller, const char *agent) {
1109         _cleanup_free_ char *fs = NULL, *contents = NULL;
1110         const char *sc;
1111         int r;
1112
1113         assert(agent);
1114
1115         r = cg_unified_controller(controller);
1116         if (r < 0)
1117                 return r;
1118         if (r > 0) /* doesn't apply to unified hierarchy */
1119                 return -EOPNOTSUPP;
1120
1121         r = cg_get_path(controller, NULL, "release_agent", &fs);
1122         if (r < 0)
1123                 return r;
1124
1125         r = read_one_line_file(fs, &contents);
1126         if (r < 0)
1127                 return r;
1128
1129         sc = strstrip(contents);
1130         if (isempty(sc)) {
1131                 r = write_string_file(fs, agent, 0);
1132                 if (r < 0)
1133                         return r;
1134         } else if (!path_equal(sc, agent))
1135                 return -EEXIST;
1136
1137         fs = mfree(fs);
1138         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1139         if (r < 0)
1140                 return r;
1141
1142         contents = mfree(contents);
1143         r = read_one_line_file(fs, &contents);
1144         if (r < 0)
1145                 return r;
1146
1147         sc = strstrip(contents);
1148         if (streq(sc, "0")) {
1149                 r = write_string_file(fs, "1", 0);
1150                 if (r < 0)
1151                         return r;
1152
1153                 return 1;
1154         }
1155
1156         if (!streq(sc, "1"))
1157                 return -EIO;
1158
1159         return 0;
1160 }
1161
1162 int cg_uninstall_release_agent(const char *controller) {
1163         _cleanup_free_ char *fs = NULL;
1164         int r;
1165
1166         r = cg_unified_controller(controller);
1167         if (r < 0)
1168                 return r;
1169         if (r > 0) /* Doesn't apply to unified hierarchy */
1170                 return -EOPNOTSUPP;
1171
1172         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1173         if (r < 0)
1174                 return r;
1175
1176         r = write_string_file(fs, "0", 0);
1177         if (r < 0)
1178                 return r;
1179
1180         fs = mfree(fs);
1181
1182         r = cg_get_path(controller, NULL, "release_agent", &fs);
1183         if (r < 0)
1184                 return r;
1185
1186         r = write_string_file(fs, "", 0);
1187         if (r < 0)
1188                 return r;
1189
1190         return 0;
1191 }
1192 #endif // 0
1193
1194 int cg_is_empty(const char *controller, const char *path) {
1195         _cleanup_fclose_ FILE *f = NULL;
1196         pid_t pid;
1197         int r;
1198
1199         assert(path);
1200
1201         r = cg_enumerate_processes(controller, path, &f);
1202         if (r == -ENOENT)
1203                 return 1;
1204         if (r < 0)
1205                 return r;
1206
1207         r = cg_read_pid(f, &pid);
1208         if (r < 0)
1209                 return r;
1210
1211         return r == 0;
1212 }
1213
1214 int cg_is_empty_recursive(const char *controller, const char *path) {
1215         int r;
1216
1217         assert(path);
1218
1219         /* The root cgroup is always populated */
1220         if (controller && (isempty(path) || path_equal(path, "/")))
1221                 return false;
1222
1223         r = cg_unified_controller(controller);
1224         if (r < 0)
1225                 return r;
1226         if (r > 0) {
1227                 _cleanup_free_ char *t = NULL;
1228
1229                 /* On the unified hierarchy we can check empty state
1230                  * via the "populated" attribute of "cgroup.events". */
1231
1232                 r = cg_read_event(controller, path, "populated", &t);
1233                 if (r < 0)
1234                         return r;
1235
1236                 return streq(t, "0");
1237         } else {
1238                 _cleanup_closedir_ DIR *d = NULL;
1239                 char *fn;
1240
1241                 r = cg_is_empty(controller, path);
1242                 if (r <= 0)
1243                         return r;
1244
1245                 r = cg_enumerate_subgroups(controller, path, &d);
1246                 if (r == -ENOENT)
1247                         return 1;
1248                 if (r < 0)
1249                         return r;
1250
1251                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1252                         _cleanup_free_ char *p = NULL;
1253
1254                         p = strjoin(path, "/", fn);
1255                         free(fn);
1256                         if (!p)
1257                                 return -ENOMEM;
1258
1259                         r = cg_is_empty_recursive(controller, p);
1260                         if (r <= 0)
1261                                 return r;
1262                 }
1263                 if (r < 0)
1264                         return r;
1265
1266                 return true;
1267         }
1268 }
1269
/* Splits a cgroup specification into its controller and path part. Three forms are accepted:
 *
 *   "/some/path"            → controller NULL, path set
 *   "controller"            → controller set, path NULL
 *   "controller:/some/path" → both set (path NULL if nothing follows the colon)
 *
 * 'controller' and 'path' may each be NULL if the caller is not interested in that part. The
 * returned strings are newly allocated. Returns 0 on success, -EINVAL for malformed input and
 * -ENOMEM on allocation failure. */
int cg_split_spec(const char *spec, char **controller, char **path) {
        char *ctl = NULL, *pth = NULL;
        const char *colon;
        int r;

        assert(spec);

        /* Form 1: nothing but an absolute path */
        if (spec[0] == '/') {
                if (!path_is_normalized(spec))
                        return -EINVAL;

                if (path) {
                        pth = strdup(spec);
                        if (!pth)
                                return -ENOMEM;

                        *path = path_kill_slashes(pth);
                }

                if (controller)
                        *controller = NULL;

                return 0;
        }

        colon = strchr(spec, ':');

        /* Form 2: nothing but a controller name */
        if (!colon) {
                if (!cg_controller_is_valid(spec))
                        return -EINVAL;

                if (controller) {
                        ctl = strdup(spec);
                        if (!ctl)
                                return -ENOMEM;

                        *controller = ctl;
                }

                if (path)
                        *path = NULL;

                return 0;
        }

        /* Form 3: controller and path, separated by a colon */
        ctl = strndup(spec, colon - spec);
        if (!ctl)
                return -ENOMEM;

        if (!cg_controller_is_valid(ctl)) {
                r = -EINVAL;
                goto fail;
        }

        if (!isempty(colon + 1)) {
                pth = strdup(colon + 1);
                if (!pth) {
                        r = -ENOMEM;
                        goto fail;
                }

                if (!(path_is_normalized(pth) && path_is_absolute(pth))) {
                        r = -EINVAL;
                        goto fail;
                }

                path_kill_slashes(pth);
        }

        /* Hand over what the caller asked for, drop the rest */
        if (controller)
                *controller = ctl;
        else
                free(ctl);

        if (path)
                *path = pth;
        else
                free(pth);

        return 0;

fail:
        free(pth);
        free(ctl);
        return r;
}
1352
1353 int cg_mangle_path(const char *path, char **result) {
1354         _cleanup_free_ char *c = NULL, *p = NULL;
1355         char *t;
1356         int r;
1357
1358         assert(path);
1359         assert(result);
1360
1361         /* First, check if it already is a filesystem path */
1362         if (path_startswith(path, "/sys/fs/cgroup")) {
1363
1364                 t = strdup(path);
1365                 if (!t)
1366                         return -ENOMEM;
1367
1368                 *result = path_kill_slashes(t);
1369                 return 0;
1370         }
1371
1372         /* Otherwise, treat it as cg spec */
1373         r = cg_split_spec(path, &c, &p);
1374         if (r < 0)
1375                 return r;
1376
1377         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1378 }
1379
/* Determines the root of our cgroup tree: the cgroup path of PID 1 in the hierarchy of
 * SYSTEMD_CGROUP_CONTROLLER, with a trailing "/elogind" component removed if there is one.
 * On success 0 is returned and *path receives the string obtained from cg_pid_get_path(), which
 * the caller has to free(). On failure the error of cg_pid_get_path() is passed on. */
int cg_get_root_path(char **path) {
        char *p, *e;
        int r;

        assert(path);

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
        if (r < 0)
                return r;

#if 0 /// elogind does not support systemd scopes and slices
        e = endswith(p, "/" SPECIAL_INIT_SCOPE);
        if (!e)
                e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
        if (!e)
                e = endswith(p, "/system"); /* even more legacy */
#else
        e = endswith(p, "/elogind");
#endif // 0
        /* Cut the recognized suffix off in place */
        if (e)
                *e = 0;

        *path = p;
        return 0;
}
1405
/* Strips the prefix 'root' from the cgroup path 'cgroup'. If 'root' is NULL it is determined
 * with cg_get_root_path(). *shifted is set to a pointer into 'cgroup' itself (nothing is
 * allocated for it), or to 'cgroup' unchanged if it is not located below 'root'. Returns 0 on
 * success, or the error of cg_get_root_path(). */
int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
        _cleanup_free_ char *rt = NULL;
        char *p;
        int r;

        assert(cgroup);
        assert(shifted);

        if (!root) {
                /* If the root was specified let's use that, otherwise
                 * let's determine it from PID 1 */

                r = cg_get_root_path(&rt);
                if (r < 0)
                        return r;

                root = rt;
                log_debug_elogind("Determined root path: \"%s\"", root);
        }

        /* NOTE(review): p - 1 presumably is the '/' separating the root from the remainder, so
         * that the result is absolute again; confirm against path_startswith() */
        p = path_startswith(cgroup, root);
#if 0 /// With other controllers, elogind might end up in /elogind, and *p is 0
        if (p && p > cgroup)
#else
        if (p && p[0] && (p > cgroup))
#endif // 0
                *shifted = p - 1;
        else
                *shifted = cgroup;

        return 0;
}
1438
1439 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1440         _cleanup_free_ char *raw = NULL;
1441         const char *c;
1442         int r;
1443
1444         assert(pid >= 0);
1445         assert(cgroup);
1446
1447         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1448         if (r < 0)
1449                 return r;
1450
1451         log_debug_elogind("Shifting path: \"%s\" (PID %u, root: \"%s\")",
1452                           raw, pid, root ? root : "NULL");
1453         r = cg_shift_path(raw, root, &c);
1454         if (r < 0)
1455                 return r;
1456
1457         if (c == raw) {
1458                 *cgroup = raw;
1459                 raw = NULL;
1460         } else {
1461                 char *n;
1462
1463                 n = strdup(c);
1464                 if (!n)
1465                         return -ENOMEM;
1466
1467                 *cgroup = n;
1468         }
1469         log_debug_elogind("Resulting cgroup:\"%s\"", *cgroup);
1470
1471         return 0;
1472 }
1473
1474 #if 0 /// UNNEEDED by elogind
1475 int cg_path_decode_unit(const char *cgroup, char **unit) {
1476         char *c, *s;
1477         size_t n;
1478
1479         assert(cgroup);
1480         assert(unit);
1481
1482         n = strcspn(cgroup, "/");
1483         if (n < 3)
1484                 return -ENXIO;
1485
1486         c = strndupa(cgroup, n);
1487         c = cg_unescape(c);
1488
1489         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1490                 return -ENXIO;
1491
1492         s = strdup(c);
1493         if (!s)
1494                 return -ENOMEM;
1495
1496         *unit = s;
1497         return 0;
1498 }
1499
1500 static bool valid_slice_name(const char *p, size_t n) {
1501
1502         if (!p)
1503                 return false;
1504
1505         if (n < STRLEN("x.slice"))
1506                 return false;
1507
1508         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1509                 char buf[n+1], *c;
1510
1511                 memcpy(buf, p, n);
1512                 buf[n] = 0;
1513
1514                 c = cg_unescape(buf);
1515
1516                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1517         }
1518
1519         return false;
1520 }
1521
/* Skips over all leading slice components of the path 'p' and the slashes in front of them.
 * Returns a pointer to the first component that is not a valid slice name (or to the
 * terminating NUL byte if there is none). */
static const char *skip_slices(const char *p) {
        size_t len;

        assert(p);

        p += strspn(p, "/");

        for (len = strcspn(p, "/"); valid_slice_name(p, len); len = strcspn(p, "/")) {
                /* This one is a slice, step over it and over the slashes that follow */
                p += len;
                p += strspn(p, "/");
        }

        return p;
}
1539
/* Determines the unit a cgroup path belongs to: all leading slices are skipped and the component
 * following them is decoded with cg_path_decode_unit(). On success 0 is returned and *ret
 * receives a newly allocated string. Returns -ENXIO if the decoded name is a slice itself, or
 * the error of cg_path_decode_unit(). */
int cg_path_get_unit(const char *path, char **ret) {
        char *decoded;
        int r;

        assert(path);
        assert(ret);

        r = cg_path_decode_unit(skip_slices(path), &decoded);
        if (r < 0)
                return r;

        /* We skipped over the slices, don't accept any now */
        if (!endswith(decoded, ".slice")) {
                *ret = decoded;
                return 0;
        }

        free(decoded);
        return -ENXIO;
}
1563
1564 int cg_pid_get_unit(pid_t pid, char **unit) {
1565         _cleanup_free_ char *cgroup = NULL;
1566         int r;
1567
1568         assert(unit);
1569
1570         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1571         if (r < 0)
1572                 return r;
1573
1574         return cg_path_get_unit(cgroup, unit);
1575 }
1576
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer to whatever follows the "session-<id>.scope" component and the slashes
 * trailing it, or NULL if 'p' is empty, if its first component is not of that form, or if the
 * embedded id is rejected by session_id_valid().
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                /* Room for what is left between prefix and suffix, plus the NUL byte */
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence signal failure with NULL, not false */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1613
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to whatever follows the "user@<uid>.service" component and the slashes
 * trailing it, or NULL if 'p' is empty, if its first component is not of that form, or if the
 * part between "user@" and ".service" is not accepted by parse_uid().
 */
static const char *skip_user_manager(const char *p) {
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", 5) == 0 && memcmp(p + len - 8, ".service", 8) == 0) {
                size_t uid_len = len - 5 - 8;
                char uid_str[uid_len + 1];

                memcpy(uid_str, p + 5, uid_len);
                uid_str[uid_len] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid_str, NULL) < 0)
                        return NULL;

                p += len;

                return p + strspn(p, "/");
        }

        return NULL;
}
1651
/* Skips everything in front of a user unit: first all slices, then either the user manager
 * service or, if there is none, the session scope. Returns NULL if neither of the two follows
 * the slices. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Skip slices, if there are any */
        after_slices = skip_slices(path);

        /* Skip the user manager, if it's in the path now; alternatively skip the user session
         * if that is in the path */
        after_manager = skip_user_manager(after_slices);

        return after_manager ? after_manager : skip_session(after_slices);
}
1668
/* Determines the user unit a cgroup path belongs to: skip_user_prefix() removes slices and the
 * user manager or session component, the remainder is parsed like a system unit path. Returns
 * -ENXIO if the prefix is missing, otherwise the result of cg_path_get_unit(). */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);

        /* And from here on it looks pretty much the same as for a
         * system unit, hence let's use the same parser from here
         * on. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1684
1685 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1686         _cleanup_free_ char *cgroup = NULL;
1687         int r;
1688
1689         assert(unit);
1690
1691         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1692         if (r < 0)
1693                 return r;
1694
1695         return cg_path_get_user_unit(cgroup, unit);
1696 }
1697
1698 int cg_path_get_machine_name(const char *path, char **machine) {
1699         _cleanup_free_ char *u = NULL;
1700         const char *sl;
1701         int r;
1702
1703         r = cg_path_get_unit(path, &u);
1704         if (r < 0)
1705                 return r;
1706
1707         sl = strjoina("/run/systemd/machines/unit:", u);
1708         return readlink_malloc(sl, machine);
1709 }
1710
1711 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1712         _cleanup_free_ char *cgroup = NULL;
1713         int r;
1714
1715         assert(machine);
1716
1717         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1718         if (r < 0)
1719                 return r;
1720
1721         return cg_path_get_machine_name(cgroup, machine);
1722 }
1723 #endif // 0
1724
/* Determines the session a cgroup path belongs to. In elogind the session id is the first
 * component of the absolute path 'path', with a leading underscore removed by cg_unescape().
 * 'session' may be NULL if the caller only wants to know whether there is a session; otherwise
 * it receives a newly allocated copy of the id. Returns 0 on success, -ENOENT if the first
 * component is empty (before or after unescaping), and -ENOMEM on allocation failure. */
int cg_path_get_session(const char *path, char **session) {
#if 0 /// UNNEEDED by elogind
        _cleanup_free_ char *unit = NULL;
        char *start, *end;
        int r;

        assert(path);

        r = cg_path_get_unit(path, &unit);
        if (r < 0)
                return r;

        start = startswith(unit, "session-");
        if (!start)
                return -ENXIO;
        end = endswith(start, ".scope");
        if (!end)
                return -ENXIO;

        *end = 0;
        if (!session_id_valid(start))
                return -ENXIO;
#else
        /* Elogind uses a flat hierarchy, just "/SESSION".  The only
           wrinkle is that SESSION might be escaped.  */
        const char *e, *n, *start;

        assert(path);
        log_debug_elogind("path is \"%s\"", path);
        assert(path[0] == '/');

        /* e is the start of the first component, n its end (the next '/' or the NUL byte) */
        e = path + 1;
        n = strchrnul(e, '/');
        if (e == n)
                return -ENOENT;

        /* Stack copy of the component, so that it is NUL terminated */
        start = strndupa(e, n - e);
        start = cg_unescape(start);

        if (!start[0])
                return -ENOENT;
#endif // 0

        if (session) {
                char *rr;

                log_debug_elogind("found session: \"%s\"", start);
                rr = strdup(start);
                if (!rr)
                        return -ENOMEM;

                *session = rr;
        }

        return 0;
}
1781
1782 int cg_pid_get_session(pid_t pid, char **session) {
1783         _cleanup_free_ char *cgroup = NULL;
1784         int r;
1785
1786         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1787         if (r < 0)
1788                 return r;
1789
1790         return cg_path_get_session(cgroup, session);
1791 }
1792
1793 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1794 #if 0 /// elogind needs one more value
1795         _cleanup_free_ char *slice = NULL;
1796         char *start, *end;
1797 #else
1798         _cleanup_free_ char *slice = NULL, *p = NULL, *s = NULL;
1799 #endif // 0
1800         int r;
1801
1802         assert(path);
1803
1804         r = cg_path_get_slice(path, &slice);
1805         if (r < 0)
1806                 return r;
1807
1808 #if 0 /// elogind does not support systemd slices
1809         start = startswith(slice, "user-");
1810         if (!start)
1811                 return -ENXIO;
1812         end = endswith(start, ".slice");
1813         if (!end)
1814                 return -ENXIO;
1815
1816         *end = 0;
1817         if (parse_uid(start, uid) < 0)
1818                 return -ENXIO;
1819 #else
1820         p = strappend("/run/systemd/sessions/", slice);
1821
1822         r = parse_env_file(p, NEWLINE, "UID", &s, NULL);
1823         if (r == -ENOENT)
1824                 return -ENXIO;
1825         if (r < 0)
1826                 return r;
1827         if (isempty(s))
1828                 return -EIO;
1829
1830         if (parse_uid(s, uid) < 0)
1831                 return -ENXIO;
1832 #endif // 0
1833
1834         return 0;
1835 }
1836
1837 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1838         _cleanup_free_ char *cgroup = NULL;
1839         int r;
1840
1841         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1842         if (r < 0)
1843                 return r;
1844
1845         return cg_path_get_owner_uid(cgroup, uid);
1846 }
1847
/* Determines the "slice" of the cgroup path 'p'. elogind knows no slices; what is returned in
 * *slice instead is a newly allocated copy of 'p' with one leading "/" removed, which for a
 * path like "/3" is the session id. Returns 0 on success and -ENOMEM on allocation failure. */
int cg_path_get_slice(const char *p, char **slice) {
        const char *e = NULL;

        assert(p);
        assert(slice);

#if 0 /// elogind does not support systemd slices
        /* Finds the right-most slice unit from the beginning, but
         * stops before we come to the first non-slice unit. */

        for (;;) {
                size_t n;

                p += strspn(p, "/");

                n = strcspn(p, "/");
                if (!valid_slice_name(p, n)) {

                        if (!e) {
                                char *s;

                                s = strdup(SPECIAL_ROOT_SLICE);
                                if (!s)
                                        return -ENOMEM;

                                *slice = s;
                                return 0;
                        }

                        return cg_path_decode_unit(e, slice);
                }

                e = p;
                p += n;
        }
#else
        /* In elogind, what is reported here, is the location of
         * the session. This is derived from /proc/<self|PID>/cgroup.
         * In there we look at the controller, which will look something
         * like "1:name=openrc:/3".
         * The last part gets extracted (and is now p), which is "/3" in
         * this case. The three is the session id, and that can be mapped.
         */
        e = startswith(p, "/");

        /* Copy what follows the leading slash, or all of p if there is none */
        *slice = strdup(e ? e : p);
        if (!*slice)
                return -ENOMEM; /* don't report success without a result */

        return 0;
#endif // 0
}
1901
1902 int cg_pid_get_slice(pid_t pid, char **slice) {
1903         _cleanup_free_ char *cgroup = NULL;
1904         int r;
1905
1906         assert(slice);
1907
1908         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1909         log_debug_elogind("Found cgroup %s for pid %u (result %d)",
1910                           cgroup, pid, r);
1911         if (r < 0)
1912                 return r;
1913
1914         return cg_path_get_slice(cgroup, slice);
1915 }
1916
/* Determines the user slice of the cgroup path 'p'. In elogind this is the very same thing as
 * cg_path_get_slice(), to which the call is forwarded unchanged, including its return value. */
int cg_path_get_user_slice(const char *p, char **slice) {
#if 0 /// UNNEEDED by elogind
        const char *t;
#endif // 0
        assert(p);
        assert(slice);

#if 0 /// nothing to skip in elogind
        t = skip_user_prefix(p);
        if (!t)
                return -ENXIO;
#endif // 0

#if 0 /// UNNEEDED by elogind
        /* And now it looks pretty much the same as for a system
         * slice, so let's just use the same parser from here on. */
        return cg_path_get_slice(t, slice);
#else
        /* In elogind there is nothing to skip, we can use the path
         * directly. Generally speaking this is always a session id
         * to user mapping. */
        return cg_path_get_slice(p, slice);
#endif // 0
}
1941
1942 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1943         _cleanup_free_ char *cgroup = NULL;
1944         int r;
1945
1946         assert(slice);
1947
1948         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1949         if (r < 0)
1950                 return r;
1951
1952         return cg_path_get_user_slice(cgroup, slice);
1953 }
1954
1955 char *cg_escape(const char *p) {
1956         bool need_prefix = false;
1957
1958         /* This implements very minimal escaping for names to be used
1959          * as file names in the cgroup tree: any name which might
1960          * conflict with a kernel name or is prefixed with '_' is
1961          * prefixed with a '_'. That way, when reading cgroup names it
1962          * is sufficient to remove a single prefixing underscore if
1963          * there is one. */
1964
1965         /* The return value of this function (unlike cg_unescape())
1966          * needs free()! */
1967
1968         if (IN_SET(p[0], 0, '_', '.') ||
1969             streq(p, "notify_on_release") ||
1970             streq(p, "release_agent") ||
1971             streq(p, "tasks") ||
1972             startswith(p, "cgroup."))
1973                 need_prefix = true;
1974         else {
1975                 const char *dot;
1976
1977                 dot = strrchr(p, '.');
1978                 if (dot) {
1979                         CGroupController c;
1980                         size_t l = dot - p;
1981
1982                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1983                                 const char *n;
1984
1985                                 n = cgroup_controller_to_string(c);
1986
1987                                 if (l != strlen(n))
1988                                         continue;
1989
1990                                 if (memcmp(p, n, l) != 0)
1991                                         continue;
1992
1993                                 need_prefix = true;
1994                                 break;
1995                         }
1996                 }
1997         }
1998
1999         if (need_prefix)
2000                 return strappend("_", p);
2001
2002         return strdup(p);
2003 }
2004
/* Reverts cg_escape(): skips one leading underscore, if there is one. The returned pointer
 * refers into 'p' itself, hence (unlike the result of cg_escape()) must not be free()d. */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (p[0] == '_' ? p + 1 : p);
}
2016
2017 #define CONTROLLER_VALID                        \
2018         DIGITS LETTERS                          \
2019         "_"
2020
2021 bool cg_controller_is_valid(const char *p) {
2022         const char *t, *s;
2023
2024         if (!p)
2025                 return false;
2026
2027         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
2028                 return true;
2029
2030         s = startswith(p, "name=");
2031         if (s)
2032                 p = s;
2033
2034         if (IN_SET(*p, 0, '_'))
2035                 return false;
2036
2037         for (t = p; *t; t++)
2038                 if (!strchr(CONTROLLER_VALID, *t))
2039                         return false;
2040
2041         if (t - p > FILENAME_MAX)
2042                 return false;
2043
2044         return true;
2045 }
2046
2047 #if 0 /// UNNEEDED by elogind
2048 int cg_slice_to_path(const char *unit, char **ret) {
2049         _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
2050         const char *dash;
2051         int r;
2052
2053         assert(unit);
2054         assert(ret);
2055
2056         if (streq(unit, SPECIAL_ROOT_SLICE)) {
2057                 char *x;
2058
2059                 x = strdup("");
2060                 if (!x)
2061                         return -ENOMEM;
2062                 *ret = x;
2063                 return 0;
2064         }
2065
2066         if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
2067                 return -EINVAL;
2068
2069         if (!endswith(unit, ".slice"))
2070                 return -EINVAL;
2071
2072         r = unit_name_to_prefix(unit, &p);
2073         if (r < 0)
2074                 return r;
2075
2076         dash = strchr(p, '-');
2077
2078         /* Don't allow initial dashes */
2079         if (dash == p)
2080                 return -EINVAL;
2081
2082         while (dash) {
2083                 _cleanup_free_ char *escaped = NULL;
2084                 char n[dash - p + sizeof(".slice")];
2085
2086                 /* Don't allow trailing or double dashes */
2087                 if (IN_SET(dash[1], 0, '-'))
2088                         return -EINVAL;
2089
2090                 strcpy(stpncpy(n, p, dash - p), ".slice");
2091                 if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
2092                         return -EINVAL;
2093
2094                 escaped = cg_escape(n);
2095                 if (!escaped)
2096                         return -ENOMEM;
2097
2098                 if (!strextend(&s, escaped, "/", NULL))
2099                         return -ENOMEM;
2100
2101                 dash = strchr(dash+1, '-');
2102         }
2103
2104         e = cg_escape(unit);
2105         if (!e)
2106                 return -ENOMEM;
2107
2108         if (!strextend(&s, e, NULL))
2109                 return -ENOMEM;
2110
2111         *ret = s;
2112         s = NULL;
2113
2114         return 0;
2115 }
2116 #endif // 0
2117
2118 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2119         _cleanup_free_ char *p = NULL;
2120         int r;
2121
2122         r = cg_get_path(controller, path, attribute, &p);
2123         if (r < 0)
2124                 return r;
2125
2126         return write_string_file(p, value, 0);
2127 }
2128
2129 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2130         _cleanup_free_ char *p = NULL;
2131         int r;
2132
2133         r = cg_get_path(controller, path, attribute, &p);
2134         if (r < 0)
2135                 return r;
2136
2137         return read_one_line_file(p, ret);
2138 }
2139
2140 #if 0 /// UNNEEDED by elogind
2141 int cg_get_keyed_attribute(const char *controller, const char *path, const char *attribute, const char **keys, char **values) {
2142         _cleanup_free_ char *filename = NULL, *content = NULL;
2143         char *line, *p;
2144         int i, r;
2145
2146         for (i = 0; keys[i]; i++)
2147                 values[i] = NULL;
2148
2149         r = cg_get_path(controller, path, attribute, &filename);
2150         if (r < 0)
2151                 return r;
2152
2153         r = read_full_file(filename, &content, NULL);
2154         if (r < 0)
2155                 return r;
2156
2157         p = content;
2158         while ((line = strsep(&p, "\n"))) {
2159                 char *key;
2160
2161                 key = strsep(&line, " ");
2162
2163                 for (i = 0; keys[i]; i++) {
2164                         if (streq(key, keys[i])) {
2165                                 values[i] = strdup(line);
2166                                 break;
2167                         }
2168                 }
2169         }
2170
2171         for (i = 0; keys[i]; i++) {
2172                 if (!values[i]) {
2173                         for (i = 0; keys[i]; i++) {
2174                                 values[i] = mfree(values[i]);
2175                         }
2176                         return -ENOENT;
2177                 }
2178         }
2179
2180         return 0;
2181 }
2182
2183 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2184         CGroupController c;
2185         int r;
2186
2187         /* This one will create a cgroup in our private tree, but also
2188          * duplicate it in the trees specified in mask, and remove it
2189          * in all others */
2190
2191         /* First create the cgroup in our own hierarchy. */
2192         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2193         if (r < 0)
2194                 return r;
2195
2196         /* If we are in the unified hierarchy, we are done now */
2197         r = cg_all_unified();
2198         if (r < 0)
2199                 return r;
2200         if (r > 0)
2201                 return 0;
2202
2203         /* Otherwise, do the same in the other hierarchies */
2204         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2205                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2206                 const char *n;
2207
2208                 n = cgroup_controller_to_string(c);
2209
2210                 if (mask & bit)
2211                         (void) cg_create(n, path);
2212                 else if (supported & bit)
2213                         (void) cg_trim(n, path, true);
2214         }
2215
2216         return 0;
2217 }
2218
2219 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2220         CGroupController c;
2221         int r;
2222
2223         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2224         if (r < 0)
2225                 return r;
2226
2227         r = cg_all_unified();
2228         if (r < 0)
2229                 return r;
2230         if (r > 0)
2231                 return 0;
2232
2233         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2234                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2235                 const char *p = NULL;
2236
2237                 if (!(supported & bit))
2238                         continue;
2239
2240                 if (path_callback)
2241                         p = path_callback(bit, userdata);
2242
2243                 if (!p)
2244                         p = path;
2245
2246                 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2247         }
2248
2249         return 0;
2250 }
2251
2252 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2253         Iterator i;
2254         void *pidp;
2255         int r = 0;
2256
2257         SET_FOREACH(pidp, pids, i) {
2258                 pid_t pid = PTR_TO_PID(pidp);
2259                 int q;
2260
2261                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2262                 if (q < 0 && r >= 0)
2263                         r = q;
2264         }
2265
2266         return r;
2267 }
2268
2269 int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2270         CGroupController c;
2271         int r = 0, q;
2272
2273         if (!path_equal(from, to))  {
2274                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2275                 if (r < 0)
2276                         return r;
2277         }
2278
2279         q = cg_all_unified();
2280         if (q < 0)
2281                 return q;
2282         if (q > 0)
2283                 return r;
2284
2285         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2286                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2287                 const char *p = NULL;
2288
2289                 if (!(supported & bit))
2290                         continue;
2291
2292                 if (to_callback)
2293                         p = to_callback(bit, userdata);
2294
2295                 if (!p)
2296                         p = to;
2297
2298                 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2299         }
2300
2301         return 0;
2302 }
2303
2304 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2305         CGroupController c;
2306         int r, q;
2307
2308         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2309         if (r < 0)
2310                 return r;
2311
2312         q = cg_all_unified();
2313         if (q < 0)
2314                 return q;
2315         if (q > 0)
2316                 return r;
2317
2318         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2319                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2320
2321                 if (!(supported & bit))
2322                         continue;
2323
2324                 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2325         }
2326
2327         return 0;
2328 }
2329 #endif // 0
2330
2331 int cg_mask_to_string(CGroupMask mask, char **ret) {
2332         _cleanup_free_ char *s = NULL;
2333         size_t n = 0, allocated = 0;
2334         bool space = false;
2335         CGroupController c;
2336
2337         assert(ret);
2338
2339         if (mask == 0) {
2340                 *ret = NULL;
2341                 return 0;
2342         }
2343
2344         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2345                 const char *k;
2346                 size_t l;
2347
2348                 if (!(mask & CGROUP_CONTROLLER_TO_MASK(c)))
2349                         continue;
2350
2351                 k = cgroup_controller_to_string(c);
2352                 l = strlen(k);
2353
2354                 if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2355                         return -ENOMEM;
2356
2357                 if (space)
2358                         s[n] = ' ';
2359                 memcpy(s + n + space, k, l);
2360                 n += space + l;
2361
2362                 space = true;
2363         }
2364
2365         assert(s);
2366
2367         s[n] = 0;
2368         *ret = s;
2369         s = NULL;
2370
2371         return 0;
2372 }
2373
2374 int cg_mask_from_string(const char *value, CGroupMask *mask) {
2375         assert(mask);
2376         assert(value);
2377
2378         for (;;) {
2379                 _cleanup_free_ char *n = NULL;
2380                 CGroupController v;
2381                 int r;
2382
2383                 r = extract_first_word(&value, &n, NULL, 0);
2384                 if (r < 0)
2385                         return r;
2386                 if (r == 0)
2387                         break;
2388
2389                 v = cgroup_controller_from_string(n);
2390                 if (v < 0)
2391                         continue;
2392
2393                 *mask |= CGROUP_CONTROLLER_TO_MASK(v);
2394         }
2395         return 0;
2396 }
2397
2398 int cg_mask_supported(CGroupMask *ret) {
2399         CGroupMask mask = 0;
2400         int r;
2401
2402         /* Determines the mask of supported cgroup controllers. Only
2403          * includes controllers we can make sense of and that are
2404          * actually accessible. */
2405
2406         r = cg_all_unified();
2407         if (r < 0)
2408                 return r;
2409         if (r > 0) {
2410                 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2411
2412                 /* In the unified hierarchy we can read the supported
2413                  * and accessible controllers from a the top-level
2414                  * cgroup attribute */
2415
2416                 r = cg_get_root_path(&root);
2417                 if (r < 0)
2418                         return r;
2419
2420                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2421                 if (r < 0)
2422                         return r;
2423
2424                 r = read_one_line_file(path, &controllers);
2425                 if (r < 0)
2426                         return r;
2427
2428                 r = cg_mask_from_string(controllers, &mask);
2429                 if (r < 0)
2430                         return r;
2431
2432                 /* Currently, we support the cpu, memory, io and pids
2433                  * controller in the unified hierarchy, mask
2434                  * everything else off. */
2435                 mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
2436
2437         } else {
2438                 CGroupController c;
2439
2440                 /* In the legacy hierarchy, we check whether which
2441                  * hierarchies are mounted. */
2442
2443                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2444                         const char *n;
2445
2446                         n = cgroup_controller_to_string(c);
2447                         if (controller_is_accessible(n) >= 0)
2448                                 mask |= CGROUP_CONTROLLER_TO_MASK(c);
2449                 }
2450         }
2451
2452         *ret = mask;
2453         return 0;
2454 }
2455
2456 #if 0 /// UNNEEDED by elogind
2457 int cg_kernel_controllers(Set **ret) {
2458         _cleanup_set_free_free_ Set *controllers = NULL;
2459         _cleanup_fclose_ FILE *f = NULL;
2460         int r;
2461
2462         assert(ret);
2463
2464         /* Determines the full list of kernel-known controllers. Might
2465          * include controllers we don't actually support, arbitrary
2466          * named hierarchies and controllers that aren't currently
2467          * accessible (because not mounted). */
2468
2469         controllers = set_new(&string_hash_ops);
2470         if (!controllers)
2471                 return -ENOMEM;
2472
2473         f = fopen("/proc/cgroups", "re");
2474         if (!f) {
2475                 if (errno == ENOENT) {
2476                         *ret = NULL;
2477                         return 0;
2478                 }
2479
2480                 return -errno;
2481         }
2482
2483         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
2484
2485         /* Ignore the header line */
2486         (void) read_line(f, (size_t) -1, NULL);
2487
2488         for (;;) {
2489                 char *controller;
2490                 int enabled = 0;
2491
2492                 errno = 0;
2493                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2494
2495                         if (feof(f))
2496                                 break;
2497
2498                         if (ferror(f) && errno > 0)
2499                                 return -errno;
2500
2501                         return -EBADMSG;
2502                 }
2503
2504                 if (!enabled) {
2505                         free(controller);
2506                         continue;
2507                 }
2508
2509                 if (!cg_controller_is_valid(controller)) {
2510                         free(controller);
2511                         return -EBADMSG;
2512                 }
2513
2514                 r = set_consume(controllers, controller);
2515                 if (r < 0)
2516                         return r;
2517         }
2518
2519         *ret = controllers;
2520         controllers = NULL;
2521
2522         return 0;
2523 }
2524 #endif // 0
2525
/* Per-thread cache of the detected cgroup hierarchy layout. cg_unified_update() fills it in when it still holds
 * CGROUP_UNIFIED_UNKNOWN, and cg_unified_flush() resets it to that value to force a fresh detection. */
static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;

/* The hybrid mode was initially implemented in v232 and simply mounted cgroup v2 on /sys/fs/cgroup/systemd.  This
 * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
 * /sys/fs/cgroup/systemd.  From v233 on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains the
 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
 *
 * To keep live upgrade working, we detect and support the v232 layout.  When the v232 layout is detected, to keep
 * cgroup v2 process management but disable the compat dual layout, we return %true on
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
 */
static thread_local bool unified_systemd_v232;
2538
2539 static int cg_unified_update(void) {
2540
2541         struct statfs fs;
2542
2543         /* Checks if we support the unified hierarchy. Returns an
2544          * error when the cgroup hierarchies aren't mounted yet or we
2545          * have any other trouble determining if the unified hierarchy
2546          * is supported. */
2547
2548         if (unified_cache >= CGROUP_UNIFIED_NONE)
2549                 return 0;
2550
2551         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2552                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\" failed: %m");
2553
2554         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2555                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2556                 unified_cache = CGROUP_UNIFIED_ALL;
2557 #if 0 /// The handling of cgroups is a bit different with elogind
2558         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2559                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2560 #else
2561         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)
2562               || F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2563 #endif // 0
2564                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2565                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2566                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2567                         unified_systemd_v232 = false;
2568                 } else {
2569 #if 0 /// There is no sub-grouping within elogind
2570                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2571                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2572
2573                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2574                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2575                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2576                                 unified_systemd_v232 = true;
2577                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2578                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2579                                 unified_cache = CGROUP_UNIFIED_NONE;
2580                         } else {
2581                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2582                                           (unsigned long long) fs.f_type);
2583                                 unified_cache = CGROUP_UNIFIED_NONE;
2584                         }
2585 #else
2586                         unified_cache = CGROUP_UNIFIED_NONE;
2587 #endif // 0
2588                 }
2589         } else {
2590                 log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2591                           (unsigned long long) fs.f_type);
2592                 return -ENOMEDIUM;
2593         }
2594
2595         return 0;
2596 }
2597
2598 int cg_unified_controller(const char *controller) {
2599         int r;
2600
2601         r = cg_unified_update();
2602         if (r < 0)
2603                 return r;
2604
2605         if (unified_cache == CGROUP_UNIFIED_NONE)
2606                 return false;
2607
2608         if (unified_cache >= CGROUP_UNIFIED_ALL)
2609                 return true;
2610
2611 #if 0 /// only if elogind is the controller we can use cgroups2 in hybrid mode
2612         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2613 #else
2614         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID);
2615 #endif // 0
2616 }
2617
2618 int cg_all_unified(void) {
2619         int r;
2620
2621         r = cg_unified_update();
2622         if (r < 0)
2623                 return r;
2624
2625         return unified_cache >= CGROUP_UNIFIED_ALL;
2626 }
2627
2628 int cg_hybrid_unified(void) {
2629         int r;
2630
2631         r = cg_unified_update();
2632         if (r < 0)
2633                 return r;
2634
2635         return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2636 }
2637
2638 int cg_unified_flush(void) {
2639         unified_cache = CGROUP_UNIFIED_UNKNOWN;
2640
2641         return cg_unified_update();
2642 }
2643
2644 #if 0 /// UNNEEDED by elogind
2645 int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
2646         _cleanup_fclose_ FILE *f = NULL;
2647         _cleanup_free_ char *fs = NULL;
2648         CGroupController c;
2649         int r;
2650
2651         assert(p);
2652
2653         if (supported == 0)
2654                 return 0;
2655
2656         r = cg_all_unified();
2657         if (r < 0)
2658                 return r;
2659         if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */
2660                 return 0;
2661
2662         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2663         if (r < 0)
2664                 return r;
2665
2666         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2667                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2668                 const char *n;
2669
2670                 if (!(supported & bit))
2671                         continue;
2672
2673                 n = cgroup_controller_to_string(c);
2674                 {
2675                         char s[1 + strlen(n) + 1];
2676
2677                         s[0] = mask & bit ? '+' : '-';
2678                         strcpy(s + 1, n);
2679
2680                         if (!f) {
2681                                 f = fopen(fs, "we");
2682                                 if (!f) {
2683                                         log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
2684                                         break;
2685                                 }
2686                         }
2687
2688                         r = write_string_stream(f, s, 0);
2689                         if (r < 0)
2690                                 log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
2691                 }
2692         }
2693
2694         return 0;
2695 }
2696 #endif // 0
2697
2698 bool cg_is_unified_wanted(void) {
2699         static thread_local int wanted = -1;
2700         int r;
2701         bool b;
2702         const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2703
2704         /* If we have a cached value, return that. */
2705         if (wanted >= 0)
2706                 return wanted;
2707
2708         /* If the hierarchy is already mounted, then follow whatever
2709          * was chosen for it. */
2710         if (cg_unified_flush() >= 0)
2711                 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2712
2713 #if 0 /// elogind is not init and has no business with kernel command line
2714         /* Otherwise, let's see what the kernel command line has to say.
2715          * Since checking is expensive, cache a non-error result. */
2716         r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2717 #endif // 0
2718
2719         return (wanted = r > 0 ? b : is_default);
2720 }
2721
2722 bool cg_is_legacy_wanted(void) {
2723         static thread_local int wanted = -1;
2724
2725         /* If we have a cached value, return that. */
2726         if (wanted >= 0)
2727                 return wanted;
2728
2729         /* Check if we have cgroups2 already mounted. */
2730         if (cg_unified_flush() >= 0 &&
2731             unified_cache == CGROUP_UNIFIED_ALL)
2732                 return (wanted = false);
2733
2734         /* Otherwise, assume that at least partial legacy is wanted,
2735          * since cgroups2 should already be mounted at this point. */
2736         return (wanted = true);
2737 }
2738
2739 bool cg_is_hybrid_wanted(void) {
2740         static thread_local int wanted = -1;
2741         int r;
2742         bool b;
2743         const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2744         /* We default to true if the default is "hybrid", obviously,
2745          * but also when the default is "unified", because if we get
2746          * called, it means that unified hierarchy was not mounted. */
2747
2748         /* If we have a cached value, return that. */
2749         if (wanted >= 0)
2750                 return wanted;
2751
2752         /* If the hierarchy is already mounted, then follow whatever
2753          * was chosen for it. */
2754         if (cg_unified_flush() >= 0 &&
2755             unified_cache == CGROUP_UNIFIED_ALL)
2756                 return (wanted = false);
2757
2758 #if 0 /// elogind is not init and has no business with kernel command line
2759         /* Otherwise, let's see what the kernel command line has to say.
2760          * Since checking is expensive, cache a non-error result. */
2761         r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2762 #endif // 0
2763
2764         /* The meaning of the kernel option is reversed wrt. to the return value
2765          * of this function, hence the negation. */
2766         return (wanted = r > 0 ? !b : is_default);
2767 }
2768
2769 #if 0 /// UNNEEDED by elogind
2770 int cg_weight_parse(const char *s, uint64_t *ret) {
2771         uint64_t u;
2772         int r;
2773
2774         if (isempty(s)) {
2775                 *ret = CGROUP_WEIGHT_INVALID;
2776                 return 0;
2777         }
2778
2779         r = safe_atou64(s, &u);
2780         if (r < 0)
2781                 return r;
2782
2783         if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2784                 return -ERANGE;
2785
2786         *ret = u;
2787         return 0;
2788 }
2789
/* Default value of each I/O limit, indexed by the CGROUP_IO_*_MAX constants;
 * every limit defaults to CGROUP_LIMIT_MAX. */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
};
2796
/* String name of each I/O limit type, indexed by the CGROUP_IO_*_MAX
 * constants. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
};

/* NOTE(review): presumably generates the cgroup_io_limit_type_to_string() and
 * cgroup_io_limit_type_from_string() helpers from the table above; confirm
 * against the macro definition in string-table.h. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2805
2806 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2807         uint64_t u;
2808         int r;
2809
2810         if (isempty(s)) {
2811                 *ret = CGROUP_CPU_SHARES_INVALID;
2812                 return 0;
2813         }
2814
2815         r = safe_atou64(s, &u);
2816         if (r < 0)
2817                 return r;
2818
2819         if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2820                 return -ERANGE;
2821
2822         *ret = u;
2823         return 0;
2824 }
2825
2826 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2827         uint64_t u;
2828         int r;
2829
2830         if (isempty(s)) {
2831                 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2832                 return 0;
2833         }
2834
2835         r = safe_atou64(s, &u);
2836         if (r < 0)
2837                 return r;
2838
2839         if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2840                 return -ERANGE;
2841
2842         *ret = u;
2843         return 0;
2844 }
2845 #endif // 0
2846
2847 bool is_cgroup_fs(const struct statfs *s) {
2848         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2849                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2850 }
2851
/* Returns true if the file descriptor fd refers to something on a cgroup
 * filesystem, as judged by is_cgroup_fs().
 *
 * If fstatfs() fails the answer is false: the function returns bool, so a
 * negative errno value cannot be reported here (it would convert to true). */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2860
2861 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2862         [CGROUP_CONTROLLER_CPU] = "cpu",
2863         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2864         [CGROUP_CONTROLLER_IO] = "io",
2865         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2866         [CGROUP_CONTROLLER_MEMORY] = "memory",
2867         [CGROUP_CONTROLLER_DEVICES] = "devices",
2868         [CGROUP_CONTROLLER_PIDS] = "pids",
2869 };
2870
2871 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);