src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <dirent.h>
  22 #include <errno.h>
  23 #include <ftw.h>
  24 //#include <limits.h>
  25 #include <signal.h>
  26 //#include <stddef.h>
  27 #include <stdio_ext.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/stat.h>
  31 //#include <sys/statfs.h>
  32 #include <sys/types.h>
  33 #include <sys/xattr.h>
  34 #include <unistd.h>
  35
  36 #include "alloc-util.h"
  37 #include "cgroup-util.h"
  38 //#include "def.h"
  39 #include "dirent-util.h"
  40 #include "extract-word.h"
  41 #include "fd-util.h"
  42 #include "fileio.h"
  43 #include "format-util.h"
  44 #include "fs-util.h"
  45 //#include "log.h"
  46 #include "login-util.h"
  47 #include "macro.h"
  48 //#include "missing.h"
  49 #include "mkdir.h"
  50 #include "parse-util.h"
  51 #include "path-util.h"
  52 #include "proc-cmdline.h"
  53 #include "process-util.h"
  54 #include "set.h"
  55 //#include "special.h"
  56 #include "stat-util.h"
  57 #include "stdio-util.h"
  58 #include "string-table.h"
  59 #include "string-util.h"
  60 #include "strv.h"
  61 #include "unit-name.h"
  62 #include "user-util.h"
  63
  64 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
  65         _cleanup_free_ char *fs = NULL;
  66         FILE *f;
  67         int r;
  68
  69         assert(_f);
  70
  71         r = cg_get_path(controller, path, "cgroup.procs", &fs);
  72         if (r < 0)
  73                 return r;
  74
  75         f = fopen(fs, "re");
  76         if (!f)
  77                 return -errno;
  78
  79         *_f = f;
  80         return 0;
  81 }
  82
  83 int cg_read_pid(FILE *f, pid_t *_pid) {
  84         unsigned long ul;
  85
  86         /* Note that the cgroup.procs might contain duplicates! See
  87          * cgroups.txt for details. */
  88
  89         assert(f);
  90         assert(_pid);
  91
  92         errno = 0;
  93         if (fscanf(f, "%lu", &ul) != 1) {
  94
  95                 if (feof(f))
  96                         return 0;
  97
  98                 return errno > 0 ? -errno : -EIO;
  99         }
 100
 101         if (ul <= 0)
 102                 return -EIO;
 103
 104         *_pid = (pid_t) ul;
 105         return 1;
 106 }
 107
 108 int cg_read_event(
 109                 const char *controller,
 110                 const char *path,
 111                 const char *event,
 112                 char **val) {
 113
 114         _cleanup_free_ char *events = NULL, *content = NULL;
 115         char *p, *line;
 116         int r;
 117
 118         r = cg_get_path(controller, path, "cgroup.events", &events);
 119         if (r < 0)
 120                 return r;
 121
 122         r = read_full_file(events, &content, NULL);
 123         if (r < 0)
 124                 return r;
 125
 126         p = content;
 127         while ((line = strsep(&p, "\n"))) {
 128                 char *key;
 129
 130                 key = strsep(&line, " ");
 131                 if (!key || !line)
 132                         return -EINVAL;
 133
 134                 if (strcmp(key, event))
 135                         continue;
 136
 137                 *val = strdup(line);
 138                 return 0;
 139         }
 140
 141         return -ENOENT;
 142 }
 143
 144 #if 0 /// UNNEEDED by elogind
 145 bool cg_ns_supported(void) {
 146         static thread_local int enabled = -1;
 147
 148         if (enabled >= 0)
 149                 return enabled;
 150
 151         if (access("/proc/self/ns/cgroup", F_OK) == 0)
 152                 enabled = 1;
 153         else
 154                 enabled = 0;
 155
 156         return enabled;
 157 }
 158 #endif // 0
 159
 160 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
 161         _cleanup_free_ char *fs = NULL;
 162         int r;
 163         DIR *d;
 164
 165         assert(_d);
 166
 167         /* This is not recursive! */
 168
 169         r = cg_get_path(controller, path, NULL, &fs);
 170         if (r < 0)
 171                 return r;
 172
 173         d = opendir(fs);
 174         if (!d)
 175                 return -errno;
 176
 177         *_d = d;
 178         return 0;
 179 }
 180
 181 int cg_read_subgroup(DIR *d, char **fn) {
 182         struct dirent *de;
 183
 184         assert(d);
 185         assert(fn);
 186
 187         FOREACH_DIRENT_ALL(de, d, return -errno) {
 188                 char *b;
 189
 190                 if (de->d_type != DT_DIR)
 191                         continue;
 192
 193                 if (dot_or_dot_dot(de->d_name))
 194                         continue;
 195
 196                 b = strdup(de->d_name);
 197                 if (!b)
 198                         return -ENOMEM;
 199
 200                 *fn = b;
 201                 return 1;
 202         }
 203
 204         return 0;
 205 }
 206
 207 int cg_rmdir(const char *controller, const char *path) {
 208         _cleanup_free_ char *p = NULL;
 209         int r;
 210
 211         r = cg_get_path(controller, path, NULL, &p);
 212         if (r < 0)
 213                 return r;
 214
 215         r = rmdir(p);
 216         if (r < 0 && errno != ENOENT)
 217                 return -errno;
 218
 219         r = cg_hybrid_unified();
 220         if (r < 0)
 221                 return r;
 222         if (r == 0)
 223                 return 0;
 224
 225         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 226                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 227                 if (r < 0)
 228                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 229         }
 230
 231         return 0;
 232 }
 233
 234 int cg_kill(
 235                 const char *controller,
 236                 const char *path,
 237                 int sig,
 238                 CGroupFlags flags,
 239                 Set *s,
 240                 cg_kill_log_func_t log_kill,
 241                 void *userdata) {
 242
 243         _cleanup_set_free_ Set *allocated_set = NULL;
 244         bool done = false;
 245         int r, ret = 0;
 246         pid_t my_pid;
 247
 248         assert(sig >= 0);
 249
 250          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 251           * SIGCONT on SIGKILL. */
 252         if (IN_SET(sig, SIGCONT, SIGKILL))
 253                 flags &= ~CGROUP_SIGCONT;
 254
 255         /* This goes through the tasks list and kills them all. This
 256          * is repeated until no further processes are added to the
 257          * tasks list, to properly handle forking processes */
 258
 259         if (!s) {
 260                 s = allocated_set = set_new(NULL);
 261                 if (!s)
 262                         return -ENOMEM;
 263         }
 264
 265         my_pid = getpid_cached();
 266
 267         do {
 268                 _cleanup_fclose_ FILE *f = NULL;
 269                 pid_t pid = 0;
 270                 done = true;
 271
 272                 r = cg_enumerate_processes(controller, path, &f);
 273                 if (r < 0) {
 274                         if (ret >= 0 && r != -ENOENT)
 275                                 return r;
 276
 277                         return ret;
 278                 }
 279
 280                 while ((r = cg_read_pid(f, &pid)) > 0) {
 281
 282                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 283                                 continue;
 284
 285                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 286                                 continue;
 287
 288                         if (log_kill)
 289                                 log_kill(pid, sig, userdata);
 290
 291                         /* If we haven't killed this process yet, kill
 292                          * it */
 293                         if (kill(pid, sig) < 0) {
 294                                 if (ret >= 0 && errno != ESRCH)
 295                                         ret = -errno;
 296                         } else {
 297                                 if (flags & CGROUP_SIGCONT)
 298                                         (void) kill(pid, SIGCONT);
 299
 300                                 if (ret == 0)
 301                                         ret = 1;
 302                         }
 303
 304                         done = false;
 305
 306                         r = set_put(s, PID_TO_PTR(pid));
 307                         if (r < 0) {
 308                                 if (ret >= 0)
 309                                         return r;
 310
 311                                 return ret;
 312                         }
 313                 }
 314
 315                 if (r < 0) {
 316                         if (ret >= 0)
 317                                 return r;
 318
 319                         return ret;
 320                 }
 321
 322                 /* To avoid racing against processes which fork
 323                  * quicker than we can kill them we repeat this until
 324                  * no new pids need to be killed. */
 325
 326         } while (!done);
 327
 328         return ret;
 329 }
 330
 331 int cg_kill_recursive(
 332                 const char *controller,
 333                 const char *path,
 334                 int sig,
 335                 CGroupFlags flags,
 336                 Set *s,
 337                 cg_kill_log_func_t log_kill,
 338                 void *userdata) {
 339
 340         _cleanup_set_free_ Set *allocated_set = NULL;
 341         _cleanup_closedir_ DIR *d = NULL;
 342         int r, ret;
 343         char *fn;
 344
 345         assert(path);
 346         assert(sig >= 0);
 347
 348         if (!s) {
 349                 s = allocated_set = set_new(NULL);
 350                 if (!s)
 351                         return -ENOMEM;
 352         }
 353
 354         ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
 355
 356         r = cg_enumerate_subgroups(controller, path, &d);
 357         if (r < 0) {
 358                 if (ret >= 0 && r != -ENOENT)
 359                         return r;
 360
 361                 return ret;
 362         }
 363
 364         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 365                 _cleanup_free_ char *p = NULL;
 366
 367                 p = strjoin(path, "/", fn);
 368                 free(fn);
 369                 if (!p)
 370                         return -ENOMEM;
 371
 372                 r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
 373                 if (r != 0 && ret >= 0)
 374                         ret = r;
 375         }
 376         if (ret >= 0 && r < 0)
 377                 ret = r;
 378
 379         if (flags & CGROUP_REMOVE) {
 380                 r = cg_rmdir(controller, path);
 381                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 382                         return r;
 383         }
 384
 385         return ret;
 386 }
 387
 388 int cg_migrate(
 389                 const char *cfrom,
 390                 const char *pfrom,
 391                 const char *cto,
 392                 const char *pto,
 393                 CGroupFlags flags) {
 394
 395         bool done = false;
 396         _cleanup_set_free_ Set *s = NULL;
 397         int r, ret = 0;
 398         pid_t my_pid;
 399
 400         assert(cfrom);
 401         assert(pfrom);
 402         assert(cto);
 403         assert(pto);
 404
 405         s = set_new(NULL);
 406         if (!s)
 407                 return -ENOMEM;
 408
 409         my_pid = getpid_cached();
 410
 411         log_debug_elogind("Migrating \"%s\"/\"%s\" to \"%s\"/\"%s\" (%s)",
 412                           cfrom, pfrom, cto, pto,
 413                           (flags & CGROUP_IGNORE_SELF)
 414                           ? "ignoring self" : "watching self");
 415         do {
 416                 _cleanup_fclose_ FILE *f = NULL;
 417                 pid_t pid = 0;
 418                 done = true;
 419
 420                 r = cg_enumerate_processes(cfrom, pfrom, &f);
 421                 if (r < 0) {
 422                         if (ret >= 0 && r != -ENOENT)
 423                                 return r;
 424
 425                         return ret;
 426                 }
 427
 428                 while ((r = cg_read_pid(f, &pid)) > 0) {
 429
 430                         /* This might do weird stuff if we aren't a
 431                          * single-threaded program. However, we
 432                          * luckily know we are not */
 433                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 434                                 continue;
 435
 436                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 437                                 continue;
 438
 439                         /* Ignore kernel threads. Since they can only
 440                          * exist in the root cgroup, we only check for
 441                          * them there. */
 442                         if (cfrom &&
 443                             (isempty(pfrom) || path_equal(pfrom, "/")) &&
 444                             is_kernel_thread(pid) > 0)
 445                                 continue;
 446
 447                         r = cg_attach(cto, pto, pid);
 448                         if (r < 0) {
 449                                 if (ret >= 0 && r != -ESRCH)
 450                                         ret = r;
 451                         } else if (ret == 0)
 452                                 ret = 1;
 453
 454                         done = false;
 455
 456                         r = set_put(s, PID_TO_PTR(pid));
 457                         if (r < 0) {
 458                                 if (ret >= 0)
 459                                         return r;
 460
 461                                 return ret;
 462                         }
 463                 }
 464
 465                 if (r < 0) {
 466                         if (ret >= 0)
 467                                 return r;
 468
 469                         return ret;
 470                 }
 471         } while (!done);
 472
 473         return ret;
 474 }
 475
 476 int cg_migrate_recursive(
 477                 const char *cfrom,
 478                 const char *pfrom,
 479                 const char *cto,
 480                 const char *pto,
 481                 CGroupFlags flags) {
 482
 483         _cleanup_closedir_ DIR *d = NULL;
 484         int r, ret = 0;
 485         char *fn;
 486
 487         assert(cfrom);
 488         assert(pfrom);
 489         assert(cto);
 490         assert(pto);
 491
 492         ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
 493
 494         r = cg_enumerate_subgroups(cfrom, pfrom, &d);
 495         if (r < 0) {
 496                 if (ret >= 0 && r != -ENOENT)
 497                         return r;
 498
 499                 return ret;
 500         }
 501
 502         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 503                 _cleanup_free_ char *p = NULL;
 504
 505                 p = strjoin(pfrom, "/", fn);
 506                 free(fn);
 507                 if (!p)
 508                         return -ENOMEM;
 509
 510                 r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
 511                 if (r != 0 && ret >= 0)
 512                         ret = r;
 513         }
 514
 515         if (r < 0 && ret >= 0)
 516                 ret = r;
 517
 518         if (flags & CGROUP_REMOVE) {
 519                 r = cg_rmdir(cfrom, pfrom);
 520                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 521                         return r;
 522         }
 523
 524         return ret;
 525 }
 526
 527 int cg_migrate_recursive_fallback(
 528                 const char *cfrom,
 529                 const char *pfrom,
 530                 const char *cto,
 531                 const char *pto,
 532                 CGroupFlags flags) {
 533
 534         int r;
 535
 536         assert(cfrom);
 537         assert(pfrom);
 538         assert(cto);
 539         assert(pto);
 540
 541         r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
 542         if (r < 0) {
 543                 char prefix[strlen(pto) + 1];
 544
 545                 /* This didn't work? Then let's try all prefixes of the destination */
 546
 547                 PATH_FOREACH_PREFIX(prefix, pto) {
 548                         int q;
 549
 550                         q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
 551                         if (q >= 0)
 552                                 return q;
 553                 }
 554         }
 555
 556         return r;
 557 }
 558
 559 static const char *controller_to_dirname(const char *controller) {
 560         const char *e;
 561
 562         assert(controller);
 563
 564         /* Converts a controller name to the directory name below
 565          * /sys/fs/cgroup/ we want to mount it to. Effectively, this
 566          * just cuts off the name= prefixed used for named
 567          * hierarchies, if it is specified. */
 568
 569         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 570                 if (cg_hybrid_unified() > 0)
 571                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 572                 else
 573                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 574         }
 575
 576         e = startswith(controller, "name=");
 577         if (e)
 578                 return e;
 579
 580         return controller;
 581 }
 582
 583 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
 584         const char *dn;
 585         char *t = NULL;
 586
 587         assert(fs);
 588         assert(controller);
 589
 590         dn = controller_to_dirname(controller);
 591
 592         if (isempty(path) && isempty(suffix))
 593                 t = strappend("/sys/fs/cgroup/", dn);
 594         else if (isempty(path))
 595                 t = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
 596         else if (isempty(suffix))
 597                 t = strjoin("/sys/fs/cgroup/", dn, "/", path);
 598         else
 599                 t = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
 600         if (!t)
 601                 return -ENOMEM;
 602
 603         *fs = t;
 604         return 0;
 605 }
 606
 607 static int join_path_unified(const char *path, const char *suffix, char **fs) {
 608         char *t;
 609
 610         assert(fs);
 611
 612         if (isempty(path) && isempty(suffix))
 613                 t = strdup("/sys/fs/cgroup");
 614         else if (isempty(path))
 615                 t = strappend("/sys/fs/cgroup/", suffix);
 616         else if (isempty(suffix))
 617                 t = strappend("/sys/fs/cgroup/", path);
 618         else
 619                 t = strjoin("/sys/fs/cgroup/", path, "/", suffix);
 620         if (!t)
 621                 return -ENOMEM;
 622
 623         *fs = t;
 624         return 0;
 625 }
 626
 627 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
 628         int r;
 629
 630         assert(fs);
 631
 632         if (!controller) {
 633                 char *t;
 634
 635                 /* If no controller is specified, we return the path
 636                  * *below* the controllers, without any prefix. */
 637
 638                 if (!path && !suffix)
 639                         return -EINVAL;
 640
 641                 if (!suffix)
 642                         t = strdup(path);
 643                 else if (!path)
 644                         t = strdup(suffix);
 645                 else
 646                         t = strjoin(path, "/", suffix);
 647                 if (!t)
 648                         return -ENOMEM;
 649
 650                 *fs = path_kill_slashes(t);
 651                 return 0;
 652         }
 653
 654         if (!cg_controller_is_valid(controller))
 655                 return -EINVAL;
 656
 657         r = cg_all_unified();
 658         if (r < 0)
 659                 return r;
 660         if (r > 0)
 661                 r = join_path_unified(path, suffix, fs);
 662         else
 663                 r = join_path_legacy(controller, path, suffix, fs);
 664         if (r < 0)
 665                 return r;
 666
 667         path_kill_slashes(*fs);
 668         return 0;
 669 }
 670
 671 static int controller_is_accessible(const char *controller) {
 672         int r;
 673
 674         assert(controller);
 675
 676         /* Checks whether a specific controller is accessible,
 677          * i.e. its hierarchy mounted. In the unified hierarchy all
 678          * controllers are considered accessible, except for the named
 679          * hierarchies */
 680
 681         if (!cg_controller_is_valid(controller))
 682                 return -EINVAL;
 683
 684         r = cg_all_unified();
 685         if (r < 0)
 686                 return r;
 687         if (r > 0) {
 688                 /* We don't support named hierarchies if we are using
 689                  * the unified hierarchy. */
 690
 691                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 692                         return 0;
 693
 694                 if (startswith(controller, "name="))
 695                         return -EOPNOTSUPP;
 696
 697         } else {
 698                 const char *cc, *dn;
 699
 700                 dn = controller_to_dirname(controller);
 701                 cc = strjoina("/sys/fs/cgroup/", dn);
 702
 703                 if (laccess(cc, F_OK) < 0)
 704                         return -errno;
 705         }
 706
 707         return 0;
 708 }
 709
 710 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
 711         int r;
 712
 713         assert(controller);
 714         assert(fs);
 715
 716         /* Check if the specified controller is actually accessible */
 717         r = controller_is_accessible(controller);
 718         if (r < 0)
 719                 return r;
 720
 721         return cg_get_path(controller, path, suffix, fs);
 722 }
 723
 724 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
 725         assert(path);
 726         assert(sb);
 727         assert(ftwbuf);
 728
 729         if (typeflag != FTW_DP)
 730                 return 0;
 731
 732         if (ftwbuf->level < 1)
 733                 return 0;
 734
 735         (void) rmdir(path);
 736         return 0;
 737 }
 738
 739 int cg_trim(const char *controller, const char *path, bool delete_root) {
 740         _cleanup_free_ char *fs = NULL;
 741         int r = 0, q;
 742
 743         assert(path);
 744
 745         r = cg_get_path(controller, path, NULL, &fs);
 746         if (r < 0)
 747                 return r;
 748
 749         errno = 0;
 750         if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
 751                 if (errno == ENOENT)
 752                         r = 0;
 753                 else if (errno > 0)
 754                         r = -errno;
 755                 else
 756                         r = -EIO;
 757         }
 758
 759         if (delete_root) {
 760                 if (rmdir(fs) < 0 && errno != ENOENT)
 761                         return -errno;
 762         }
 763
 764         q = cg_hybrid_unified();
 765         if (q < 0)
 766                 return q;
 767         if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 768                 q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
 769                 if (q < 0)
 770                         log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
 771         }
 772
 773         return r;
 774 }
 775
 776 int cg_create(const char *controller, const char *path) {
 777         _cleanup_free_ char *fs = NULL;
 778         int r;
 779
 780         r = cg_get_path_and_check(controller, path, NULL, &fs);
 781         if (r < 0)
 782                 return r;
 783
 784         r = mkdir_parents(fs, 0755);
 785         if (r < 0)
 786                 return r;
 787
 788         if (mkdir(fs, 0755) < 0) {
 789
 790                 if (errno == EEXIST)
 791                         return 0;
 792
 793                 return -errno;
 794         }
 795
 796         r = cg_hybrid_unified();
 797         if (r < 0)
 798                 return r;
 799
 800         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 801                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 802                 if (r < 0)
 803                         log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
 804         }
 805
 806         return 1;
 807 }
 808
 809 int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
 810         int r, q;
 811
 812         assert(pid >= 0);
 813
 814         r = cg_create(controller, path);
 815         if (r < 0)
 816                 return r;
 817
 818         q = cg_attach(controller, path, pid);
 819         if (q < 0)
 820                 return q;
 821
 822         /* This does not remove the cgroup on failure */
 823         return r;
 824 }
 825
 826 int cg_attach(const char *controller, const char *path, pid_t pid) {
 827         _cleanup_free_ char *fs = NULL;
 828         char c[DECIMAL_STR_MAX(pid_t) + 2];
 829         int r;
 830
 831         assert(path);
 832         assert(pid >= 0);
 833
 834         r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
 835         if (r < 0)
 836                 return r;
 837
 838         if (pid == 0)
 839                 pid = getpid_cached();
 840
 841         xsprintf(c, PID_FMT "\n", pid);
 842
 843         r = write_string_file(fs, c, 0);
 844         if (r < 0)
 845                 return r;
 846
 847         r = cg_hybrid_unified();
 848         if (r < 0)
 849                 return r;
 850
 851         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 852                 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
 853                 if (r < 0)
 854                         log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
 855         }
 856
 857         return 0;
 858 }
 859
 860 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
 861         int r;
 862
 863         assert(controller);
 864         assert(path);
 865         assert(pid >= 0);
 866
 867         r = cg_attach(controller, path, pid);
 868         if (r < 0) {
 869                 char prefix[strlen(path) + 1];
 870
 871                 /* This didn't work? Then let's try all prefixes of
 872                  * the destination */
 873
 874                 PATH_FOREACH_PREFIX(prefix, path) {
 875                         int q;
 876
 877                         q = cg_attach(controller, prefix, pid);
 878                         if (q >= 0)
 879                                 return q;
 880                 }
 881         }
 882
 883         return r;
 884 }
 885
 886 #if 0 /// UNNEEDED by elogind
 887 int cg_set_access(
 888                 const char *controller,
 889                 const char *path,
 890                 uid_t uid,
 891                 gid_t gid) {
 892
 893         struct Attribute {
 894                 const char *name;
 895                 bool fatal;
 896         };
 897
 898         /* cgroupsv1, aka legacy/non-unified */
 899         static const struct Attribute legacy_attributes[] = {
 900                 { "cgroup.procs",           true  },
 901                 { "tasks",                  false },
 902                 { "cgroup.clone_children",  false },
 903                 {},
 904         };
 905
 906         /* cgroupsv2, aka unified */
 907         static const struct Attribute unified_attributes[] = {
 908                 { "cgroup.procs",           true  },
 909                 { "cgroup.subtree_control", true  },
 910                 { "cgroup.threads",         false },
 911                 {},
 912         };
 913
 914         static const struct Attribute* const attributes[] = {
 915                 [false] = legacy_attributes,
 916                 [true]  = unified_attributes,
 917         };
 918
 919         _cleanup_free_ char *fs = NULL;
 920         const struct Attribute *i;
 921         int r, unified;
 922
 923         assert(path);
 924
 925         if (uid == UID_INVALID && gid == GID_INVALID)
 926                 return 0;
 927
 928         unified = cg_unified_controller(controller);
 929         if (unified < 0)
 930                 return unified;
 931
 932         /* Configure access to the cgroup itself */
 933         r = cg_get_path(controller, path, NULL, &fs);
 934         if (r < 0)
 935                 return r;
 936
 937         r = chmod_and_chown(fs, 0755, uid, gid);
 938         if (r < 0)
 939                 return r;
 940
 941         /* Configure access to the cgroup's attributes */
 942         for (i = attributes[unified]; i->name; i++) {
 943                 fs = mfree(fs);
 944
 945                 r = cg_get_path(controller, path, i->name, &fs);
 946                 if (r < 0)
 947                         return r;
 948
 949                 r = chmod_and_chown(fs, 0644, uid, gid);
 950                 if (r < 0) {
 951                         if (i->fatal)
 952                                 return r;
 953
 954                         log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
 955                 }
 956         }
 957
 958         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 959                 r = cg_hybrid_unified();
 960                 if (r < 0)
 961                         return r;
 962                 if (r > 0) {
 963                         /* Always propagate access mode from unified to legacy controller */
 964                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
 965                         if (r < 0)
 966                                 log_debug_errno(r, "Failed to set access on compatibility elogind cgroup %s, ignoring: %m", path);
 967                 }
 968         }
 969
 970         return 0;
 971 }
 972
 973 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
 974         _cleanup_free_ char *fs = NULL;
 975         int r;
 976
 977         assert(path);
 978         assert(name);
 979         assert(value || size <= 0);
 980
 981         r = cg_get_path(controller, path, NULL, &fs);
 982         if (r < 0)
 983                 return r;
 984
 985         if (setxattr(fs, name, value, size, flags) < 0)
 986                 return -errno;
 987
 988         return 0;
 989 }
 990
 991 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
 992         _cleanup_free_ char *fs = NULL;
 993         ssize_t n;
 994         int r;
 995
 996         assert(path);
 997         assert(name);
 998
 999         r = cg_get_path(controller, path, NULL, &fs);
1000         if (r < 0)
1001                 return r;
1002
1003         n = getxattr(fs, name, value, size);
1004         if (n < 0)
1005                 return -errno;
1006
1007         return (int) n;
1008 }
1009 #endif // 0
1010
/* Looks up the cgroup path of process "pid" for "controller" by scanning the
 * process' "cgroup" file in procfs.
 *
 * controller: controller to look for; NULL selects SYSTEMD_CGROUP_CONTROLLER.
 * pid:        process to query (must be >= 0).
 * path:       on success receives a newly allocated string; the caller frees it.
 *
 * Returns 0 on success, -EINVAL for an invalid controller name, -ESRCH if the
 * procfs file does not exist, -ENODATA if no matching line was found, -ENOMEM
 * on allocation failure, or another negative errno-style value. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
#if 0 /// At elogind we do not want that (false alarm) "maybe uninitialized" warning
        const char *fs, *controller_str;
#else
        const char *fs, *controller_str = NULL;
#endif // 0
        size_t cs = 0;
        int unified;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* Legacy hierarchy: determine the name to match against the
                 * controller list of each line, and cache its length. */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        log_debug_elogind("Searching for PID %u in \"%s\" (controller \"%s\")",
                          pid, fs, controller);
        f = fopen(fs, "re");
        if (!f)
                return errno == ENOENT ? -ESRCH : -errno;

        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);

        FOREACH_LINE(line, f, return -errno) {
                char *e, *p;

                truncate_nl(line);

                if (unified) {
                        /* Unified hierarchy: only the line starting with "0:" is
                         * of interest; the path follows the next ':'. */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* Legacy hierarchy: skip the first ':'-separated field,
                         * then treat the second one as a ','-separated list of
                         * controller names. */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* Terminate the controller list in place so that it can
                         * be split; the path starts at e + 1. */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state) {
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }
                        }

                        if (!found)
                                continue;
                }

                log_debug_elogind("Found %s:%s", line, e+1);
                p = strdup(e + 1);
                if (!p)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");
                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        /* Reached the end of the file without a matching line. */
        return -ENODATA;
}
1108
1109 #if 0 /// UNNEEDED by elogind
1110 int cg_install_release_agent(const char *controller, const char *agent) {
1111         _cleanup_free_ char *fs = NULL, *contents = NULL;
1112         const char *sc;
1113         int r;
1114
1115         assert(agent);
1116
1117         r = cg_unified_controller(controller);
1118         if (r < 0)
1119                 return r;
1120         if (r > 0) /* doesn't apply to unified hierarchy */
1121                 return -EOPNOTSUPP;
1122
1123         r = cg_get_path(controller, NULL, "release_agent", &fs);
1124         if (r < 0)
1125                 return r;
1126
1127         r = read_one_line_file(fs, &contents);
1128         if (r < 0)
1129                 return r;
1130
1131         sc = strstrip(contents);
1132         if (isempty(sc)) {
1133                 r = write_string_file(fs, agent, 0);
1134                 if (r < 0)
1135                         return r;
1136         } else if (!path_equal(sc, agent))
1137                 return -EEXIST;
1138
1139         fs = mfree(fs);
1140         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1141         if (r < 0)
1142                 return r;
1143
1144         contents = mfree(contents);
1145         r = read_one_line_file(fs, &contents);
1146         if (r < 0)
1147                 return r;
1148
1149         sc = strstrip(contents);
1150         if (streq(sc, "0")) {
1151                 r = write_string_file(fs, "1", 0);
1152                 if (r < 0)
1153                         return r;
1154
1155                 return 1;
1156         }
1157
1158         if (!streq(sc, "1"))
1159                 return -EIO;
1160
1161         return 0;
1162 }
1163
1164 int cg_uninstall_release_agent(const char *controller) {
1165         _cleanup_free_ char *fs = NULL;
1166         int r;
1167
1168         r = cg_unified_controller(controller);
1169         if (r < 0)
1170                 return r;
1171         if (r > 0) /* Doesn't apply to unified hierarchy */
1172                 return -EOPNOTSUPP;
1173
1174         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1175         if (r < 0)
1176                 return r;
1177
1178         r = write_string_file(fs, "0", 0);
1179         if (r < 0)
1180                 return r;
1181
1182         fs = mfree(fs);
1183
1184         r = cg_get_path(controller, NULL, "release_agent", &fs);
1185         if (r < 0)
1186                 return r;
1187
1188         r = write_string_file(fs, "", 0);
1189         if (r < 0)
1190                 return r;
1191
1192         return 0;
1193 }
1194 #endif // 0
1195
1196 int cg_is_empty(const char *controller, const char *path) {
1197         _cleanup_fclose_ FILE *f = NULL;
1198         pid_t pid;
1199         int r;
1200
1201         assert(path);
1202
1203         r = cg_enumerate_processes(controller, path, &f);
1204         if (r == -ENOENT)
1205                 return 1;
1206         if (r < 0)
1207                 return r;
1208
1209         r = cg_read_pid(f, &pid);
1210         if (r < 0)
1211                 return r;
1212
1213         return r == 0;
1214 }
1215
1216 int cg_is_empty_recursive(const char *controller, const char *path) {
1217         int r;
1218
1219         assert(path);
1220
1221         /* The root cgroup is always populated */
1222         if (controller && (isempty(path) || path_equal(path, "/")))
1223                 return false;
1224
1225         r = cg_unified_controller(controller);
1226         if (r < 0)
1227                 return r;
1228         if (r > 0) {
1229                 _cleanup_free_ char *t = NULL;
1230
1231                 /* On the unified hierarchy we can check empty state
1232                  * via the "populated" attribute of "cgroup.events". */
1233
1234                 r = cg_read_event(controller, path, "populated", &t);
1235                 if (r < 0)
1236                         return r;
1237
1238                 return streq(t, "0");
1239         } else {
1240                 _cleanup_closedir_ DIR *d = NULL;
1241                 char *fn;
1242
1243                 r = cg_is_empty(controller, path);
1244                 if (r <= 0)
1245                         return r;
1246
1247                 r = cg_enumerate_subgroups(controller, path, &d);
1248                 if (r == -ENOENT)
1249                         return 1;
1250                 if (r < 0)
1251                         return r;
1252
1253                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1254                         _cleanup_free_ char *p = NULL;
1255
1256                         p = strjoin(path, "/", fn);
1257                         free(fn);
1258                         if (!p)
1259                                 return -ENOMEM;
1260
1261                         r = cg_is_empty_recursive(controller, p);
1262                         if (r <= 0)
1263                                 return r;
1264                 }
1265                 if (r < 0)
1266                         return r;
1267
1268                 return true;
1269         }
1270 }
1271
/* Splits a cgroup specification into its controller and path parts. Three forms
 * are understood:
 *
 *   "/some/path"            -> no controller, path only
 *   "controller"            -> controller only, no path
 *   "controller:/some/path" -> both (the path may be empty)
 *
 * "controller" and "path" may each be NULL if the caller is not interested in
 * that part; otherwise they receive a newly allocated string or NULL. Returns 0
 * on success, -EINVAL if the specification is malformed, -ENOMEM on allocation
 * failure. */
int cg_split_spec(const char *spec, char **controller, char **path) {
        char *ctrl = NULL, *cgpath = NULL;
        const char *colon;
        int r;

        assert(spec);

        /* Form 1: a plain path */
        if (spec[0] == '/') {
                if (!path_is_normalized(spec))
                        return -EINVAL;

                if (path) {
                        cgpath = strdup(spec);
                        if (!cgpath)
                                return -ENOMEM;

                        *path = path_kill_slashes(cgpath);
                }

                if (controller)
                        *controller = NULL;

                return 0;
        }

        /* Form 2: a plain controller name */
        colon = strchr(spec, ':');
        if (!colon) {
                if (!cg_controller_is_valid(spec))
                        return -EINVAL;

                if (controller) {
                        ctrl = strdup(spec);
                        if (!ctrl)
                                return -ENOMEM;

                        *controller = ctrl;
                }

                if (path)
                        *path = NULL;

                return 0;
        }

        /* Form 3: controller and path, separated by a colon */
        ctrl = strndup(spec, colon - spec);
        if (!ctrl)
                return -ENOMEM;

        if (!cg_controller_is_valid(ctrl)) {
                r = -EINVAL;
                goto fail;
        }

        if (!isempty(colon + 1)) {
                cgpath = strdup(colon + 1);
                if (!cgpath) {
                        r = -ENOMEM;
                        goto fail;
                }

                if (!(path_is_normalized(cgpath) && path_is_absolute(cgpath))) {
                        r = -EINVAL;
                        goto fail;
                }

                path_kill_slashes(cgpath);
        }

        /* Hand over what the caller asked for, drop the rest. */
        if (controller)
                *controller = ctrl;
        else
                free(ctrl);

        if (path)
                *path = cgpath;
        else
                free(cgpath);

        return 0;

fail:
        free(cgpath);
        free(ctrl);
        return r;
}
1354
1355 int cg_mangle_path(const char *path, char **result) {
1356         _cleanup_free_ char *c = NULL, *p = NULL;
1357         char *t;
1358         int r;
1359
1360         assert(path);
1361         assert(result);
1362
1363         /* First, check if it already is a filesystem path */
1364         if (path_startswith(path, "/sys/fs/cgroup")) {
1365
1366                 t = strdup(path);
1367                 if (!t)
1368                         return -ENOMEM;
1369
1370                 *result = path_kill_slashes(t);
1371                 return 0;
1372         }
1373
1374         /* Otherwise, treat it as cg spec */
1375         r = cg_split_spec(path, &c, &p);
1376         if (r < 0)
1377                 return r;
1378
1379         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1380 }
1381
1382 int cg_get_root_path(char **path) {
1383         char *p, *e;
1384         int r;
1385
1386         assert(path);
1387
1388         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1389         if (r < 0)
1390                 return r;
1391
1392 #if 0 /// elogind does not support systemd scopes and slices
1393         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1394         if (!e)
1395                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1396         if (!e)
1397                 e = endswith(p, "/system"); /* even more legacy */
1398 #else
1399         e = endswith(p, "/elogind");
1400 #endif // 0
1401         if (e)
1402                 *e = 0;
1403
1404         *path = p;
1405         return 0;
1406 }
1407
1408 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1409         _cleanup_free_ char *rt = NULL;
1410         char *p;
1411         int r;
1412
1413         assert(cgroup);
1414         assert(shifted);
1415
1416         if (!root) {
1417                 /* If the root was specified let's use that, otherwise
1418                  * let's determine it from PID 1 */
1419
1420                 r = cg_get_root_path(&rt);
1421                 if (r < 0)
1422                         return r;
1423
1424                 root = rt;
1425                 log_debug_elogind("Determined root path: \"%s\"", root);
1426         }
1427
1428         p = path_startswith(cgroup, root);
1429 #if 0 /// With other controllers, elogind might end up in /elogind, and *p is 0
1430         if (p && p > cgroup)
1431 #else
1432         if (p && p[0] && (p > cgroup))
1433 #endif // 0
1434                 *shifted = p - 1;
1435         else
1436                 *shifted = cgroup;
1437
1438         return 0;
1439 }
1440
1441 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1442         _cleanup_free_ char *raw = NULL;
1443         const char *c;
1444         int r;
1445
1446         assert(pid >= 0);
1447         assert(cgroup);
1448
1449         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1450         if (r < 0)
1451                 return r;
1452
1453         log_debug_elogind("Shifting path: \"%s\" (PID %u, root: \"%s\")",
1454                           raw, pid, root ? root : "NULL");
1455         r = cg_shift_path(raw, root, &c);
1456         if (r < 0)
1457                 return r;
1458
1459         if (c == raw) {
1460                 *cgroup = raw;
1461                 raw = NULL;
1462         } else {
1463                 char *n;
1464
1465                 n = strdup(c);
1466                 if (!n)
1467                         return -ENOMEM;
1468
1469                 *cgroup = n;
1470         }
1471         log_debug_elogind("Resulting cgroup:\"%s\"", *cgroup);
1472
1473         return 0;
1474 }
1475
1476 #if 0 /// UNNEEDED by elogind
1477 int cg_path_decode_unit(const char *cgroup, char **unit) {
1478         char *c, *s;
1479         size_t n;
1480
1481         assert(cgroup);
1482         assert(unit);
1483
1484         n = strcspn(cgroup, "/");
1485         if (n < 3)
1486                 return -ENXIO;
1487
1488         c = strndupa(cgroup, n);
1489         c = cg_unescape(c);
1490
1491         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1492                 return -ENXIO;
1493
1494         s = strdup(c);
1495         if (!s)
1496                 return -ENOMEM;
1497
1498         *unit = s;
1499         return 0;
1500 }
1501
1502 static bool valid_slice_name(const char *p, size_t n) {
1503
1504         if (!p)
1505                 return false;
1506
1507         if (n < STRLEN("x.slice"))
1508                 return false;
1509
1510         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1511                 char buf[n+1], *c;
1512
1513                 memcpy(buf, p, n);
1514                 buf[n] = 0;
1515
1516                 c = cg_unescape(buf);
1517
1518                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1519         }
1520
1521         return false;
1522 }
1523
/* Skips over all leading slice components of "p" and returns a pointer to the
 * first component that is not a valid slice name (leading slashes already
 * skipped). */
static const char *skip_slices(const char *p) {
        size_t len;
        bool is_slice;

        assert(p);

        do {
                p += strspn(p, "/");

                len = strcspn(p, "/");
                is_slice = valid_slice_name(p, len);
                if (is_slice)
                        p += len;
        } while (is_slice);

        return p;
}
1541
/* Extracts the unit name from a cgroup path: leading slices are skipped and the
 * next component is decoded. On success *ret receives a newly allocated string.
 * Returns 0 on success, -ENXIO if the decoded name is itself a slice, or the
 * error returned by cg_path_decode_unit(). */
int cg_path_get_unit(const char *path, char **ret) {
        char *decoded;
        int r;

        assert(path);
        assert(ret);

        r = cg_path_decode_unit(skip_slices(path), &decoded);
        if (r < 0)
                return r;

        /* We skipped over the slices, don't accept any now */
        if (!endswith(decoded, ".slice")) {
                *ret = decoded;
                return 0;
        }

        free(decoded);
        return -ENXIO;
}
1565
1566 int cg_pid_get_unit(pid_t pid, char **unit) {
1567         _cleanup_free_ char *cgroup = NULL;
1568         int r;
1569
1570         assert(unit);
1571
1572         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1573         if (r < 0)
1574                 return r;
1575
1576         return cg_path_get_unit(cgroup, unit);
1577 }
1578
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer to what follows the leading "session-<id>.scope" component
 * (with any slashes after it skipped as well), or NULL if "p" is empty, the
 * component has a different form, or <id> is not a valid session id.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                /* Room for the id between prefix and suffix, plus NUL */
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence report failure as
                 * NULL rather than as the boolean "false". */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1615
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to what follows the leading "user@<uid>.service" component
 * (slashes after it skipped, too), or NULL if there is no such component.
 */
static const char *skip_user_manager(const char *p) {
        size_t len, uid_len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        /* What remains between "user@" and ".service" */
        uid_len = len - 5 - 8;

        if (memcmp(p, "user@", 5) == 0 && memcmp(p + len - 8, ".service", 8) == 0) {
                char uid[uid_len + 1];

                memcpy(uid, p + 5, uid_len);
                uid[uid_len] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid, NULL) < 0)
                        return NULL;

                p += len;
                return p + strspn(p, "/");
        }

        return NULL;
}
1653
/* Skips the slices at the start of "path" and then either a user manager
 * service or a session scope. Returns a pointer to what follows, or NULL if
 * neither of the two is there. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Skip slices, if there are any */
        after_slices = skip_slices(path);

        /* Skip the user manager if it's in the path now, alternatively
         * skip the user session if that is in the path... */
        after_manager = skip_user_manager(after_slices);
        return after_manager ? after_manager : skip_session(after_slices);
}
1670
/* Extracts the user unit name from a cgroup path. Returns -ENXIO if the path
 * carries neither a user manager nor a session prefix, otherwise whatever
 * cg_path_get_unit() returns for the remainder. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);

        /* Past the prefix it looks pretty much the same as for a
         * system unit, hence let's use the same parser from here
         * on. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1686
1687 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1688         _cleanup_free_ char *cgroup = NULL;
1689         int r;
1690
1691         assert(unit);
1692
1693         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1694         if (r < 0)
1695                 return r;
1696
1697         return cg_path_get_user_unit(cgroup, unit);
1698 }
1699
1700 int cg_path_get_machine_name(const char *path, char **machine) {
1701         _cleanup_free_ char *u = NULL;
1702         const char *sl;
1703         int r;
1704
1705         r = cg_path_get_unit(path, &u);
1706         if (r < 0)
1707                 return r;
1708
1709         sl = strjoina("/run/systemd/machines/unit:", u);
1710         return readlink_malloc(sl, machine);
1711 }
1712
1713 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1714         _cleanup_free_ char *cgroup = NULL;
1715         int r;
1716
1717         assert(machine);
1718
1719         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1720         if (r < 0)
1721                 return r;
1722
1723         return cg_path_get_machine_name(cgroup, machine);
1724 }
1725 #endif // 0
1726
/* Extracts the session id from a cgroup path.
 *
 * In the elogind build the path must start with '/'; the first component after
 * that slash, unescaped with cg_unescape(), is the session id. If "session" is
 * non-NULL it receives a newly allocated copy of the id; if it is NULL the path
 * is only checked.
 *
 * Returns 0 on success, -ENOENT if the first component is empty (before or
 * after unescaping), -ENOMEM on allocation failure. */
int cg_path_get_session(const char *path, char **session) {
#if 0 /// UNNEEDED by elogind
        _cleanup_free_ char *unit = NULL;
        char *start, *end;
        int r;

        assert(path);

        r = cg_path_get_unit(path, &unit);
        if (r < 0)
                return r;

        start = startswith(unit, "session-");
        if (!start)
                return -ENXIO;
        end = endswith(start, ".scope");
        if (!end)
                return -ENXIO;

        *end = 0;
        if (!session_id_valid(start))
                return -ENXIO;
#else
        /* Elogind uses a flat hierarchy, just "/SESSION".  The only
           wrinkle is that SESSION might be escaped.  */
        const char *e, *n, *start;

        assert(path);
        log_debug_elogind("path is \"%s\"", path);
        assert(path[0] == '/');

        /* e is the start of the first component, n its end (the next '/' or
         * the terminating NUL). */
        e = path + 1;
        n = strchrnul(e, '/');
        if (e == n)
                return -ENOENT;

        /* Work on a temporary copy of the component, the caller's string is
         * left untouched. */
        start = strndupa(e, n - e);
        start = cg_unescape(start);

        if (!start[0])
                return -ENOENT;
#endif // 0

        if (session) {
                char *rr;

                log_debug_elogind("found session: \"%s\"", start);
                rr = strdup(start);
                if (!rr)
                        return -ENOMEM;

                *session = rr;
        }

        return 0;
}
1783
1784 int cg_pid_get_session(pid_t pid, char **session) {
1785         _cleanup_free_ char *cgroup = NULL;
1786         int r;
1787
1788         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1789         if (r < 0)
1790                 return r;
1791
1792         return cg_path_get_session(cgroup, session);
1793 }
1794
1795 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1796 #if 0 /// elogind needs one more value
1797         _cleanup_free_ char *slice = NULL;
1798         char *start, *end;
1799 #else
1800         _cleanup_free_ char *slice = NULL, *p = NULL, *s = NULL;
1801 #endif // 0
1802         int r;
1803
1804         assert(path);
1805
1806         r = cg_path_get_slice(path, &slice);
1807         if (r < 0)
1808                 return r;
1809
1810 #if 0 /// elogind does not support systemd slices
1811         start = startswith(slice, "user-");
1812         if (!start)
1813                 return -ENXIO;
1814         end = endswith(start, ".slice");
1815         if (!end)
1816                 return -ENXIO;
1817
1818         *end = 0;
1819         if (parse_uid(start, uid) < 0)
1820                 return -ENXIO;
1821 #else
1822         p = strappend("/run/systemd/sessions/", slice);
1823
1824         r = parse_env_file(p, NEWLINE, "UID", &s, NULL);
1825         if (r == -ENOENT)
1826                 return -ENXIO;
1827         if (r < 0)
1828                 return r;
1829         if (isempty(s))
1830                 return -EIO;
1831
1832         if (parse_uid(s, uid) < 0)
1833                 return -ENXIO;
1834 #endif // 0
1835
1836         return 0;
1837 }
1838
1839 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1840         _cleanup_free_ char *cgroup = NULL;
1841         int r;
1842
1843         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1844         if (r < 0)
1845                 return r;
1846
1847         return cg_path_get_owner_uid(cgroup, uid);
1848 }
1849
/* Determines the "slice" of the cgroup path "p".
 *
 * In the elogind build this is a copy of "p" with one leading '/' removed, if
 * there is one. On success *slice receives a newly allocated string. Returns 0
 * on success, -ENOMEM on allocation failure. */
int cg_path_get_slice(const char *p, char **slice) {
        const char *e = NULL;

        assert(p);
        assert(slice);

#if 0 /// elogind does not support systemd slices
        /* Finds the right-most slice unit from the beginning, but
         * stops before we come to the first non-slice unit. */

        for (;;) {
                size_t n;

                p += strspn(p, "/");

                n = strcspn(p, "/");
                if (!valid_slice_name(p, n)) {

                        if (!e) {
                                char *s;

                                s = strdup(SPECIAL_ROOT_SLICE);
                                if (!s)
                                        return -ENOMEM;

                                *slice = s;
                                return 0;
                        }

                        return cg_path_decode_unit(e, slice);
                }

                e = p;
                p += n;
        }
#else
        /* In elogind, what is reported here, is the location of
         * the session. This is derived from /proc/<self|PID>/cgroup.
         * In there we look at the controller, which will look something
         * like "1:name=openrc:/3".
         * The last part gets extracted (and is now p), which is "/3" in
         * this case. The three is the session id, and that can be mapped.
         */
        e = startswith(p, "/");

        *slice = strdup(e ? e : p);
        /* Never report success with a NULL result. */
        if (!*slice)
                return -ENOMEM;

        return 0;
#endif // 0
}
1903
1904 int cg_pid_get_slice(pid_t pid, char **slice) {
1905         _cleanup_free_ char *cgroup = NULL;
1906         int r;
1907
1908         assert(slice);
1909
1910         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1911         log_debug_elogind("Found cgroup %s for pid %u (result %d)",
1912                           cgroup, pid, r);
1913         if (r < 0)
1914                 return r;
1915
1916         return cg_path_get_slice(cgroup, slice);
1917 }
1918
/* Determines the user slice of the cgroup path "p". In the elogind build no
 * prefix is skipped and the call is forwarded to cg_path_get_slice() as is;
 * the result and return value are the ones of that function. */
int cg_path_get_user_slice(const char *p, char **slice) {
#if 0 /// UNNEEDED by elogind
        const char *t;
#endif // 0
        assert(p);
        assert(slice);

#if 0 /// nothing to skip in elogind
        t = skip_user_prefix(p);
        if (!t)
                return -ENXIO;
#endif // 0

#if 0 /// UNNEEDED by elogind
        /* And now it looks pretty much the same as for a system
         * slice, so let's just use the same parser from here on. */
        return cg_path_get_slice(t, slice);
#else
        /* In elogind there is nothing to skip, we can use the path
         * directly. Generally speaking this is always a session id
         * to user mapping. */
        return cg_path_get_slice(p, slice);
#endif // 0
}
1943
1944 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1945         _cleanup_free_ char *cgroup = NULL;
1946         int r;
1947
1948         assert(slice);
1949
1950         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1951         if (r < 0)
1952                 return r;
1953
1954         return cg_path_get_user_slice(cgroup, slice);
1955 }
1956
1957 char *cg_escape(const char *p) {
1958         bool need_prefix = false;
1959
1960         /* This implements very minimal escaping for names to be used
1961          * as file names in the cgroup tree: any name which might
1962          * conflict with a kernel name or is prefixed with '_' is
1963          * prefixed with a '_'. That way, when reading cgroup names it
1964          * is sufficient to remove a single prefixing underscore if
1965          * there is one. */
1966
1967         /* The return value of this function (unlike cg_unescape())
1968          * needs free()! */
1969
1970         if (IN_SET(p[0], 0, '_', '.') ||
1971             streq(p, "notify_on_release") ||
1972             streq(p, "release_agent") ||
1973             streq(p, "tasks") ||
1974             startswith(p, "cgroup."))
1975                 need_prefix = true;
1976         else {
1977                 const char *dot;
1978
1979                 dot = strrchr(p, '.');
1980                 if (dot) {
1981                         CGroupController c;
1982                         size_t l = dot - p;
1983
1984                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1985                                 const char *n;
1986
1987                                 n = cgroup_controller_to_string(c);
1988
1989                                 if (l != strlen(n))
1990                                         continue;
1991
1992                                 if (memcmp(p, n, l) != 0)
1993                                         continue;
1994
1995                                 need_prefix = true;
1996                                 break;
1997                         }
1998                 }
1999         }
2000
2001         if (need_prefix)
2002                 return strappend("_", p);
2003
2004         return strdup(p);
2005 }
2006
/* Undoes cg_escape(): skips one leading underscore, if there is one.
 *
 * The return value of this function (unlike cg_escape()) doesn't need free(),
 * as it points into "p". */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (p[0] == '_' ? p + 1 : p);
}
2018
2019 #define CONTROLLER_VALID                        \
2020         DIGITS LETTERS                          \
2021         "_"
2022
2023 bool cg_controller_is_valid(const char *p) {
2024         const char *t, *s;
2025
2026         if (!p)
2027                 return false;
2028
2029         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
2030                 return true;
2031
2032         s = startswith(p, "name=");
2033         if (s)
2034                 p = s;
2035
2036         if (IN_SET(*p, 0, '_'))
2037                 return false;
2038
2039         for (t = p; *t; t++)
2040                 if (!strchr(CONTROLLER_VALID, *t))
2041                         return false;
2042
2043         if (t - p > FILENAME_MAX)
2044                 return false;
2045
2046         return true;
2047 }
2048
2049 #if 0 /// UNNEEDED by elogind
2050 int cg_slice_to_path(const char *unit, char **ret) {
2051         _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
2052         const char *dash;
2053         int r;
2054
2055         assert(unit);
2056         assert(ret);
2057
2058         if (streq(unit, SPECIAL_ROOT_SLICE)) {
2059                 char *x;
2060
2061                 x = strdup("");
2062                 if (!x)
2063                         return -ENOMEM;
2064                 *ret = x;
2065                 return 0;
2066         }
2067
2068         if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
2069                 return -EINVAL;
2070
2071         if (!endswith(unit, ".slice"))
2072                 return -EINVAL;
2073
2074         r = unit_name_to_prefix(unit, &p);
2075         if (r < 0)
2076                 return r;
2077
2078         dash = strchr(p, '-');
2079
2080         /* Don't allow initial dashes */
2081         if (dash == p)
2082                 return -EINVAL;
2083
2084         while (dash) {
2085                 _cleanup_free_ char *escaped = NULL;
2086                 char n[dash - p + sizeof(".slice")];
2087
2088                 /* Don't allow trailing or double dashes */
2089                 if (IN_SET(dash[1], 0, '-'))
2090                         return -EINVAL;
2091
2092                 strcpy(stpncpy(n, p, dash - p), ".slice");
2093                 if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
2094                         return -EINVAL;
2095
2096                 escaped = cg_escape(n);
2097                 if (!escaped)
2098                         return -ENOMEM;
2099
2100                 if (!strextend(&s, escaped, "/", NULL))
2101                         return -ENOMEM;
2102
2103                 dash = strchr(dash+1, '-');
2104         }
2105
2106         e = cg_escape(unit);
2107         if (!e)
2108                 return -ENOMEM;
2109
2110         if (!strextend(&s, e, NULL))
2111                 return -ENOMEM;
2112
2113         *ret = s;
2114         s = NULL;
2115
2116         return 0;
2117 }
2118 #endif // 0
2119
2120 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2121         _cleanup_free_ char *p = NULL;
2122         int r;
2123
2124         r = cg_get_path(controller, path, attribute, &p);
2125         if (r < 0)
2126                 return r;
2127
2128         return write_string_file(p, value, 0);
2129 }
2130
2131 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2132         _cleanup_free_ char *p = NULL;
2133         int r;
2134
2135         r = cg_get_path(controller, path, attribute, &p);
2136         if (r < 0)
2137                 return r;
2138
2139         return read_one_line_file(p, ret);
2140 }
2141
2142 #if 0 /// UNNEEDED by elogind
2143 int cg_get_keyed_attribute(const char *controller, const char *path, const char *attribute, const char **keys, char **values) {
2144         _cleanup_free_ char *filename = NULL, *content = NULL;
2145         char *line, *p;
2146         int i, r;
2147
2148         for (i = 0; keys[i]; i++)
2149                 values[i] = NULL;
2150
2151         r = cg_get_path(controller, path, attribute, &filename);
2152         if (r < 0)
2153                 return r;
2154
2155         r = read_full_file(filename, &content, NULL);
2156         if (r < 0)
2157                 return r;
2158
2159         p = content;
2160         while ((line = strsep(&p, "\n"))) {
2161                 char *key;
2162
2163                 key = strsep(&line, " ");
2164
2165                 for (i = 0; keys[i]; i++) {
2166                         if (streq(key, keys[i])) {
2167                                 values[i] = strdup(line);
2168                                 break;
2169                         }
2170                 }
2171         }
2172
2173         for (i = 0; keys[i]; i++) {
2174                 if (!values[i]) {
2175                         for (i = 0; keys[i]; i++) {
2176                                 values[i] = mfree(values[i]);
2177                         }
2178                         return -ENOENT;
2179                 }
2180         }
2181
2182         return 0;
2183 }
2184
2185 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2186         CGroupController c;
2187         int r;
2188
2189         /* This one will create a cgroup in our private tree, but also
2190          * duplicate it in the trees specified in mask, and remove it
2191          * in all others */
2192
2193         /* First create the cgroup in our own hierarchy. */
2194         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2195         if (r < 0)
2196                 return r;
2197
2198         /* If we are in the unified hierarchy, we are done now */
2199         r = cg_all_unified();
2200         if (r < 0)
2201                 return r;
2202         if (r > 0)
2203                 return 0;
2204
2205         /* Otherwise, do the same in the other hierarchies */
2206         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2207                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2208                 const char *n;
2209
2210                 n = cgroup_controller_to_string(c);
2211
2212                 if (mask & bit)
2213                         (void) cg_create(n, path);
2214                 else if (supported & bit)
2215                         (void) cg_trim(n, path, true);
2216         }
2217
2218         return 0;
2219 }
2220
2221 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2222         CGroupController c;
2223         int r;
2224
2225         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2226         if (r < 0)
2227                 return r;
2228
2229         r = cg_all_unified();
2230         if (r < 0)
2231                 return r;
2232         if (r > 0)
2233                 return 0;
2234
2235         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2236                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2237                 const char *p = NULL;
2238
2239                 if (!(supported & bit))
2240                         continue;
2241
2242                 if (path_callback)
2243                         p = path_callback(bit, userdata);
2244
2245                 if (!p)
2246                         p = path;
2247
2248                 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2249         }
2250
2251         return 0;
2252 }
2253
2254 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2255         Iterator i;
2256         void *pidp;
2257         int r = 0;
2258
2259         SET_FOREACH(pidp, pids, i) {
2260                 pid_t pid = PTR_TO_PID(pidp);
2261                 int q;
2262
2263                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2264                 if (q < 0 && r >= 0)
2265                         r = q;
2266         }
2267
2268         return r;
2269 }
2270
2271 int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2272         CGroupController c;
2273         int r = 0, q;
2274
2275         if (!path_equal(from, to))  {
2276                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2277                 if (r < 0)
2278                         return r;
2279         }
2280
2281         q = cg_all_unified();
2282         if (q < 0)
2283                 return q;
2284         if (q > 0)
2285                 return r;
2286
2287         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2288                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2289                 const char *p = NULL;
2290
2291                 if (!(supported & bit))
2292                         continue;
2293
2294                 if (to_callback)
2295                         p = to_callback(bit, userdata);
2296
2297                 if (!p)
2298                         p = to;
2299
2300                 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2301         }
2302
2303         return 0;
2304 }
2305
2306 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2307         CGroupController c;
2308         int r, q;
2309
2310         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2311         if (r < 0)
2312                 return r;
2313
2314         q = cg_all_unified();
2315         if (q < 0)
2316                 return q;
2317         if (q > 0)
2318                 return r;
2319
2320         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2321                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2322
2323                 if (!(supported & bit))
2324                         continue;
2325
2326                 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2327         }
2328
2329         return 0;
2330 }
2331 #endif // 0
2332
2333 int cg_mask_to_string(CGroupMask mask, char **ret) {
2334         _cleanup_free_ char *s = NULL;
2335         size_t n = 0, allocated = 0;
2336         bool space = false;
2337         CGroupController c;
2338
2339         assert(ret);
2340
2341         if (mask == 0) {
2342                 *ret = NULL;
2343                 return 0;
2344         }
2345
2346         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2347                 const char *k;
2348                 size_t l;
2349
2350                 if (!(mask & CGROUP_CONTROLLER_TO_MASK(c)))
2351                         continue;
2352
2353                 k = cgroup_controller_to_string(c);
2354                 l = strlen(k);
2355
2356                 if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2357                         return -ENOMEM;
2358
2359                 if (space)
2360                         s[n] = ' ';
2361                 memcpy(s + n + space, k, l);
2362                 n += space + l;
2363
2364                 space = true;
2365         }
2366
2367         assert(s);
2368
2369         s[n] = 0;
2370         *ret = s;
2371         s = NULL;
2372
2373         return 0;
2374 }
2375
2376 int cg_mask_from_string(const char *value, CGroupMask *mask) {
2377         assert(mask);
2378         assert(value);
2379
2380         for (;;) {
2381                 _cleanup_free_ char *n = NULL;
2382                 CGroupController v;
2383                 int r;
2384
2385                 r = extract_first_word(&value, &n, NULL, 0);
2386                 if (r < 0)
2387                         return r;
2388                 if (r == 0)
2389                         break;
2390
2391                 v = cgroup_controller_from_string(n);
2392                 if (v < 0)
2393                         continue;
2394
2395                 *mask |= CGROUP_CONTROLLER_TO_MASK(v);
2396         }
2397         return 0;
2398 }
2399
2400 int cg_mask_supported(CGroupMask *ret) {
2401         CGroupMask mask = 0;
2402         int r;
2403
2404         /* Determines the mask of supported cgroup controllers. Only
2405          * includes controllers we can make sense of and that are
2406          * actually accessible. */
2407
2408         r = cg_all_unified();
2409         if (r < 0)
2410                 return r;
2411         if (r > 0) {
2412                 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2413
2414                 /* In the unified hierarchy we can read the supported
2415                  * and accessible controllers from a the top-level
2416                  * cgroup attribute */
2417
2418                 r = cg_get_root_path(&root);
2419                 if (r < 0)
2420                         return r;
2421
2422                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2423                 if (r < 0)
2424                         return r;
2425
2426                 r = read_one_line_file(path, &controllers);
2427                 if (r < 0)
2428                         return r;
2429
2430                 r = cg_mask_from_string(controllers, &mask);
2431                 if (r < 0)
2432                         return r;
2433
2434                 /* Currently, we support the cpu, memory, io and pids
2435                  * controller in the unified hierarchy, mask
2436                  * everything else off. */
2437                 mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
2438
2439         } else {
2440                 CGroupController c;
2441
2442                 /* In the legacy hierarchy, we check whether which
2443                  * hierarchies are mounted. */
2444
2445                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2446                         const char *n;
2447
2448                         n = cgroup_controller_to_string(c);
2449                         if (controller_is_accessible(n) >= 0)
2450                                 mask |= CGROUP_CONTROLLER_TO_MASK(c);
2451                 }
2452         }
2453
2454         *ret = mask;
2455         return 0;
2456 }
2457
2458 #if 0 /// UNNEEDED by elogind
2459 int cg_kernel_controllers(Set **ret) {
2460         _cleanup_set_free_free_ Set *controllers = NULL;
2461         _cleanup_fclose_ FILE *f = NULL;
2462         int r;
2463
2464         assert(ret);
2465
2466         /* Determines the full list of kernel-known controllers. Might
2467          * include controllers we don't actually support, arbitrary
2468          * named hierarchies and controllers that aren't currently
2469          * accessible (because not mounted). */
2470
2471         controllers = set_new(&string_hash_ops);
2472         if (!controllers)
2473                 return -ENOMEM;
2474
2475         f = fopen("/proc/cgroups", "re");
2476         if (!f) {
2477                 if (errno == ENOENT) {
2478                         *ret = NULL;
2479                         return 0;
2480                 }
2481
2482                 return -errno;
2483         }
2484
2485         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
2486
2487         /* Ignore the header line */
2488         (void) read_line(f, (size_t) -1, NULL);
2489
2490         for (;;) {
2491                 char *controller;
2492                 int enabled = 0;
2493
2494                 errno = 0;
2495                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2496
2497                         if (feof(f))
2498                                 break;
2499
2500                         if (ferror(f) && errno > 0)
2501                                 return -errno;
2502
2503                         return -EBADMSG;
2504                 }
2505
2506                 if (!enabled) {
2507                         free(controller);
2508                         continue;
2509                 }
2510
2511                 if (!cg_controller_is_valid(controller)) {
2512                         free(controller);
2513                         return -EBADMSG;
2514                 }
2515
2516                 r = set_consume(controllers, controller);
2517                 if (r < 0)
2518                         return r;
2519         }
2520
2521         *ret = controllers;
2522         controllers = NULL;
2523
2524         return 0;
2525 }
2526 #endif // 0
2527
2528 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2529
2530 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup v2 on /sys/fs/cgroup/systemd.  This
2531  * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2532  * /sys/fs/cgroup/systemd.  From v233 and on, the hybrid mode mountnbs v2 on /sys/fs/cgroup/unified and maintains
2533  * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2534  *
2535  * To keep live upgrade working, we detect and support v232 layout.  When v232 layout is detected, to keep cgroup v2
2536  * process management but disable the compat dual layout, we return %true on
2537  * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2538  */
2539 static thread_local bool unified_systemd_v232;
2540
2541 static int cg_unified_update(void) {
2542
2543         struct statfs fs;
2544
2545         /* Checks if we support the unified hierarchy. Returns an
2546          * error when the cgroup hierarchies aren't mounted yet or we
2547          * have any other trouble determining if the unified hierarchy
2548          * is supported. */
2549
2550         if (unified_cache >= CGROUP_UNIFIED_NONE)
2551                 return 0;
2552
2553         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2554                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\" failed: %m");
2555
2556         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2557                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2558                 unified_cache = CGROUP_UNIFIED_ALL;
2559 #if 0 /// The handling of cgroups is a bit different with elogind
2560         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2561 #else
2562         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)
2563               || F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2564 #endif // 0
2565                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2566                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2567                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for elogind controller");
2568                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2569                         unified_systemd_v232 = false;
2570                 } else {
2571 #if 0 /// There is no sub-grouping within elogind
2572                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2573                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2574
2575                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2576                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2577                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2578                                 unified_systemd_v232 = true;
2579                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2580                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2581                                 unified_cache = CGROUP_UNIFIED_NONE;
2582                         } else {
2583                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2584                                           (unsigned long long) fs.f_type);
2585                                 unified_cache = CGROUP_UNIFIED_NONE;
2586                         }
2587 #else
2588                         unified_cache = CGROUP_UNIFIED_NONE;
2589 #endif // 0
2590                 }
2591         } else {
2592                 log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2593                           (unsigned long long) fs.f_type);
2594                 return -ENOMEDIUM;
2595         }
2596
2597         return 0;
2598 }
2599
2600 int cg_unified_controller(const char *controller) {
2601         int r;
2602
2603         r = cg_unified_update();
2604         if (r < 0)
2605                 return r;
2606
2607         if (unified_cache == CGROUP_UNIFIED_NONE)
2608                 return false;
2609
2610         if (unified_cache >= CGROUP_UNIFIED_ALL)
2611                 return true;
2612
2613 #if 0 /// only if elogind is the controller we can use cgroups2 in hybrid mode
2614         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2615 #else
2616         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID);
2617 #endif // 0
2618 }
2619
2620 int cg_all_unified(void) {
2621         int r;
2622
2623         r = cg_unified_update();
2624         if (r < 0)
2625                 return r;
2626
2627         return unified_cache >= CGROUP_UNIFIED_ALL;
2628 }
2629
2630 int cg_hybrid_unified(void) {
2631         int r;
2632
2633         r = cg_unified_update();
2634         if (r < 0)
2635                 return r;
2636
2637         return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2638 }
2639
2640 int cg_unified_flush(void) {
2641         unified_cache = CGROUP_UNIFIED_UNKNOWN;
2642
2643         return cg_unified_update();
2644 }
2645
2646 #if 0 /// UNNEEDED by elogind
2647 int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
2648         _cleanup_fclose_ FILE *f = NULL;
2649         _cleanup_free_ char *fs = NULL;
2650         CGroupController c;
2651         int r;
2652
2653         assert(p);
2654
2655         if (supported == 0)
2656                 return 0;
2657
2658         r = cg_all_unified();
2659         if (r < 0)
2660                 return r;
2661         if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */
2662                 return 0;
2663
2664         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2665         if (r < 0)
2666                 return r;
2667
2668         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2669                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2670                 const char *n;
2671
2672                 if (!(supported & bit))
2673                         continue;
2674
2675                 n = cgroup_controller_to_string(c);
2676                 {
2677                         char s[1 + strlen(n) + 1];
2678
2679                         s[0] = mask & bit ? '+' : '-';
2680                         strcpy(s + 1, n);
2681
2682                         if (!f) {
2683                                 f = fopen(fs, "we");
2684                                 if (!f) {
2685                                         log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
2686                                         break;
2687                                 }
2688                         }
2689
2690                         r = write_string_stream(f, s, 0);
2691                         if (r < 0)
2692                                 log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
2693                 }
2694         }
2695
2696         return 0;
2697 }
2698 #endif // 0
2699
2700 bool cg_is_unified_wanted(void) {
2701         static thread_local int wanted = -1;
2702         int r;
2703         bool b;
2704         const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2705
2706         /* If we have a cached value, return that. */
2707         if (wanted >= 0)
2708                 return wanted;
2709
2710         /* If the hierarchy is already mounted, then follow whatever
2711          * was chosen for it. */
2712         if (cg_unified_flush() >= 0)
2713                 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2714
2715 #if 0 /// elogind is not init and has no business with kernel command line
2716         /* Otherwise, let's see what the kernel command line has to say.
2717          * Since checking is expensive, cache a non-error result. */
2718         r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2719 #endif // 0
2720
2721         return (wanted = r > 0 ? b : is_default);
2722 }
2723
2724 bool cg_is_legacy_wanted(void) {
2725         static thread_local int wanted = -1;
2726
2727         /* If we have a cached value, return that. */
2728         if (wanted >= 0)
2729                 return wanted;
2730
2731         /* Check if we have cgroups2 already mounted. */
2732         if (cg_unified_flush() >= 0 &&
2733             unified_cache == CGROUP_UNIFIED_ALL)
2734                 return (wanted = false);
2735
2736         /* Otherwise, assume that at least partial legacy is wanted,
2737          * since cgroups2 should already be mounted at this point. */
2738         return (wanted = true);
2739 }
2740
2741 bool cg_is_hybrid_wanted(void) {
2742         static thread_local int wanted = -1;
2743         int r;
2744         bool b;
2745         const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2746         /* We default to true if the default is "hybrid", obviously,
2747          * but also when the default is "unified", because if we get
2748          * called, it means that unified hierarchy was not mounted. */
2749
2750         /* If we have a cached value, return that. */
2751         if (wanted >= 0)
2752                 return wanted;
2753
2754         /* If the hierarchy is already mounted, then follow whatever
2755          * was chosen for it. */
2756         if (cg_unified_flush() >= 0 &&
2757             unified_cache == CGROUP_UNIFIED_ALL)
2758                 return (wanted = false);
2759
2760 #if 0 /// elogind is not init and has no business with kernel command line
2761         /* Otherwise, let's see what the kernel command line has to say.
2762          * Since checking is expensive, cache a non-error result. */
2763         r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2764 #endif // 0
2765
2766         /* The meaning of the kernel option is reversed wrt. to the return value
2767          * of this function, hence the negation. */
2768         return (wanted = r > 0 ? !b : is_default);
2769 }
2770
2771 #if 0 /// UNNEEDED by elogind
2772 int cg_weight_parse(const char *s, uint64_t *ret) {
2773         uint64_t u;
2774         int r;
2775
2776         if (isempty(s)) {
2777                 *ret = CGROUP_WEIGHT_INVALID;
2778                 return 0;
2779         }
2780
2781         r = safe_atou64(s, &u);
2782         if (r < 0)
2783                 return r;
2784
2785         if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2786                 return -ERANGE;
2787
2788         *ret = u;
2789         return 0;
2790 }
2791
2792 const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2793         [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
2794         [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
2795         [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
2796         [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
2797 };
2798
2799 static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2800         [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
2801         [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
2802         [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
2803         [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
2804 };
2805
2806 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2807
2808 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2809         uint64_t u;
2810         int r;
2811
2812         if (isempty(s)) {
2813                 *ret = CGROUP_CPU_SHARES_INVALID;
2814                 return 0;
2815         }
2816
2817         r = safe_atou64(s, &u);
2818         if (r < 0)
2819                 return r;
2820
2821         if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2822                 return -ERANGE;
2823
2824         *ret = u;
2825         return 0;
2826 }
2827
2828 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2829         uint64_t u;
2830         int r;
2831
2832         if (isempty(s)) {
2833                 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2834                 return 0;
2835         }
2836
2837         r = safe_atou64(s, &u);
2838         if (r < 0)
2839                 return r;
2840
2841         if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2842                 return -ERANGE;
2843
2844         *ret = u;
2845         return 0;
2846 }
2847 #endif // 0
2848
2849 bool is_cgroup_fs(const struct statfs *s) {
2850         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2851                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2852 }
2853
/* Tells whether the file descriptor refers to something on a cgroup file
 * system (see is_cgroup_fs()). If the file system cannot be determined, the
 * answer is no. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        /* The return type is bool, there is no way to report an error from
         * here: a negative errno would silently turn into "true". Hence treat
         * a descriptor we cannot query as not being on a cgroup fs. */
        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2862
2863 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2864         [CGROUP_CONTROLLER_CPU] = "cpu",
2865         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2866         [CGROUP_CONTROLLER_IO] = "io",
2867         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2868         [CGROUP_CONTROLLER_MEMORY] = "memory",
2869         [CGROUP_CONTROLLER_DEVICES] = "devices",
2870         [CGROUP_CONTROLLER_PIDS] = "pids",
2871 };
2872
2873 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);