chiark / gitweb /
core: use the unified hierarchy for the elogind cgroup controller hierarchy
[elogind.git] / src / core / cgroup.c
1 /***
2   This file is part of systemd.
3
4   Copyright 2013 Lennart Poettering
5
6   systemd is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as published by
8   the Free Software Foundation; either version 2.1 of the License, or
9   (at your option) any later version.
10
11   systemd is distributed in the hope that it will be useful, but
12   WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public License
17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <fcntl.h>
21 #include <fnmatch.h>
22
23 #include "alloc-util.h"
24 #include "cgroup-util.h"
25 #include "cgroup.h"
26 #include "fd-util.h"
27 #include "fileio.h"
28 #include "fs-util.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
32 //#include "special.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "stdio-util.h"
36
/* Enforcement period used for CFS CPU quota: the per-second quota value is
 * rescaled to this 100ms window when written to cpu.max / cpu.cfs_quota_us. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
38
39 #if 0 /// UNNEEDED by elogind
/* Emit the generic legacy/unified translation notice at most once per
 * process; details follow as per-unit "cgroup-compat" debug messages. */
static void cgroup_compat_warn(void) {
        static bool warned = false;

        if (!warned) {
                warned = true;
                log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        }
}
49
/* Log one legacy<->unified translation event: raise the one-time
 * compatibility warning, then log the per-unit details at debug level. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
54
55 void cgroup_context_init(CGroupContext *c) {
56         assert(c);
57
58         /* Initialize everything to the kernel defaults, assuming the
59          * structure is preinitialized to 0 */
60
61         c->cpu_weight = CGROUP_WEIGHT_INVALID;
62         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
63         c->cpu_quota_per_sec_usec = USEC_INFINITY;
64
65         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
66         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
67
68         c->memory_high = CGROUP_LIMIT_MAX;
69         c->memory_max = CGROUP_LIMIT_MAX;
70
71         c->memory_limit = CGROUP_LIMIT_MAX;
72
73         c->io_weight = CGROUP_WEIGHT_INVALID;
74         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
75
76         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
77         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
78
79         c->tasks_max = (uint64_t) -1;
80 }
81
82 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
83         assert(c);
84         assert(a);
85
86         LIST_REMOVE(device_allow, c->device_allow, a);
87         free(a->path);
88         free(a);
89 }
90
91 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
92         assert(c);
93         assert(w);
94
95         LIST_REMOVE(device_weights, c->io_device_weights, w);
96         free(w->path);
97         free(w);
98 }
99
100 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
101         assert(c);
102         assert(l);
103
104         LIST_REMOVE(device_limits, c->io_device_limits, l);
105         free(l->path);
106         free(l);
107 }
108
109 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
110         assert(c);
111         assert(w);
112
113         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
114         free(w->path);
115         free(w);
116 }
117
118 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
119         assert(c);
120         assert(b);
121
122         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
123         free(b->path);
124         free(b);
125 }
126
127 void cgroup_context_done(CGroupContext *c) {
128         assert(c);
129
130         while (c->io_device_weights)
131                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
132
133         while (c->io_device_limits)
134                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
135
136         while (c->blockio_device_weights)
137                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
138
139         while (c->blockio_device_bandwidths)
140                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
141
142         while (c->device_allow)
143                 cgroup_context_free_device_allow(c, c->device_allow);
144 }
145
146 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
147         CGroupIODeviceLimit *il;
148         CGroupIODeviceWeight *iw;
149         CGroupBlockIODeviceBandwidth *b;
150         CGroupBlockIODeviceWeight *w;
151         CGroupDeviceAllow *a;
152         char u[FORMAT_TIMESPAN_MAX];
153
154         assert(c);
155         assert(f);
156
157         prefix = strempty(prefix);
158
159         fprintf(f,
160                 "%sCPUAccounting=%s\n"
161                 "%sIOAccounting=%s\n"
162                 "%sBlockIOAccounting=%s\n"
163                 "%sMemoryAccounting=%s\n"
164                 "%sTasksAccounting=%s\n"
165                 "%sCPUWeight=%" PRIu64 "\n"
166                 "%sStartupCPUWeight=%" PRIu64 "\n"
167                 "%sCPUShares=%" PRIu64 "\n"
168                 "%sStartupCPUShares=%" PRIu64 "\n"
169                 "%sCPUQuotaPerSecSec=%s\n"
170                 "%sIOWeight=%" PRIu64 "\n"
171                 "%sStartupIOWeight=%" PRIu64 "\n"
172                 "%sBlockIOWeight=%" PRIu64 "\n"
173                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
174                 "%sMemoryLow=%" PRIu64 "\n"
175                 "%sMemoryHigh=%" PRIu64 "\n"
176                 "%sMemoryMax=%" PRIu64 "\n"
177                 "%sMemoryLimit=%" PRIu64 "\n"
178                 "%sTasksMax=%" PRIu64 "\n"
179                 "%sDevicePolicy=%s\n"
180                 "%sDelegate=%s\n",
181                 prefix, yes_no(c->cpu_accounting),
182                 prefix, yes_no(c->io_accounting),
183                 prefix, yes_no(c->blockio_accounting),
184                 prefix, yes_no(c->memory_accounting),
185                 prefix, yes_no(c->tasks_accounting),
186                 prefix, c->cpu_weight,
187                 prefix, c->startup_cpu_weight,
188                 prefix, c->cpu_shares,
189                 prefix, c->startup_cpu_shares,
190                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
191                 prefix, c->io_weight,
192                 prefix, c->startup_io_weight,
193                 prefix, c->blockio_weight,
194                 prefix, c->startup_blockio_weight,
195                 prefix, c->memory_low,
196                 prefix, c->memory_high,
197                 prefix, c->memory_max,
198                 prefix, c->memory_limit,
199                 prefix, c->tasks_max,
200                 prefix, cgroup_device_policy_to_string(c->device_policy),
201                 prefix, yes_no(c->delegate));
202
203         LIST_FOREACH(device_allow, a, c->device_allow)
204                 fprintf(f,
205                         "%sDeviceAllow=%s %s%s%s\n",
206                         prefix,
207                         a->path,
208                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
209
210         LIST_FOREACH(device_weights, iw, c->io_device_weights)
211                 fprintf(f,
212                         "%sIODeviceWeight=%s %" PRIu64,
213                         prefix,
214                         iw->path,
215                         iw->weight);
216
217         LIST_FOREACH(device_limits, il, c->io_device_limits) {
218                 char buf[FORMAT_BYTES_MAX];
219                 CGroupIOLimitType type;
220
221                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
222                         if (il->limits[type] != cgroup_io_limit_defaults[type])
223                                 fprintf(f,
224                                         "%s%s=%s %s\n",
225                                         prefix,
226                                         cgroup_io_limit_type_to_string(type),
227                                         il->path,
228                                         format_bytes(buf, sizeof(buf), il->limits[type]));
229         }
230
231         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
232                 fprintf(f,
233                         "%sBlockIODeviceWeight=%s %" PRIu64,
234                         prefix,
235                         w->path,
236                         w->weight);
237
238         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
239                 char buf[FORMAT_BYTES_MAX];
240
241                 if (b->rbps != CGROUP_LIMIT_MAX)
242                         fprintf(f,
243                                 "%sBlockIOReadBandwidth=%s %s\n",
244                                 prefix,
245                                 b->path,
246                                 format_bytes(buf, sizeof(buf), b->rbps));
247                 if (b->wbps != CGROUP_LIMIT_MAX)
248                         fprintf(f,
249                                 "%sBlockIOWriteBandwidth=%s %s\n",
250                                 prefix,
251                                 b->path,
252                                 format_bytes(buf, sizeof(buf), b->wbps));
253         }
254 }
255
/* Resolves a path to the block device it lives on: a block device node
 * maps to itself; a regular file maps to the device backing its file
 * system, resolved to the whole disk if it is a partition. Returns 0 on
 * success, negative errno on failure. */
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                /* Virtual/remote file system: no usable backing device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: take the file system's backing device, and if
         * that is a partition, try to resolve the originating whole disk. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
284
285 static int whitelist_device(const char *path, const char *node, const char *acc) {
286         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
287         struct stat st;
288         int r;
289
290         assert(path);
291         assert(acc);
292
293         if (stat(node, &st) < 0) {
294                 log_warning("Couldn't stat device %s", node);
295                 return -errno;
296         }
297
298         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
299                 log_warning("%s is not a device.", node);
300                 return -ENODEV;
301         }
302
303         sprintf(buf,
304                 "%c %u:%u %s",
305                 S_ISCHR(st.st_mode) ? 'c' : 'b',
306                 major(st.st_rdev), minor(st.st_rdev),
307                 acc);
308
309         r = cg_set_attribute("devices", path, "devices.allow", buf);
310         if (r < 0)
311                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
312                                "Failed to set devices.allow on %s: %m", path);
313
314         return r;
315 }
316
/* Whitelists every device major whose driver name in /proc/devices matches
 * the glob pattern 'name'. 'type' selects the "Character devices:" ('c') or
 * "Block devices:" ('b') section; matching majors are added to
 * devices.allow with the full minor range ("maj:*") and access string
 * 'acc'. Per-major write failures are logged but do not abort the scan.
 * Returns 0 on success, negative errno if /proc/devices cannot be read. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* Section headers toggle matching on for the requested type... */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* ...and an empty line ends the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                /* Each entry is "<major> <name>"; split at the first whitespace. */
                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                /* Advance past the whitespace to the driver name. */
                w++;
                w += strspn(w, WHITESPACE);

                /* 'name' may contain shell-style globs, hence fnmatch(). */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
392
393 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
394         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
395                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
396 }
397
398 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
399         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
400                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
401 }
402
403 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
404         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
405             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
406                 return c->startup_cpu_weight;
407         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
408                 return c->cpu_weight;
409         else
410                 return CGROUP_WEIGHT_DEFAULT;
411 }
412
413 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
414         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
415             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
416                 return c->startup_cpu_shares;
417         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
418                 return c->cpu_shares;
419         else
420                 return CGROUP_CPU_SHARES_DEFAULT;
421 }
422
/* Applies CPU weight and quota on the unified (cgroup v2) hierarchy by
 * writing the "cpu.weight" and "cpu.max" attributes. Errors expected in
 * containers (ENOENT/EROFS/EACCES) are only logged at debug level. */
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        /* Buffer must hold either one weight value or the "quota period"
         * pair written to cpu.max, whichever is larger. */
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        /* quota is expressed per second; rescale it to the 100ms period.
         * USEC_INFINITY means no quota, i.e. "max". */
        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);

        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
445
/* Applies CPU shares and quota on the legacy (cgroup v1) "cpu" controller
 * by writing cpu.shares and the CFS period/quota pair. Errors expected in
 * containers (ENOENT/EROFS/EACCES) are only logged at debug level. */
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        /* The enforcement period is fixed at CGROUP_CPU_QUOTA_PERIOD_USEC (100ms). */
        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        /* quota is per-second; rescale to the period. "-1" disables the quota. */
        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}
471
472 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
473         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
474                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
475 }
476
477 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
478         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
479                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
480 }
481
482 static bool cgroup_context_has_io_config(CGroupContext *c) {
483         return c->io_accounting ||
484                 c->io_weight != CGROUP_WEIGHT_INVALID ||
485                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
486                 c->io_device_weights ||
487                 c->io_device_limits;
488 }
489
490 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
491         return c->blockio_accounting ||
492                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
493                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
494                 c->blockio_device_weights ||
495                 c->blockio_device_bandwidths;
496 }
497
498 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
499         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
500             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
501                 return c->startup_io_weight;
502         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
503                 return c->io_weight;
504         else
505                 return CGROUP_WEIGHT_DEFAULT;
506 }
507
508 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
509         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
510             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
511                 return c->startup_blockio_weight;
512         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
513                 return c->blockio_weight;
514         else
515                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
516 }
517
518 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
519         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
520                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
521 }
522
523 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
524         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
525                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
526 }
527
528 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
529         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
530         dev_t dev;
531         int r;
532
533         r = lookup_block_device(dev_path, &dev);
534         if (r < 0)
535                 return;
536
537         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
538         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
539         if (r < 0)
540                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
541                               "Failed to set io.weight: %m");
542 }
543
544 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
545         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
546         dev_t dev;
547         int r;
548
549         r = lookup_block_device(dev_path, &dev);
550         if (r < 0)
551                 return;
552
553         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
554         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
555         if (r < 0)
556                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
557                               "Failed to set blkio.weight_device: %m");
558 }
559
/* Writes the per-device IO limits for dev_path to the unified "io.max"
 * attribute. Returns the number of limits that differ from the kernel
 * defaults, so the caller can free entries that have become all-default. */
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        /* Format each limit: non-default values are counted; default values
         * are written back as "max" (or "0") to reset them explicitly. */
        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else {
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
                }
        }

        /* io.max entry format: "major:minor rbps=... wbps=... riops=... wiops=..." */
        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}
590
591 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
592         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
593         dev_t dev;
594         unsigned n = 0;
595         int r;
596
597         r = lookup_block_device(dev_path, &dev);
598         if (r < 0)
599                 return 0;
600
601         if (rbps != CGROUP_LIMIT_MAX)
602                 n++;
603         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
604         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
605         if (r < 0)
606                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
607                               "Failed to set blkio.throttle.read_bps_device: %m");
608
609         if (wbps != CGROUP_LIMIT_MAX)
610                 n++;
611         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
612         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
613         if (r < 0)
614                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
615                               "Failed to set blkio.throttle.write_bps_device: %m");
616
617         return n;
618 }
619
620 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
621         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX;
622 }
623
/* Writes one unified-hierarchy memory attribute (e.g. memory.low,
 * memory.high, memory.max); CGROUP_LIMIT_MAX is translated to the special
 * value "max". */
static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
636
637 static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) {
638         const char *path;
639         CGroupContext *c;
640         bool is_root;
641         int r;
642
643         assert(u);
644
645         c = unit_get_cgroup_context(u);
646         path = u->cgroup_path;
647
648         assert(c);
649         assert(path);
650
651         if (mask == 0)
652                 return;
653
654         /* Some cgroup attributes are not supported on the root cgroup,
655          * hence silently ignore */
656         is_root = isempty(path) || path_equal(path, "/");
657         if (is_root)
658                 /* Make sure we don't try to display messages with an empty path. */
659                 path = "/";
660
661         /* We generally ignore errors caused by read-only mounted
662          * cgroup trees (assuming we are running in a container then),
663          * and missing cgroups, i.e. EROFS and ENOENT. */
664
665         if ((mask & CGROUP_MASK_CPU) && !is_root) {
666                 bool has_weight = cgroup_context_has_cpu_weight(c);
667                 bool has_shares = cgroup_context_has_cpu_shares(c);
668
669                 if (cg_all_unified() > 0) {
670                         uint64_t weight;
671
672                         if (has_weight)
673                                 weight = cgroup_context_cpu_weight(c, state);
674                         else if (has_shares) {
675                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
676
677                                 weight = cgroup_cpu_shares_to_weight(shares);
678
679                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
680                                                   shares, weight, path);
681                         } else
682                                 weight = CGROUP_WEIGHT_DEFAULT;
683
684                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
685                 } else {
686                         uint64_t shares;
687
688                         if (has_shares)
689                                 shares = cgroup_context_cpu_shares(c, state);
690                         else if (has_weight) {
691                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
692
693                                 shares = cgroup_cpu_weight_to_shares(weight);
694
695                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
696                                                   weight, shares, path);
697                         } else
698                                 shares = CGROUP_CPU_SHARES_DEFAULT;
699
700                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
701                 }
702         }
703
704         if (mask & CGROUP_MASK_IO) {
705                 bool has_io = cgroup_context_has_io_config(c);
706                 bool has_blockio = cgroup_context_has_blockio_config(c);
707
708                 if (!is_root) {
709                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
710                         uint64_t weight;
711
712                         if (has_io)
713                                 weight = cgroup_context_io_weight(c, state);
714                         else if (has_blockio) {
715                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
716
717                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
718
719                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
720                                                   blkio_weight, weight);
721                         } else
722                                 weight = CGROUP_WEIGHT_DEFAULT;
723
724                         xsprintf(buf, "default %" PRIu64 "\n", weight);
725                         r = cg_set_attribute("io", path, "io.weight", buf);
726                         if (r < 0)
727                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
728                                               "Failed to set io.weight: %m");
729
730                         if (has_io) {
731                                 CGroupIODeviceWeight *w;
732
733                                 /* FIXME: no way to reset this list */
734                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
735                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
736                         } else if (has_blockio) {
737                                 CGroupBlockIODeviceWeight *w;
738
739                                 /* FIXME: no way to reset this list */
740                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
741                                         weight = cgroup_weight_blkio_to_io(w->weight);
742
743                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
744                                                           w->weight, weight, w->path);
745
746                                         cgroup_apply_io_device_weight(u, w->path, weight);
747                                 }
748                         }
749                 }
750
751                 /* Apply limits and free ones without config. */
752                 if (has_io) {
753                         CGroupIODeviceLimit *l, *next;
754
755                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
756                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
757                                         cgroup_context_free_io_device_limit(c, l);
758                         }
759                 } else if (has_blockio) {
760                         CGroupBlockIODeviceBandwidth *b, *next;
761
762                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
763                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
764                                 CGroupIOLimitType type;
765
766                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
767                                         limits[type] = cgroup_io_limit_defaults[type];
768
769                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
770                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
771
772                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
773                                                   b->rbps, b->wbps, b->path);
774
775                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
776                                         cgroup_context_free_blockio_device_bandwidth(c, b);
777                         }
778                 }
779         }
780
781         if (mask & CGROUP_MASK_BLKIO) {
782                 bool has_io = cgroup_context_has_io_config(c);
783                 bool has_blockio = cgroup_context_has_blockio_config(c);
784
785                 if (!is_root) {
786                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
787                         uint64_t weight;
788
789                         if (has_blockio)
790                                 weight = cgroup_context_blkio_weight(c, state);
791                         else if (has_io) {
792                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
793
794                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
795
796                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
797                                                   io_weight, weight);
798                         } else
799                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
800
801                         xsprintf(buf, "%" PRIu64 "\n", weight);
802                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
803                         if (r < 0)
804                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
805                                               "Failed to set blkio.weight: %m");
806
807                         if (has_blockio) {
808                                 CGroupBlockIODeviceWeight *w;
809
810                                 /* FIXME: no way to reset this list */
811                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
812                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
813                         } else if (has_io) {
814                                 CGroupIODeviceWeight *w;
815
816                                 /* FIXME: no way to reset this list */
817                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
818                                         weight = cgroup_weight_io_to_blkio(w->weight);
819
820                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
821                                                           w->weight, weight, w->path);
822
823                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
824                                 }
825                         }
826                 }
827
828                 /* Apply limits and free ones without config. */
829                 if (has_blockio) {
830                         CGroupBlockIODeviceBandwidth *b, *next;
831
832                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
833                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
834                                         cgroup_context_free_blockio_device_bandwidth(c, b);
835                         }
836                 } else if (has_io) {
837                         CGroupIODeviceLimit *l, *next;
838
839                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
840                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
841                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
842
843                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
844                                         cgroup_context_free_io_device_limit(c, l);
845                         }
846                 }
847         }
848
849         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
850                 if (cg_all_unified() > 0) {
851                         uint64_t max = c->memory_max;
852
853                         if (cgroup_context_has_unified_memory_config(c))
854                                 max = c->memory_max;
855                         else {
856                                 max = c->memory_limit;
857
858                                 if (max != CGROUP_LIMIT_MAX)
859                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
860                         }
861
862                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
863                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
864                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
865                 } else {
866                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
867                         uint64_t val = c->memory_limit;
868
869                         if (val == CGROUP_LIMIT_MAX) {
870                                 val = c->memory_max;
871
872                                 if (val != CGROUP_LIMIT_MAX)
873                                         log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", c->memory_max);
874                         }
875
876                         if (val == CGROUP_LIMIT_MAX)
877                                 strncpy(buf, "-1\n", sizeof(buf));
878                         else
879                                 xsprintf(buf, "%" PRIu64 "\n", val);
880
881                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
882                         if (r < 0)
883                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
884                                               "Failed to set memory.limit_in_bytes: %m");
885                 }
886         }
887
888         if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
889                 CGroupDeviceAllow *a;
890
891                 /* Changing the devices list of a populated cgroup
892                  * might result in EINVAL, hence ignore EINVAL
893                  * here. */
894
895                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
896                         r = cg_set_attribute("devices", path, "devices.deny", "a");
897                 else
898                         r = cg_set_attribute("devices", path, "devices.allow", "a");
899                 if (r < 0)
900                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
901                                       "Failed to reset devices.list: %m");
902
903                 if (c->device_policy == CGROUP_CLOSED ||
904                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
905                         static const char auto_devices[] =
906                                 "/dev/null\0" "rwm\0"
907                                 "/dev/zero\0" "rwm\0"
908                                 "/dev/full\0" "rwm\0"
909                                 "/dev/random\0" "rwm\0"
910                                 "/dev/urandom\0" "rwm\0"
911                                 "/dev/tty\0" "rwm\0"
912                                 "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
913                                 /* Allow /run/elogind/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
914                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
915                                 "/run/systemd/inaccessible/chr\0" "rwm\0"
916                                 "/run/systemd/inaccessible/blk\0" "rwm\0";
917
918                         const char *x, *y;
919
920                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
921                                 whitelist_device(path, x, y);
922
923                         whitelist_major(path, "pts", 'c', "rw");
924                         whitelist_major(path, "kdbus", 'c', "rw");
925                         whitelist_major(path, "kdbus/*", 'c', "rw");
926                 }
927
928                 LIST_FOREACH(device_allow, a, c->device_allow) {
929                         char acc[4];
930                         unsigned k = 0;
931
932                         if (a->r)
933                                 acc[k++] = 'r';
934                         if (a->w)
935                                 acc[k++] = 'w';
936                         if (a->m)
937                                 acc[k++] = 'm';
938
939                         if (k == 0)
940                                 continue;
941
942                         acc[k++] = 0;
943
944                         if (startswith(a->path, "/dev/"))
945                                 whitelist_device(path, a->path, acc);
946                         else if (startswith(a->path, "block-"))
947                                 whitelist_major(path, a->path + 6, 'b', acc);
948                         else if (startswith(a->path, "char-"))
949                                 whitelist_major(path, a->path + 5, 'c', acc);
950                         else
951                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
952                 }
953         }
954
955         if ((mask & CGROUP_MASK_PIDS) && !is_root) {
956
957                 if (c->tasks_max != (uint64_t) -1) {
958                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
959
960                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
961                         r = cg_set_attribute("pids", path, "pids.max", buf);
962                 } else
963                         r = cg_set_attribute("pids", path, "pids.max", "max");
964
965                 if (r < 0)
966                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
967                                       "Failed to set pids.max: %m");
968         }
969 }
970
971 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
972         CGroupMask mask = 0;
973
974         /* Figure out which controllers we need */
975
976         if (c->cpu_accounting ||
977             cgroup_context_has_cpu_weight(c) ||
978             cgroup_context_has_cpu_shares(c) ||
979             c->cpu_quota_per_sec_usec != USEC_INFINITY)
980                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
981
982         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
983                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
984
985         if (c->memory_accounting ||
986             c->memory_limit != CGROUP_LIMIT_MAX ||
987             cgroup_context_has_unified_memory_config(c))
988                 mask |= CGROUP_MASK_MEMORY;
989
990         if (c->device_allow ||
991             c->device_policy != CGROUP_AUTO)
992                 mask |= CGROUP_MASK_DEVICES;
993
994         if (c->tasks_accounting ||
995             c->tasks_max != (uint64_t) -1)
996                 mask |= CGROUP_MASK_PIDS;
997
998         return mask;
999 }
1000
1001 CGroupMask unit_get_own_mask(Unit *u) {
1002         CGroupContext *c;
1003
1004         /* Returns the mask of controllers the unit needs for itself */
1005
1006         c = unit_get_cgroup_context(u);
1007         if (!c)
1008                 return 0;
1009
1010         /* If delegation is turned on, then turn on all cgroups,
1011          * unless we are on the legacy hierarchy and the process we
1012          * fork into it is known to drop privileges, and hence
1013          * shouldn't get access to the controllers.
1014          *
1015          * Note that on the unified hierarchy it is safe to delegate
1016          * controllers to unprivileged services. */
1017
1018         if (c->delegate) {
1019                 ExecContext *e;
1020
1021                 e = unit_get_exec_context(u);
1022                 if (!e ||
1023                     exec_context_maintains_privileges(e) ||
1024                     cg_all_unified() > 0)
1025                         return _CGROUP_MASK_ALL;
1026         }
1027
1028         return cgroup_context_get_mask(c);
1029 }
1030
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        /* Use the cached value if it is still valid. */
        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        /* Only slices have members; for any other unit type the
         * members mask is empty. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* A slice's children are found among the units with a
                 * Before= dependency on it whose slice reference
                 * points back at this slice. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        /* Skip units that depend on the slice but
                         * aren't actually placed in it. */
                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Merge the child's own needs and, recursively,
                         * those of its descendants. */
                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
1063
1064 CGroupMask unit_get_siblings_mask(Unit *u) {
1065         assert(u);
1066
1067         /* Returns the mask of controllers all of the unit's siblings
1068          * require, i.e. the members mask of the unit's parent slice
1069          * if there is one. */
1070
1071         if (UNIT_ISSET(u->slice))
1072                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1073
1074         return unit_get_own_mask(u) | unit_get_members_mask(u);
1075 }
1076
1077 CGroupMask unit_get_subtree_mask(Unit *u) {
1078
1079         /* Returns the mask of this subtree, meaning of the group
1080          * itself and its children. */
1081
1082         return unit_get_own_mask(u) | unit_get_members_mask(u);
1083 }
1084
1085 CGroupMask unit_get_target_mask(Unit *u) {
1086         CGroupMask mask;
1087
1088         /* This returns the cgroup mask of all controllers to enable
1089          * for a specific cgroup, i.e. everything it needs itself,
1090          * plus all that its children need, plus all that its siblings
1091          * need. This is primarily useful on the legacy cgroup
1092          * hierarchy, where we need to duplicate each cgroup in each
1093          * hierarchy that shall be enabled for it. */
1094
1095         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1096         mask &= u->manager->cgroup_supported;
1097
1098         return mask;
1099 }
1100
1101 CGroupMask unit_get_enable_mask(Unit *u) {
1102         CGroupMask mask;
1103
1104         /* This returns the cgroup mask of all controllers to enable
1105          * for the children of a specific cgroup. This is primarily
1106          * useful for the unified cgroup hierarchy, where each cgroup
1107          * controls which controllers are enabled for its children. */
1108
1109         mask = unit_get_members_mask(u);
1110         mask &= u->manager->cgroup_supported;
1111
1112         return mask;
1113 }
1114
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only if the new mask is a strict superset of
         * the cached one: at least one bit was added and none was
         * removed. Only in that case can the parent's cached members
         * mask be updated in place by OR-ing; any other change forces
         * a full recomputation via invalidation below. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
1163
1164 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1165         Unit *u = userdata;
1166
1167         assert(mask != 0);
1168         assert(u);
1169
1170         while (u) {
1171                 if (u->cgroup_path &&
1172                     u->cgroup_realized &&
1173                     (u->cgroup_realized_mask & mask) == mask)
1174                         return u->cgroup_path;
1175
1176                 u = UNIT_DEREF(u->slice);
1177         }
1178
1179         return NULL;
1180 }
1181
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        /* Computes the default cgroup path for the unit. Returns a
         * newly allocated string, or NULL on allocation failure or if
         * the containing slice name cannot be translated. Caller owns
         * the returned string. */

        /* The root slice lives directly at the cgroup root. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        /* Translate the containing slice (if any, and not the root
         * slice) into its path component. */
        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        /* Escape the unit name so it is safe as a cgroup path component. */
        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
1206
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        /* Sets the unit's cgroup path to a copy of 'path' (or clears
         * it if 'path' is NULL), keeping the manager's path→unit map
         * in sync. Returns 1 if the path changed, 0 if it was already
         * set to 'path', negative errno on error. */

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        /* Register the new mapping first, so that on failure the old
         * state is left fully intact. */
        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        /* Drop the old path and its associated bookkeeping. */
        unit_release_cgroup(u);

        /* Ownership of 'p' (also used as the hashmap key above) moves
         * to the unit; clear the local so _cleanup_free_ doesn't free it. */
        u->cgroup_path = p;
        p = NULL;

        return 1;
}
1236
1237 int unit_watch_cgroup(Unit *u) {
1238         _cleanup_free_ char *events = NULL;
1239         int r;
1240
1241         assert(u);
1242
1243         if (!u->cgroup_path)
1244                 return 0;
1245
1246         if (u->cgroup_inotify_wd >= 0)
1247                 return 0;
1248
1249         /* Only applies to the unified hierarchy */
1250         r = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
1251         if (r < 0)
1252                 return log_unit_error_errno(u, r, "Failed detect whether the unified hierarchy is used: %m");
1253         if (r == 0)
1254                 return 0;
1255
1256         /* Don't watch the root slice, it's pointless. */
1257         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1258                 return 0;
1259
1260         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1261         if (r < 0)
1262                 return log_oom();
1263
1264         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1265         if (r < 0)
1266                 return log_oom();
1267
1268         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1269         if (u->cgroup_inotify_wd < 0) {
1270
1271                 if (errno == ENOENT) /* If the directory is already
1272                                       * gone we don't need to track
1273                                       * it, so this is not an error */
1274                         return 0;
1275
1276                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1277         }
1278
1279         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1280         if (r < 0)
1281                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1282
1283         return 0;
1284 }
1285
1286 static int unit_create_cgroup(
1287                 Unit *u,
1288                 CGroupMask target_mask,
1289                 CGroupMask enable_mask) {
1290
1291         CGroupContext *c;
1292         int r;
1293
1294         assert(u);
1295
1296         c = unit_get_cgroup_context(u);
1297         if (!c)
1298                 return 0;
1299
1300         if (!u->cgroup_path) {
1301                 _cleanup_free_ char *path = NULL;
1302
1303                 path = unit_default_cgroup_path(u);
1304                 if (!path)
1305                         return log_oom();
1306
1307                 r = unit_set_cgroup_path(u, path);
1308                 if (r == -EEXIST)
1309                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1310                 if (r < 0)
1311                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1312         }
1313
1314         /* First, create our own group */
1315         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1316         if (r < 0)
1317                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1318
1319         /* Start watching it */
1320         (void) unit_watch_cgroup(u);
1321
1322         /* Enable all controllers we need */
1323         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1324         if (r < 0)
1325                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1326
1327         /* Keep track that this is now realized */
1328         u->cgroup_realized = true;
1329         u->cgroup_realized_mask = target_mask;
1330         u->cgroup_enabled_mask = enable_mask;
1331
1332         if (u->type != UNIT_SLICE && !c->delegate) {
1333
1334                 /* Then, possibly move things over, but not if
1335                  * subgroups may contain processes, which is the case
1336                  * for slice and delegation units. */
1337                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1338                 if (r < 0)
1339                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1340         }
1341
1342         return 0;
1343 }
1344
1345 int unit_attach_pids_to_cgroup(Unit *u) {
1346         int r;
1347         assert(u);
1348
1349         r = unit_realize_cgroup(u);
1350         if (r < 0)
1351                 return r;
1352
1353         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1354         if (r < 0)
1355                 return r;
1356
1357         return 0;
1358 }
1359
1360 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) {
1361         assert(u);
1362
1363         return u->cgroup_realized && u->cgroup_realized_mask == target_mask && u->cgroup_enabled_mask == enable_mask;
1364 }
1365
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        /* We are realizing the unit right now, so drop it from the
         * queue of pending realizations. Removing it here is also
         * what makes the dispatch loop in
         * manager_dispatch_cgroup_queue() terminate. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);

        /* Nothing to do if everything is already in place. */
        if (unit_has_mask_realized(u, target_mask, enable_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, state);

        return 0;
}
1406
1407 static void unit_add_to_cgroup_queue(Unit *u) {
1408
1409         if (u->in_cgroup_queue)
1410                 return;
1411
1412         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1413         u->in_cgroup_queue = true;
1414 }
1415
unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        /* Realizes the cgroups of all units queued via
         * unit_add_to_cgroup_queue(). Returns the number of units
         * processed. Failures are logged and skipped, not fatal. */

        state = manager_state(m);

        /* Note: unit_realize_cgroup_now() removes 'i' from the queue,
         * which is what makes this loop make progress and terminate. */
        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
1436
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* A slice's children are found among the units with a
                 * Before= dependency on it. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m), unit_get_enable_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and repeat for the parent's siblings. */
                u = slice;
        }
}
1474
1475 int unit_realize_cgroup(Unit *u) {
1476         assert(u);
1477
1478         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1479                 return 0;
1480
1481         /* So, here's the deal: when realizing the cgroups for this
1482          * unit, we need to first create all parents, but there's more
1483          * actually: for the weight-based controllers we also need to
1484          * make sure that all our siblings (i.e. units that are in the
1485          * same slice as we are) have cgroups, too. Otherwise, things
1486          * would become very uneven as each of their processes would
1487          * get as much resources as all our group together. This call
1488          * will synchronously create the parent cgroups, but will
1489          * defer work on the siblings to the next event loop
1490          * iteration. */
1491
1492         /* Add all sibling slices to the cgroup queue. */
1493         unit_queue_siblings(u);
1494
1495         /* And realize this one now (and apply the values) */
1496         return unit_realize_cgroup_now(u, manager_state(u->manager));
1497 }
1498
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                /* Drop the path→unit mapping before freeing the path
                 * string the hashmap key points to. */
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                /* Removal may fail if the cgroup is already gone; that
                 * is expected, hence only debug-log it. */
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
1517
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        /* Trim the hierarchy; for the root slice only prune empty
         * children, never the cgroup itself. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        /* The root slice keeps its path and bookkeeping. */
        if (is_root_slice)
                return;

        /* Forget path, hashmap entries and inotify watch. */
        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}
1546
1547 int unit_search_main_pid(Unit *u, pid_t *ret) {
1548         _cleanup_fclose_ FILE *f = NULL;
1549         pid_t pid = 0, npid, mypid;
1550         int r;
1551
1552         assert(u);
1553         assert(ret);
1554
1555         if (!u->cgroup_path)
1556                 return -ENXIO;
1557
1558         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1559         if (r < 0)
1560                 return r;
1561
1562         mypid = getpid();
1563         while (cg_read_pid(f, &npid) > 0)  {
1564                 pid_t ppid;
1565
1566                 if (npid == pid)
1567                         continue;
1568
1569                 /* Ignore processes that aren't our kids */
1570                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1571                         continue;
1572
1573                 if (pid != 0)
1574                         /* Dang, there's more than one daemonized PID
1575                         in this group, so we don't know what process
1576                         is the main process. */
1577
1578                         return -ENODATA;
1579
1580                 pid = npid;
1581         }
1582
1583         *ret = pid;
1584         return 0;
1585 }
1586
/* Recursively add every PID found in the cgroup at @path — and in all
 * of its sub-cgroups — to the set of PIDs watched for unit @u.
 *
 * Error handling is best-effort: the first error encountered is
 * remembered in `ret` but enumeration continues, so one failing
 * subgroup does not prevent the others from being watched. The only
 * hard stop is -ENOMEM. Returns 0 on full success, otherwise the
 * first error seen. */
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        /* First pass: watch every process directly in this cgroup. */
        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                /* r < 0 here means cg_read_pid() failed mid-iteration
                 * (r == 0 is normal EOF). */
                if (r < 0 && ret >= 0)
                        ret = r;
        }

        /* Second pass: recurse into each child cgroup. */
        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        /* Build "<path>/<fn>"; fn is owned by us and
                         * must be freed regardless of join success. */
                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                /* As above: r < 0 means the subgroup iteration itself failed. */
                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
1638
1639 int unit_watch_all_pids(Unit *u) {
1640         assert(u);
1641
1642         /* Adds all PIDs from our cgroup to the set of PIDs we
1643          * watch. This is a fallback logic for cases where we do not
1644          * get reliable cgroup empty notifications: we try to use
1645          * SIGCHLD as replacement. */
1646
1647         if (!u->cgroup_path)
1648                 return -ENOENT;
1649
1650         if (cg_unified(SYSTEMD_CGROUP_CONTROLLER) > 0) /* On unified we can use proper notifications */
1651                 return 0;
1652
1653         return unit_watch_pids_in_path(u, u->cgroup_path);
1654 }
1655
1656 int unit_notify_cgroup_empty(Unit *u) {
1657         int r;
1658
1659         assert(u);
1660
1661         if (!u->cgroup_path)
1662                 return 0;
1663
1664         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1665         if (r <= 0)
1666                 return r;
1667
1668         unit_add_to_gc_queue(u);
1669
1670         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1671                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1672
1673         return 0;
1674 }
1675
/* sd-event I/O callback for the manager's cgroup inotify fd (unified
 * hierarchy only). Drains all pending inotify events and forwards each
 * one as a cgroup-empty notification to the unit that owns the watch.
 * Returns 0 once the fd is drained, or a negative errno on read error. */
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        /* Loop until read() reports EAGAIN — the fd is non-blocking,
         * so that is how we detect the queue has been drained. */
        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
1719 #endif // 0
1720
/* Initialize the manager's cgroup environment: determine the cgroup
 * we are running in, set up empty-cgroup notification (inotify on the
 * unified hierarchy, release agent on the legacy one), create and join
 * our own scope group, pin the cgroup file system against unmounting,
 * and probe which controllers are supported. Returns 0 on success or
 * a negative errno. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, all_unified, systemd_unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        /* Probe both "everything unified" and "only the elogind/systemd
         * hierarchy unified" — the setups below differ between them. */
        all_unified = cg_all_unified();
        systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);

        if (all_unified < 0 || systemd_unified < 0)
                return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
                                       "Couldn't determine if we are running in the unified hierarchy: %m");

        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else if (systemd_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (systemd_unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                         * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

#else
                        /* elogind hard-fails here: the unified hierarchy path
                         * is compiled out above. */
                        return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0
                } else if (MANAGER_IS_SYSTEM(m)) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty. */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

#if 0 /// elogind is not meant to run in systemd init scope
                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
                /* elogind: pick the scope group depending on whether we own a
                 * dedicated "name=elogind" controller or must nest under the
                 * current root. */
                if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                        // we are our own cgroup controller
                        scope_path = strjoina("");
                else if (streq(m->cgroup_root, "/elogind"))
                        // root already is our cgroup
                        scope_path = strjoina(m->cgroup_root);
                else
                        // we have to create our own group
                        scope_path = strjoina(m->cgroup_root, "/elogind");
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
                log_debug_elogind("Created control group \"%s\"", scope_path);

                /* also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6.  Always enable hierarchical support if it exists... */
                if (!all_unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
1875
/* Tear down the manager's cgroup state: optionally trim our cgroup
 * (it cannot be fully deleted because we live in it), release the
 * inotify machinery, unpin the cgroup fs and free the stored root
 * path. Safe to call multiple times. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        /* Drop the fd that pinned the cgroup fs mount in manager_setup_cgroup() */
        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
1895
1896 #if 0 /// UNNEEDED by elogind
1897 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1898         char *p;
1899         Unit *u;
1900
1901         assert(m);
1902         assert(cgroup);
1903
1904         u = hashmap_get(m->cgroup_unit, cgroup);
1905         if (u)
1906                 return u;
1907
1908         p = strdupa(cgroup);
1909         for (;;) {
1910                 char *e;
1911
1912                 e = strrchr(p, '/');
1913                 if (!e || e == p)
1914                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1915
1916                 *e = 0;
1917
1918                 u = hashmap_get(m->cgroup_unit, p);
1919                 if (u)
1920                         return u;
1921         }
1922 }
1923
1924 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1925         _cleanup_free_ char *cgroup = NULL;
1926         int r;
1927
1928         assert(m);
1929
1930         if (pid <= 0)
1931                 return NULL;
1932
1933         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1934         if (r < 0)
1935                 return NULL;
1936
1937         return manager_get_unit_by_cgroup(m, cgroup);
1938 }
1939
1940 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1941         Unit *u;
1942
1943         assert(m);
1944
1945         if (pid <= 0)
1946                 return NULL;
1947
1948         if (pid == 1)
1949                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1950
1951         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1952         if (u)
1953                 return u;
1954
1955         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1956         if (u)
1957                 return u;
1958
1959         return manager_get_unit_by_pid_cgroup(m, pid);
1960 }
1961 #endif // 0
1962
1963 #if 0 /// elogind must substitute this with its own variant
1964 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1965         Unit *u;
1966
1967         assert(m);
1968         assert(cgroup);
1969
1970         log_debug("Got cgroup empty notification for: %s", cgroup);
1971
1972         u = manager_get_unit_by_cgroup(m, cgroup);
1973         if (!u)
1974                 return 0;
1975
1976         return unit_notify_cgroup_empty(u);
1977 }
1978 #else
1979 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1980         Session *s;
1981
1982         assert(m);
1983         assert(cgroup);
1984
1985         log_debug("Got cgroup empty notification for: %s", cgroup);
1986
1987         s = hashmap_get(m->sessions, cgroup);
1988
1989         if (s) {
1990                 session_finalize(s);
1991                 session_free(s);
1992         } else
1993                 log_warning("Session not found: %s", cgroup);
1994
1995         return 0;
1996 }
1997 #endif // 0
1998
1999 #if 0 /// UNNEEDED by elogind
2000 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2001         _cleanup_free_ char *v = NULL;
2002         int r;
2003
2004         assert(u);
2005         assert(ret);
2006
2007         if (!u->cgroup_path)
2008                 return -ENODATA;
2009
2010         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2011                 return -ENODATA;
2012
2013         if (cg_all_unified() <= 0)
2014                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2015         else
2016                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2017         if (r == -ENOENT)
2018                 return -ENODATA;
2019         if (r < 0)
2020                 return r;
2021
2022         return safe_atou64(v, ret);
2023 }
2024
2025 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2026         _cleanup_free_ char *v = NULL;
2027         int r;
2028
2029         assert(u);
2030         assert(ret);
2031
2032         if (!u->cgroup_path)
2033                 return -ENODATA;
2034
2035         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2036                 return -ENODATA;
2037
2038         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2039         if (r == -ENOENT)
2040                 return -ENODATA;
2041         if (r < 0)
2042                 return r;
2043
2044         return safe_atou64(v, ret);
2045 }
2046
2047 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2048         _cleanup_free_ char *v = NULL;
2049         uint64_t ns;
2050         int r;
2051
2052         assert(u);
2053         assert(ret);
2054
2055         if (!u->cgroup_path)
2056                 return -ENODATA;
2057
2058         if (cg_all_unified() > 0) {
2059                 const char *keys[] = { "usage_usec", NULL };
2060                 _cleanup_free_ char *val = NULL;
2061                 uint64_t us;
2062
2063                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2064                         return -ENODATA;
2065
2066                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2067                 if (r < 0)
2068                         return r;
2069
2070                 r = safe_atou64(val, &us);
2071                 if (r < 0)
2072                         return r;
2073
2074                 ns = us * NSEC_PER_USEC;
2075         } else {
2076                 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2077                         return -ENODATA;
2078
2079                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2080                 if (r == -ENOENT)
2081                         return -ENODATA;
2082                 if (r < 0)
2083                         return r;
2084
2085                 r = safe_atou64(v, &ns);
2086                 if (r < 0)
2087                         return r;
2088         }
2089
2090         *ret = ns;
2091         return 0;
2092 }
2093
2094 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2095         nsec_t ns;
2096         int r;
2097
2098         r = unit_get_cpu_usage_raw(u, &ns);
2099         if (r < 0)
2100                 return r;
2101
2102         if (ns > u->cpu_usage_base)
2103                 ns -= u->cpu_usage_base;
2104         else
2105                 ns = 0;
2106
2107         *ret = ns;
2108         return 0;
2109 }
2110
2111 int unit_reset_cpu_usage(Unit *u) {
2112         nsec_t ns;
2113         int r;
2114
2115         assert(u);
2116
2117         r = unit_get_cpu_usage_raw(u, &ns);
2118         if (r < 0) {
2119                 u->cpu_usage_base = 0;
2120                 return r;
2121         }
2122
2123         u->cpu_usage_base = ns;
2124         return 0;
2125 }
2126
2127 bool unit_cgroup_delegate(Unit *u) {
2128         CGroupContext *c;
2129
2130         assert(u);
2131
2132         c = unit_get_cgroup_context(u);
2133         if (!c)
2134                 return false;
2135
2136         return c->delegate;
2137 }
2138
2139 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2140         assert(u);
2141
2142         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2143                 return;
2144
2145         if (m == 0)
2146                 return;
2147
2148         /* always invalidate compat pairs together */
2149         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2150                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2151
2152         if ((u->cgroup_realized_mask & m) == 0)
2153                 return;
2154
2155         u->cgroup_realized_mask &= ~m;
2156         unit_add_to_cgroup_queue(u);
2157 }
2158
2159 void manager_invalidate_startup_units(Manager *m) {
2160         Iterator i;
2161         Unit *u;
2162
2163         assert(m);
2164
2165         SET_FOREACH(u, m->startup_units, i)
2166                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2167 }
2168
/* String names for the CGroupDevicePolicy enum values; consumed by the
 * DEFINE_STRING_TABLE_LOOKUP macro below to generate the
 * cgroup_device_policy_to_string()/_from_string() converters. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
2176 #endif // 0