1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
27 #include "alloc-util.h"
28 //#include "bus-util.h"
29 #include "cgroup-util.h"
30 //#include "dev-setup.h"
31 //#include "efivars.h"
37 //#include "missing.h"
39 #include "mount-setup.h"
40 #include "mount-util.h"
41 #include "path-util.h"
43 //#include "smack-util.h"
45 #include "user-util.h"
49 /// Additional includes needed by elogind
50 #include "string-util.h"
/* Per-entry behaviour flags for the mount table. The visible values are
 * distinct bits, combined with '|' in the table and tested with '&' in
 * mount_one().
 * NOTE(review): only part of the enumerator list is visible in this view;
 * MNT_NONE and MNT_FATAL are used elsewhere in this file. */
52 typedef enum MountMode {
/* mount_one() consults detect_container() only for entries lacking this
 * bit, i.e. entries carrying it are not subject to the container check. */
55 MNT_IN_CONTAINER = 1 << 1,
/* After mounting, mount_one() tests access(where, W_OK) and, when that
 * fails, unmounts the file system and removes the mount point again. */
56 MNT_CHECK_WRITABLE = 1 << 2,
/* Description of one file system to mount; the element type of mount_table.
 * NOTE(review): most members are not visible in this view. Code in this
 * file also uses .where, .type, .flags and .mode. */
59 typedef struct MountPoint {
/* Optional predicate without arguments. mount_one() calls it when it is
 * non-NULL and takes an early branch on a false result (presumably the
 * entry is skipped — confirm against the full source). */
65 bool (*condition_fn)(void);
69 /* The first three entries we might need before SELinux is up. The
70 * fourth (securityfs) is needed by IMA to load a custom policy. The
71 * other ones we can delay until SELinux and IMA are loaded. When
72 * SMACK is enabled we need smackfs, too, so it's a fifth one. */
/* N_EARLY_MOUNT is the number of leading mount_table entries that
 * mount_setup_early() hands to mount_points_setup().
 * NOTE(review): two alternative definitions follow; the preprocessor
 * conditional choosing between them is not visible in this view —
 * presumably it keys on SMACK support, matching the comment above. */
74 #define N_EARLY_MOUNT 5
76 #define N_EARLY_MOUNT 4
/* Table of API file systems. mount_points_setup() walks it from index 0
 * upwards and hands each entry to mount_one(); mount_point_is_api() matches
 * paths against the entries' .where member.
 *
 * Each complete entry spans two lines: a source string, the mount point, the
 * file system type, an option string (or NULL) and MS_* mount flags, followed
 * by a condition function (or NULL) and MNT_* mode flags.
 * NOTE(review): the field meanings are inferred from the values, since the
 * struct definition is only partly visible — confirm. Several entries and the
 * preprocessor directives between them are incomplete in this view. */
79 static const MountPoint mount_table[] = {
80 #if 0 /// UNNEEDED by elogind
81 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
82 NULL, MNT_FATAL|MNT_IN_CONTAINER },
83 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
84 NULL, MNT_FATAL|MNT_IN_CONTAINER },
85 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
86 NULL, MNT_FATAL|MNT_IN_CONTAINER },
87 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
90 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
91 mac_smack_use, MNT_FATAL },
92 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
93 mac_smack_use, MNT_FATAL },
95 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
96 NULL, MNT_FATAL|MNT_IN_CONTAINER },
97 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
98 NULL, MNT_IN_CONTAINER },
100 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
101 mac_smack_use, MNT_FATAL },
103 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
104 NULL, MNT_FATAL|MNT_IN_CONTAINER },
/* The same cgroup2 mount point is listed twice: first with the "nsdelegate"
 * option, then without options. NOTE(review): presumably the second entry is
 * a fallback for kernels that reject the option — confirm. */
105 { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
106 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
107 { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
110 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
111 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
112 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
113 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
114 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
115 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
116 #if 0 /// UNNEEDED by elogind
117 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
118 cg_is_legacy_wanted, MNT_IN_CONTAINER },
119 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
120 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
121 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
124 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
125 is_efi_boot, MNT_NONE },
127 { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
/* elogind's own named cgroup hierarchy, listed first with the "xattr" option
 * and then without it; only the second entry is marked MNT_FATAL. */
130 { "cgroup", "/sys/fs/cgroup/elogind", "cgroup", "none,name=elogind,release_agent="SYSTEMD_CGROUP_AGENT_PATH",xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
131 cg_is_legacy_wanted, MNT_IN_CONTAINER },
132 { "cgroup", "/sys/fs/cgroup/elogind", "cgroup", "none,name=elogind,release_agent="SYSTEMD_CGROUP_AGENT_PATH, MS_NOSUID|MS_NOEXEC|MS_NODEV,
133 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
137 #if 0 /// UNNEEDED by elogind
138 /* These are API file systems that might be mounted by other software,
139 * we just list them here so that we know that we should ignore them */
/* mount_point_ignore() iterates this list with NULSTR_FOREACH.
 * NOTE(review): the path strings themselves are not visible in this view;
 * only the two category comments below remain. */
141 static const char ignore_paths[] =
142 /* SELinux file systems */
144 /* Container bind mounts */
/* Reports whether @path is one of the API mount points known to this file.
 *
 * path: path to test; compared with path_equal() against the .where member
 *       of every mount_table entry.
 *
 * When no table entry ends the search, the result is whether @path lies below
 * "/sys/fs/cgroup/" according to path_startswith().
 * NOTE(review): the statement executed on a table match is not visible in
 * this view — presumably "return true". */
149 bool mount_point_is_api(const char *path) {
152 /* Checks if this mount point is considered "API", and hence
153 * should be ignored */
155 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
156 if (path_equal(path, mount_table[i].where))
159 return path_startswith(path, "/sys/fs/cgroup/");
/* Tests @path against every element of ignore_paths using path_equal().
 *
 * path: path to test.
 *
 * NOTE(review): the return statements are not visible in this view —
 * presumably true on a match and false otherwise. */
162 bool mount_point_ignore(const char *path) {
165 NULSTR_FOREACH(i, ignore_paths)
166 if (path_equal(path, i))
/* Mounts a single mount table entry.
 *
 * p:       entry to mount; p->condition_fn, p->where, p->type and p->mode
 *          are read by the visible code.
 * relabel: NOTE(review): its use is not visible in this view — presumably it
 *          gates the label_fix() calls; confirm.
 *
 * Failures are logged at LOG_ERR for MNT_FATAL entries and at LOG_DEBUG for
 * all others. On the visible error paths only MNT_FATAL entries return a
 * negative errno-style value; other entries return 0. */
173 static int mount_one(const MountPoint *p, bool relabel) {
178 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
/* An entry whose condition function exists and returns false takes the early
 * branch here (its body is not visible in this view). */
180 if (p->condition_fn && !p->condition_fn())
183 /* Relabel first, just in case */
185 (void) label_fix(p->where, true, true);
/* -ENOENT is not treated as an error: the directory is created further down
 * with one of the mkdir_p*() calls. */
187 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
188 if (r < 0 && r != -ENOENT) {
189 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
190 return (p->mode & MNT_FATAL) ? r : 0;
195 /* Skip entries not flagged MNT_IN_CONTAINER (e.g. securityfs) in a container */
196 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
199 /* The access mode here doesn't really matter too much, since
200 * the mounted file system will take precedence anyway. */
/* NOTE(review): two alternative mkdir calls follow; the conditional that
 * selects one of them is not visible in this view. */
202 (void) mkdir_p_label(p->where, 0755);
204 (void) mkdir_p(p->where, 0755);
206 log_debug("Mounting %s to %s of type %s with options %s.",
/* NOTE(review): errno is read a second time after the log call; this relies
 * on the logging macro leaving errno untouched — confirm. */
217 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
218 return (p->mode & MNT_FATAL) ? -errno : 0;
221 /* Relabel again, since we now mounted something fresh here */
223 (void) label_fix(p->where, false, false);
225 if (p->mode & MNT_CHECK_WRITABLE) {
226 if (access(p->where, W_OK) < 0) {
/* Undo the mount and remove the directory so that a non-writable mount is
 * not left behind; both results are deliberately ignored. */
229 (void) umount(p->where);
230 (void) rmdir(p->where);
232 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
233 return (p->mode & MNT_FATAL) ? r : 0;
/* Runs mount_one() on the first @n entries of mount_table, in table order.
 *
 * n:             number of leading table entries to process.
 * loaded_policy: forwarded to mount_one() as its relabel argument.
 *
 * NOTE(review): the body of the "j != 0 && r >= 0" branch and the return
 * statement are not visible in this view — presumably the first non-zero
 * result is kept in r and returned. */
240 static int mount_points_setup(unsigned n, bool loaded_policy) {
244 for (i = 0; i < n; i ++) {
247 j = mount_one(mount_table + i, loaded_policy);
248 if (j != 0 && r >= 0)
255 #if 0 /// UNNEEDED by elogind
/* Mounts only the first N_EARLY_MOUNT entries of mount_table, passing false
 * for loaded_policy, and returns the result of mount_points_setup().
 * The assert_cc() checks that N_EARLY_MOUNT does not exceed the table size.
 * This function sits behind the "#if 0" above. */
256 int mount_setup_early(void) {
257 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
259 /* Do a minimal mount of /proc and friends to enable the most
260 * basic stuff, such as SELinux */
261 return mount_points_setup(N_EARLY_MOUNT, false);
/* Mounts the kernel's cgroup controllers below "/sys/fs/cgroup/".
 *
 * join_controllers: NULL-terminated array of string lists; controllers named
 *                   in the same list are mounted together. When NULL, a
 *                   built-in default ("cpu"+"cpuacct", "net_cls"+"net_prio")
 *                   is used instead.
 *
 * The function branches early unless cg_is_legacy_wanted() is true (the
 * branch body is not visible in this view). Failures to enumerate
 * controllers, copy SMACK labels or create symlinks are logged and returned
 * through log_error_errno().
 * NOTE(review): many statements of this function, including loop headers,
 * allocation checks and the final return, are missing from this view. */
264 int mount_cgroup_controllers(char ***join_controllers) {
265 _cleanup_set_free_free_ Set *controllers = NULL;
268 if (!cg_is_legacy_wanted())
271 /* Mount all available cgroup controllers that are built into the kernel. */
273 if (!join_controllers)
275 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
277 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
278 * work for groups with no initialized attributes.
280 join_controllers = (char**[]) {
281 STRV_MAKE("cpu", "cpuacct"),
282 STRV_MAKE("net_cls", "net_prio"),
286 r = cg_kernel_controllers(&controllers);
288 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
291 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
295 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
296 .mode = MNT_IN_CONTAINER,
/* Pull the next controller name out of the set. */
300 controller = set_steal_first(controllers);
/* Look for a join list that names this controller. */
304 for (k = join_controllers; *k; k++)
305 if (strv_find(*k, controller))
311 for (i = *k, j = *k; *i; i++) {
313 if (!streq(*i, controller)) {
314 _cleanup_free_ char *t;
/* Take the other members of the join list out of the set as well.
 * NOTE(review): presumably so they are not mounted again on their own. */
316 t = set_remove(controllers, *i);
328 options = strv_join(*k, ",");
332 options = controller;
/* The mount point is "/sys/fs/cgroup/" followed by the option string. */
336 where = strappend("/sys/fs/cgroup/", options);
343 r = mount_one(&p, true);
/* Symlinks are only created when mount_one() returned a positive value and
 * a join list matched. */
347 if (r > 0 && k && *k) {
350 for (i = *k; *i; i++) {
351 _cleanup_free_ char *t = NULL;
353 t = strappend("/sys/fs/cgroup/", *i);
/* "/sys/fs/cgroup/<name>" becomes a symlink whose target is the option
 * string, i.e. the name of the joined directory. */
357 r = symlink(options, t);
359 #ifdef SMACK_RUN_LABEL
360 _cleanup_free_ char *src;
361 src = strappend("/sys/fs/cgroup/", options);
364 r = mac_smack_copy(t, src);
/* -EOPNOTSUPP from the label copy is tolerated. */
365 if (r < 0 && r != -EOPNOTSUPP)
366 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
/* A symlink that already exists (EEXIST) is not an error. */
368 } else if (errno != EEXIST)
369 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
374 /* Now that we mounted everything, let's make the tmpfs the
375 * cgroup file systems are mounted into read-only. */
376 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
381 #if HAVE_SELINUX || ENABLE_SMACK
/* Tree-walk callback that relabels visited paths with label_fix().
 * NOTE(review): the line carrying the function name and first parameter is
 * not visible in this view; mount_setup() passes "nftw_cb" to nftw(), which
 * is presumably this function.
 *
 * sb:     stat information for the visited path; no use is visible here.
 * ftwbuf: walk state; ->level is used to special-case the walk root
 *         (level 0) and first-level children (level 1).
 *
 * Returns FTW_SKIP_SUBTREE for "/run/initramfs" at level 1; the other return
 * statements are not visible in this view. */
384 const struct stat *sb,
386 struct FTW *ftwbuf) {
388 /* No need to label /dev twice in a row... */
389 if (_unlikely_(ftwbuf->level == 0))
392 label_fix(fpath, false, false);
394 /* /run/initramfs is static data and big, no need to
395 * dynamically relabel its contents at boot... */
396 if (_unlikely_(ftwbuf->level == 1 &&
398 streq(fpath, "/run/initramfs")))
399 return FTW_SKIP_SUBTREE;
406 int mount_setup(bool loaded_policy) {
409 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
413 #if 0 /// elogind does not control /, /dev, /run and /run/systemd/* are setup elsewhere.
414 #if HAVE_SELINUX || ENABLE_SMACK
415 /* Nodes in devtmpfs and /run need to be manually updated for
416 * the appropriate labels, after mounting. The other virtual
417 * API file systems like /sys and /proc do not need that, they
418 * use the same label for all their files. */
420 usec_t before_relabel, after_relabel;
421 char timespan[FORMAT_TIMESPAN_MAX];
423 before_relabel = now(CLOCK_MONOTONIC);
425 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
426 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
427 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
429 /* Temporarily remount the root cgroup filesystem to give it a proper label. */
430 r = cg_all_unified();
432 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
433 label_fix("/sys/fs/cgroup", false, false);
434 nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
435 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
437 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
439 after_relabel = now(CLOCK_MONOTONIC);
441 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
442 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
446 /* Create a few default symlinks, which are normally created
447 * by udevd, but some scripts might need them before we start
449 dev_setup(NULL, UID_INVALID, GID_INVALID);
451 /* Mark the root directory as shared with regard to mount propagation. The kernel defaults to "private", but we
452 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
453 * the box. If specific setups need other settings they can reset the propagation mode to private if
454 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
455 * container manager we assume the container manager knows what it is doing (for example, because it set up
456 * some directories with different propagation modes). */
457 if (detect_container() <= 0)
458 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
459 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
462 /* Create a few directories we always want around. Note that sd_booted() checks for /run/systemd/system, so
463 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
464 * misdetect systemd. */
465 (void) mkdir_label("/run/systemd", 0755);
466 #if 0 /// Yeah, but elogind is not used with systemd, so this directory would be toxic.
467 (void) mkdir_label("/run/systemd/system", 0755);
470 /* Set up inaccessible items */
471 (void) mkdir_label("/run/systemd/inaccessible", 0000);
472 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
473 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
474 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
475 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
476 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
477 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);