1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
25 //#include <sys/statvfs.h>
28 #include "alloc-util.h"
29 //#include "bus-util.h"
30 #include "cgroup-util.h"
31 //#include "dev-setup.h"
32 //#include "efivars.h"
38 //#include "missing.h"
40 #include "mount-setup.h"
41 #include "mount-util.h"
42 #include "path-util.h"
44 //#include "smack-util.h"
46 #include "user-util.h"
50 /// Additional includes needed by elogind
51 #include "string-util.h"
53 typedef enum MountMode {
56 MNT_IN_CONTAINER = 1 << 1,
57 MNT_CHECK_WRITABLE = 1 << 2,
60 typedef struct MountPoint {
66 bool (*condition_fn)(void);
70 /* The first three entries we might need before SELinux is up. The
71 * fourth (securityfs) is needed by IMA to load a custom policy. The
72 * other ones we can delay until SELinux and IMA are loaded. When
73 * SMACK is enabled we need smackfs, too, so it's a fifth one. */
75 #define N_EARLY_MOUNT 5
77 #define N_EARLY_MOUNT 4
80 static const MountPoint mount_table[] = {
81 #if 0 /// UNNEEDED by elogind
82 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
83 NULL, MNT_FATAL|MNT_IN_CONTAINER },
84 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
85 NULL, MNT_FATAL|MNT_IN_CONTAINER },
86 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
87 NULL, MNT_FATAL|MNT_IN_CONTAINER },
88 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
91 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
92 mac_smack_use, MNT_FATAL },
93 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
94 mac_smack_use, MNT_FATAL },
96 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
97 NULL, MNT_FATAL|MNT_IN_CONTAINER },
98 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
99 NULL, MNT_IN_CONTAINER },
101 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
102 mac_smack_use, MNT_FATAL },
104 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
105 NULL, MNT_FATAL|MNT_IN_CONTAINER },
107 { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
109 { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
110 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
111 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
112 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
113 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
114 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
115 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
116 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
117 #if 0 /// UNNEEDED by elogind
118 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
119 cg_is_legacy_wanted, MNT_IN_CONTAINER },
120 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
121 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
122 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
125 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
126 is_efi_boot, MNT_NONE },
128 { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
131 { "cgroup", "/sys/fs/cgroup/elogind", "cgroup", "none,name=elogind,release_agent="SYSTEMD_CGROUP_AGENT_PATH",xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
132 cg_is_legacy_wanted, MNT_IN_CONTAINER },
133 { "cgroup", "/sys/fs/cgroup/elogind", "cgroup", "none,name=elogind,release_agent="SYSTEMD_CGROUP_AGENT_PATH, MS_NOSUID|MS_NOEXEC|MS_NODEV,
134 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
138 #if 0 /// UNNEEDED by elogind
139 /* These are API file systems that might be mounted by other software;
140 * we just list them here so that we know that we should ignore them. */
142 static const char ignore_paths[] =
143 /* SELinux file systems */
145 /* Container bind mounts */
150 bool mount_point_is_api(const char *path) {
153 /* Checks if this mount point is considered "API", and hence
154 * should be ignored */
156 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
157 if (path_equal(path, mount_table[i].where))
160 return path_startswith(path, "/sys/fs/cgroup/");
163 bool mount_point_ignore(const char *path) {
166 NULSTR_FOREACH(i, ignore_paths)
167 if (path_equal(path, i))
174 static int mount_one(const MountPoint *p, bool relabel) {
179 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
181 if (p->condition_fn && !p->condition_fn())
184 /* Relabel first, just in case */
186 (void) label_fix(p->where, true, true);
188 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
189 if (r < 0 && r != -ENOENT) {
190 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
191 return (p->mode & MNT_FATAL) ? r : 0;
196 /* Skip file systems not flagged MNT_IN_CONTAINER (e.g. securityfs) when running in a container */
197 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
200 /* The access mode here doesn't really matter too much, since
201 * the mounted file system will take precedence anyway. */
203 (void) mkdir_p_label(p->where, 0755);
205 (void) mkdir_p(p->where, 0755);
207 log_debug("Mounting %s to %s of type %s with options %s.",
218 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
219 return (p->mode & MNT_FATAL) ? -errno : 0;
222 /* Relabel again, since we now mounted something fresh here */
224 (void) label_fix(p->where, false, false);
226 if (p->mode & MNT_CHECK_WRITABLE) {
227 if (access(p->where, W_OK) < 0) {
230 (void) umount(p->where);
231 (void) rmdir(p->where);
233 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
234 return (p->mode & MNT_FATAL) ? r : 0;
241 static int mount_points_setup(unsigned n, bool loaded_policy) {
245 for (i = 0; i < n; i ++) {
248 j = mount_one(mount_table + i, loaded_policy);
249 if (j != 0 && r >= 0)
256 #if 0 /// UNNEEDED by elogind
257 int mount_setup_early(void) {
258 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
260 /* Do a minimal mount of /proc and friends to enable the most
261 * basic stuff, such as SELinux */
262 return mount_points_setup(N_EARLY_MOUNT, false);
265 int mount_cgroup_controllers(char ***join_controllers) {
266 _cleanup_set_free_free_ Set *controllers = NULL;
267 bool has_argument = !!join_controllers;
270 if (!cg_is_legacy_wanted())
273 /* Mount all available cgroup controllers that are built into the kernel. */
277 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
279 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
280 * work for groups with no initialized attributes.
282 join_controllers = (char**[]) {
283 STRV_MAKE("cpu", "cpuacct"),
284 STRV_MAKE("net_cls", "net_prio"),
288 r = cg_kernel_controllers(&controllers);
290 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
293 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
297 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
298 .mode = MNT_IN_CONTAINER,
302 controller = set_steal_first(controllers);
306 for (k = join_controllers; *k; k++)
307 if (strv_find(*k, controller))
313 for (i = *k, j = *k; *i; i++) {
315 if (!streq(*i, controller)) {
316 _cleanup_free_ char *t;
318 t = set_remove(controllers, *i);
331 options = strv_join(*k, ",");
335 options = controller;
339 where = strappend("/sys/fs/cgroup/", options);
346 r = mount_one(&p, true);
350 if (r > 0 && k && *k) {
353 for (i = *k; *i; i++) {
354 _cleanup_free_ char *t = NULL;
356 t = strappend("/sys/fs/cgroup/", *i);
360 r = symlink(options, t);
362 #ifdef SMACK_RUN_LABEL
363 _cleanup_free_ char *src;
364 src = strappend("/sys/fs/cgroup/", options);
367 r = mac_smack_copy(t, src);
368 if (r < 0 && r != -EOPNOTSUPP)
369 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
371 } else if (errno != EEXIST)
372 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
377 /* Now that we mounted everything, let's make the tmpfs the
378 * cgroup file systems are mounted into read-only. */
379 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
384 #if HAVE_SELINUX || ENABLE_SMACK
387 const struct stat *sb,
389 struct FTW *ftwbuf) {
391 /* No need to label /dev twice in a row... */
392 if (_unlikely_(ftwbuf->level == 0))
395 label_fix(fpath, false, false);
397 /* /run/initramfs is static data and big, no need to
398 * dynamically relabel its contents at boot... */
399 if (_unlikely_(ftwbuf->level == 1 &&
401 streq(fpath, "/run/initramfs")))
402 return FTW_SKIP_SUBTREE;
407 static int relabel_cgroup_filesystems(void) {
411 r = cg_all_unified();
413 /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this
414 only when the filesystem has already been populated by a previous instance of systemd
415 running from initrd. Otherwise don't remount anything and leave the filesystem read-write
416 for the cgroup filesystems to be mounted inside. */
417 r = statfs("/sys/fs/cgroup", &st);
419 return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m");
422 if (st.f_flags & ST_RDONLY)
423 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
425 label_fix("/sys/fs/cgroup", false, false);
426 nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
428 if (st.f_flags & ST_RDONLY)
429 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
431 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
438 int mount_setup(bool loaded_policy) {
441 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
445 #if 0 /// elogind does not control /, /dev, /run and /run/systemd/* are setup elsewhere.
446 #if HAVE_SELINUX || ENABLE_SMACK
447 /* Nodes in devtmpfs and /run need to be manually updated for
448 * the appropriate labels, after mounting. The other virtual
449 * API file systems like /sys and /proc do not need that, they
450 * use the same label for all their files. */
452 usec_t before_relabel, after_relabel;
453 char timespan[FORMAT_TIMESPAN_MAX];
455 before_relabel = now(CLOCK_MONOTONIC);
457 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
458 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
459 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
461 r = relabel_cgroup_filesystems();
465 after_relabel = now(CLOCK_MONOTONIC);
467 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
468 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
472 /* Create a few default symlinks, which are normally created
473 * by udevd, but some scripts might need them before we start
475 dev_setup(NULL, UID_INVALID, GID_INVALID);
477 /* Mark the root directory as shared with regard to mount propagation. The kernel defaults to "private", but we
478 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
479 * the box. If specific setups need other settings they can reset the propagation mode to private if
480 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
481 * container manager we assume the container manager knows what it is doing (for example, because it set up
482 * some directories with different propagation modes). */
483 if (detect_container() <= 0)
484 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
485 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
488 /* Create a few directories we always want around. Note that sd_booted() checks for /run/systemd/system, so
489 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
490 * misdetect systemd. */
491 (void) mkdir_label("/run/systemd", 0755);
492 #if 0 /// Yeah, but elogind is not used with systemd, so this directory would be toxic.
493 (void) mkdir_label("/run/systemd/system", 0755);
496 /* Set up inaccessible items */
497 (void) mkdir_label("/run/systemd/inaccessible", 0000);
498 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
499 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
500 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
501 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
502 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
503 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);