1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Classifies how an entry of the mount list is treated. When two entries
 * share a path, mount_path_compare() orders them by the numeric value of
 * this enum, so the declaration order matters. The enumerators themselves
 * are not visible in this excerpt. */
47 typedef enum MountMode {
48 /* This is ordered by priority! */
/* One entry of the mount list built and processed in this file. The code
 * here reads its path, mode and ignore members; the member declarations
 * are not visible in this excerpt. */
58 typedef struct BindMount {
/* Walks the string list strv and, judging by the callers, records one
 * BindMount per string at the cursor *p, tagged with the given mode.
 * NOTE(review): only part of the loop body is visible in this excerpt;
 * confirm the return convention against the full source. */
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
70 STRV_FOREACH(i, strv) {
/* A leading dash on the path is special-cased for these three modes;
 * presumably it marks the entry as ignorable -- TODO confirm. */
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Paths that are not absolute take a separate branch (body not shown). */
80 if (!path_is_absolute(*i))
/* Ordering callback handed to qsort() for the BindMount array. Entries with
 * equal paths are ordered by comparing their mode values; for unequal paths
 * the prefix relation between the two paths decides the order.
 * NOTE(review): the values returned by each branch are not visible in this
 * excerpt. */
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
94 if (path_equal(p->path, q->path)) {
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
100 if (p->mode > q->mode)
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
110 if (path_startswith(q->path, p->path))
/* Scans the array m of *n entries front to back while remembering a
 * previous entry, and singles out entries whose path equals that of the
 * previous one. The caller sorts the array beforehand, which places equal
 * paths next to each other.
 * NOTE(review): how the array is compacted and how *n is updated is not
 * visible in this excerpt. */
116 static void drop_duplicates(BindMount *m, unsigned *n) {
/* f is the read cursor; t is presumably the write cursor -- TODO confirm. */
117 BindMount *f, *t, *previous;
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement device directory below a temporary directory
 * (tmpfs, bind mounts of selected /dev subtrees, symlinks and copied device
 * nodes) and finally moves it over /dev/.
 * NOTE(review): declarations, error branches and the return statements are
 * only partly visible in this excerpt. */
138 static int mount_dev(BindMount *m) {
/* Device nodes to replicate; iterated with NULSTR_FOREACH further down.
 * The entries themselves are not shown here. */
139 static const char devnodes[] =
/* Template that mkdtemp() fills in with a unique directory name. */
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
156 if (!mkdtemp(temporary_mount))
159 dev = strappenda(temporary_mount, "/dev");
/* Fresh tmpfs that will become the new device directory. */
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
166 devpts = strappenda(temporary_mount, "/dev/pts");
/* Expose the existing /dev/pts inside the new tree via a bind mount. */
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
/* ptmx becomes a relative symlink into pts; the result is not checked. */
174 symlink("pts/ptmx", devptmx);
176 devshm = strappenda(temporary_mount, "/dev/shm");
/* Sticky, world-writable mount point for the shm bind mount; the mkdir
 * result is not checked, the mount result is kept in r. */
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* The next three bind mounts (mqueue, kdbus, hugepages) discard the results
 * of mkdir() and mount(); presumably best effort -- TODO confirm intent. */
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
196 devlog = strappenda(temporary_mount, "/dev/log");
/* log becomes a symlink to the journal socket path; result not checked. */
197 symlink("/run/systemd/journal/dev-log", devlog);
/* Replicate each listed device node inside the temporary tree. */
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
/* Anything that is neither a block nor a character device takes a separate
 * branch (body not shown). */
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
/* dn is the path of the node below the temporary directory. */
222 dn = strappend(temporary_mount, d);
/* The SELinux helpers bracket mknod(); the new node reuses the mode and
 * device number taken from st. */
228 mac_selinux_create_file_prepare(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 mac_selinux_create_file_clear();
238 dev_setup(temporary_mount);
/* Move the assembled tree over /dev/, then drop the temporary directory. */
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
246 rmdir(temporary_mount);
/* NOTE(review): the remaining lines look like the failure cleanup path
 * (unmount what was mounted, remove the directory) -- TODO confirm. */
261 umount(devhugepages);
268 rmdir(temporary_mount);
/* Builds a small tmpfs that holds a single bus device node, bind mounts
 * m->path over that node, and then moves the tmpfs onto basepath, which is
 * derived from m->path via dirname_malloc().
 * NOTE(review): error branches and return statements are only partly
 * visible in this excerpt. */
273 static int mount_kdbus(BindMount *m) {
/* Template that mkdtemp() fills in with a unique directory name. */
275 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
276 _cleanup_free_ char *basepath = NULL;
277 _cleanup_umask_ mode_t u;
278 char *busnode = NULL, *root;
286 if (!mkdtemp(temporary_mount)) {
/* NOTE(review): the message below appears to be missing a word; it is a
 * runtime string and is left unchanged here. */
287 log_error("Failed create temp dir: %m");
291 root = strappenda(temporary_mount, "/kdbus");
293 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
298 /* create a new /dev/null dev node copy so we have some fodder to
299 * bind-mount the custom endpoint over. */
300 if (stat("/dev/null", &st) < 0) {
301 log_error("Failed to stat /dev/null: %m");
306 busnode = strappenda(root, "/bus");
/* Keep the file type bits and device number of /dev/null, but replace all
 * permission bits with 0600. */
307 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
308 log_error("mknod() for %s failed: %m", busnode);
/* Overlay the custom endpoint on the placeholder node. */
313 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
315 log_error("bind mount of %s failed: %m", m->path);
/* presumably the directory containing m->path -- TODO confirm the
 * semantics of dirname_malloc(). */
320 basepath = dirname_malloc(m->path);
/* NOTE(review): this call moves the mount (MS_MOVE) while the message
 * below speaks of a bind mount; the runtime string is left unchanged. */
326 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
327 log_error("bind mount of %s failed: %m", basepath);
332 rmdir(temporary_mount);
/* NOTE(review): second rmdir presumably belongs to the failure cleanup
 * path -- TODO confirm. */
343 rmdir(temporary_mount);
/* Performs the mount step for one list entry m. Depending on the mode a
 * source path (what) is selected and bind mounted recursively onto m->path;
 * PRIVATE_BUS_ENDPOINT is delegated to mount_kdbus().
 * NOTE(review): part of the parameter list, most case labels and the return
 * statements are not visible in this excerpt. */
348 static int apply_mount(
351 const char *var_tmp_dir) {
362 /* First, get rid of everything that is below if there
363 * is anything... Then, overmount it with an
364 * inaccessible directory. */
365 umount_recursive(m->path, 0);
367 what = "/run/systemd/inaccessible";
372 /* Nothing to mount here, we just later toggle the
373 * MS_RDONLY bit for the mount point */
380 case PRIVATE_VAR_TMP:
387 case PRIVATE_BUS_ENDPOINT:
388 return mount_kdbus(m);
/* Any mode not handled above is treated as a programming error. */
391 assert_not_reached("Unknown mode");
/* Recursive bind mount of the selected source onto the entry path. */
396 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
398 log_debug("Successfully mounted %s to %s", what, m->path);
/* A missing path on an ignorable entry is special-cased; presumably it is
 * tolerated -- TODO confirm against the branch body. */
399 else if (m->ignore && errno == ENOENT)
/* Second pass over one list entry: calls bind_remount_recursive() on
 * m->path with true for INACCESSIBLE and READONLY entries and with false
 * for READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV entries. The
 * flag presumably selects read-only -- TODO confirm against the helper.
 * Modes outside these two sets are not remounted by the visible branches. */
405 static int make_read_only(BindMount *m) {
410 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
411 r = bind_remount_recursive(m->path, true);
412 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
413 r = bind_remount_recursive(m->path, false);
/* -ENOENT on an ignorable entry is special-cased; presumably tolerated --
 * TODO confirm against the branch body. */
417 if (m->ignore && r == -ENOENT)
/* Parameters of the namespace setup entry point: three path lists, the
 * private tmp directories, an optional bus endpoint path, the home and
 * system protection levels and the propagation flags for the final remount
 * of the root. NOTE(review): the line carrying the function name and some
 * parameters are not visible in this excerpt. */
424 char** read_write_dirs,
425 char** read_only_dirs,
426 char** inaccessible_dirs,
428 const char* var_tmp_dir,
429 const char* bus_endpoint_path,
431 ProtectHome protect_home,
432 ProtectSystem protect_system,
433 unsigned mount_flags) {
/* m is the write cursor into the array that starts at mounts. */
435 BindMount *m, *mounts = NULL;
/* Zero flags select shared propagation. */
439 if (mount_flags == 0)
440 mount_flags = MS_SHARED;
/* Detach into a new mount namespace before touching any mounts. */
442 if (unshare(CLONE_NEWNS) < 0)
/* Count the entries that will be appended below. The constants mirror the
 * literal lists used further down: three home paths, two system paths and
 * one more for PROTECT_SYSTEM_FULL. The assert after the appends checks
 * that the count and the cursor agree. */
445 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
446 strv_length(read_write_dirs) +
447 strv_length(read_only_dirs) +
448 strv_length(inaccessible_dirs) +
450 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
451 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
452 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* The array is obtained from alloca0(), so no explicit free appears here. */
455 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
456 r = append_mounts(&m, read_write_dirs, READWRITE);
460 r = append_mounts(&m, read_only_dirs, READONLY);
464 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
/* Fixed entries for the private tmp directories, the private device tree
 * and the bus endpoint follow; their guards are only partly visible. */
470 m->mode = PRIVATE_TMP;
475 m->path = "/var/tmp";
476 m->mode = PRIVATE_VAR_TMP;
482 m->mode = PRIVATE_DEV;
486 if (bus_endpoint_path) {
487 m->path = bus_endpoint_path;
488 m->mode = PRIVATE_BUS_ENDPOINT;
/* Home protection: the three paths become READONLY for the read-only level
 * and INACCESSIBLE otherwise. */
492 if (protect_home != PROTECT_HOME_NO) {
493 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
/* System protection: /usr and /boot become READONLY, plus /etc for the
 * full level. */
498 if (protect_system != PROTECT_SYSTEM_NO) {
499 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
504 assert(mounts + n == m);
/* Sort so that equal paths are adjacent, then prune the duplicates. */
506 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
507 drop_duplicates(mounts, &n);
511 /* Remount / as SLAVE so that nothing now mounted in the namespace
512 shows up in the parent */
513 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* First pass: establish every mount. */
516 for (m = mounts; m < mounts + n; ++m) {
517 r = apply_mount(m, tmp_dir, var_tmp_dir);
/* Second pass: apply the read-only or writable state to each mount. */
522 for (m = mounts; m < mounts + n; ++m) {
523 r = make_read_only(m);
529 /* Remount / as the desired mode. Note that this will not
530 * reestablish propagation from our side to the host, since
531 * what's disconnected is disconnected. */
532 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* NOTE(review): the loop below presumably belongs to the failure path and
 * lazily detaches the mounts again -- TODO confirm. */
541 for (m = mounts; m < mounts + n; ++m)
543 umount2(m->path, MNT_DETACH);
/* Creates one private temporary directory below prefix for the given id,
 * with a tmp subdirectory inside it. NOTE(review): the creation of the
 * outer directory, the assignment to *path and the return statements are
 * not visible in this excerpt. */
549 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
550 _cleanup_free_ char *x = NULL;
/* Buffer for the textual form of the boot id. */
551 char bid[SD_ID128_STRING_MAX];
559 /* We include the boot id in the directory so that after a
560 * reboot we can easily identify obsolete directories. */
562 r = sd_id128_get_boot(&boot_id);
/* Name template: prefix, a fixed systemd-private marker, the boot id, the
 * id and a trailing XXXXXX placeholder. */
566 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* presumably clears the umask so the mode passed to mkdir() is applied
 * as given -- TODO confirm the macro semantics. */
574 RUN_WITH_UMASK(0000) {
577 y = strappenda(x, "/tmp");
/* Inner directory is requested with mode 0777 plus the sticky bit. */
579 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Sets up the pair of private temporary directories for id: one below /tmp
 * (result in a) and one below /var/tmp (result in b), using
 * setup_one_tmp_dir() for each. NOTE(review): error handling and the
 * hand-over to *tmp_dir and *var_tmp_dir are not visible in this excerpt. */
589 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
597 r = setup_one_tmp_dir(id, "/tmp", &a);
601 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* Path of the tmp subdirectory inside the first directory; presumably used
 * for cleanup when the second setup fails -- TODO confirm. */
605 t = strappenda(a, "/tmp");
/* Joins or creates a shared network namespace, using the passed socket pair
 * as storage for the namespace file descriptor. If receiving from the
 * socket yields nothing, a new network namespace is created with unshare()
 * and a descriptor for it is opened; otherwise the received descriptor is
 * joined with setns(). A descriptor is then sent back into the socket pair.
 * NOTE(review): error branches, some declarations and the return statements
 * are not visible in this excerpt. */
619 int setup_netns(int netns_storage_socket[2]) {
/* Closed automatically on scope exit via the cleanup attribute macro. */
620 _cleanup_close_ int netns = -1;
/* Control buffer sized for exactly one int in the ancillary data. */
622 struct cmsghdr cmsghdr;
623 uint8_t buf[CMSG_SPACE(sizeof(int))];
626 .msg_control = &control,
627 .msg_controllen = sizeof(control),
629 struct cmsghdr *cmsg;
632 assert(netns_storage_socket);
633 assert(netns_storage_socket[0] >= 0);
634 assert(netns_storage_socket[1] >= 0);
636 /* We use the passed socketpair as a storage buffer for our
637 * namespace reference fd. Whatever process runs this first
638 * shall create a new namespace, all others should just join
639 * it. To serialize that we use a file lock on the socket
642 * It's a bit crazy, but hey, works great! */
644 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking receive: EAGAIN means nothing has been stored yet, any
 * other error takes a separate branch (body not shown). */
647 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
648 if (errno != EAGAIN) {
653 /* Nothing stored yet, so let's create a new namespace */
655 if (unshare(CLONE_NEWNET) < 0) {
/* Descriptor referring to the network namespace of this process. */
662 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
670 /* Yay, found something, so let's join the namespace */
/* Pick the descriptor out of the SCM_RIGHTS control message; the assert
 * requires a payload of exactly one int. */
672 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
673 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
674 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
675 netns = *(int*) CMSG_DATA(cmsg);
679 if (setns(netns, CLONE_NEWNET) < 0) {
/* Build a control message carrying the descriptor and queue it on the
 * other end of the socket pair. */
687 cmsg = CMSG_FIRSTHDR(&mh);
688 cmsg->cmsg_level = SOL_SOCKET;
689 cmsg->cmsg_type = SCM_RIGHTS;
690 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
691 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
692 mh.msg_controllen = cmsg->cmsg_len;
694 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken at the top; the result is not checked. */
700 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names for the ProtectHome values, indexed by enum value, followed
 * by the macro that generates the lookup helpers for this table. */
705 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
706 [PROTECT_HOME_NO] = "no",
707 [PROTECT_HOME_YES] = "yes",
708 [PROTECT_HOME_READ_ONLY] = "read-only",
711 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* String names for the ProtectSystem values, indexed by enum value,
 * followed by the macro that generates the lookup helpers for this table. */
713 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
714 [PROTECT_SYSTEM_NO] = "no",
715 [PROTECT_SYSTEM_YES] = "yes",
716 [PROTECT_SYSTEM_FULL] = "full",
719 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);