1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
33 #include "path-util.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
/* Kind of treatment a mount entry receives. Values that this file compares
 * against or stores in a mode include INACCESSIBLE, READONLY, READWRITE,
 * PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV and PRIVATE_BUS_ENDPOINT.
 * mount_path_compare() compares modes numerically when two entries share a
 * path, so the declaration order is significant. */
40 typedef enum MountMode {
41 /* This is ordered by priority! */
/* One entry in the array of mounts to establish. Code in this file reads the
 * members path, mode and ignore of such an entry. */
51 typedef struct BindMount {
/* Iterates over the string vector strv on behalf of a caller that passes a
 * cursor into its BindMount array (*p) and the mode to apply to each entry.
 * NOTE(review): callers assert afterwards that the cursor advanced by exactly
 * the number of strings, so presumably one BindMount is written per string --
 * confirm. */
58 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
63 STRV_FOREACH(i, strv) {
/* For these three modes a leading '-' on the entry is special-cased.
 * NOTE(review): presumably it marks the entry as ignorable when the path is
 * missing (see the "m->ignore && ... ENOENT" checks elsewhere) -- confirm. */
68 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Catch entries that are not absolute paths. */
73 if (!path_is_absolute(*i))
/* Comparison callback handed to qsort() for arrays of BindMount.
 * a and b point to the two BindMount elements being compared.
 * Entries with equal paths are ordered by their mode value; for differing
 * paths the prefix relationship between the two paths decides the order. */
84 static int mount_path_compare(const void *a, const void *b) {
85 const BindMount *p = a, *q = b;
87 if (path_equal(p->path, q->path)) {
89 /* If the paths are equal, check the mode */
90 if (p->mode < q->mode)
93 if (p->mode > q->mode)
99 /* If the paths are not equal, then order prefixes first */
/* Both directions are tested: p below q, then q below p. */
100 if (path_startswith(p->path, q->path))
103 if (path_startswith(q->path, p->path))
/* Scans the array m of *n entries and detects entries whose path equals that
 * of the previously remembered entry. The only caller sorts the array with
 * mount_path_compare() immediately beforehand.
 * n is passed by pointer. NOTE(review): presumably it is updated to the
 * number of entries kept -- confirm. */
109 static void drop_duplicates(BindMount *m, unsigned *n) {
110 BindMount *f, *t, *previous;
/* f visits every entry in order; t also starts at the beginning of the array
 * and previous starts out unset. */
115 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
117 /* The first one wins */
118 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement /dev tree inside a freshly created temporary
 * directory under /tmp and finally moves it onto "/dev/" with MS_MOVE.
 * The tree consists of a tmpfs, bind mounts of several host /dev
 * subdirectories, two symlinks and copies of the device nodes listed in
 * devnodes. */
131 static int mount_dev(BindMount *m) {
132 static const char devnodes[] =
140 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
141 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
142 _cleanup_umask_ mode_t u;
/* mkdtemp() replaces the XXXXXX suffix of the template in place. */
149 if (!mkdtemp(temporary_mount))
/* The new /dev itself: a tmpfs mounted at <temporary_mount>/dev. */
152 dev = strjoina(temporary_mount, "/dev");
153 (void)mkdir(dev, 0755);
154 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
/* Bind the host's /dev/pts into the new tree. */
159 devpts = strjoina(temporary_mount, "/dev/pts");
160 (void)mkdir(devpts, 0755);
161 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
/* ptmx becomes a relative symlink into the pts directory.
 * NOTE(review): the result of symlink() is not checked here. */
166 devptmx = strjoina(temporary_mount, "/dev/ptmx");
167 symlink("pts/ptmx", devptmx);
/* Bind the host's /dev/shm; the mount point is created with mode 01777. */
169 devshm = strjoina(temporary_mount, "/dev/shm");
170 (void)mkdir(devshm, 01777);
171 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* mqueue and hugepages are bind-mounted without checking the result of
 * mount(), i.e. on a best-effort basis. */
177 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
178 (void)mkdir(devmqueue, 0755);
179 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
181 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
182 (void)mkdir(devhugepages, 0755);
183 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
/* /dev/log becomes a symlink to /run/systemd/journal/dev-log.
 * NOTE(review): the result of symlink() is not checked here either. */
185 devlog = strjoina(temporary_mount, "/dev/log");
186 symlink("/run/systemd/journal/dev-log", devlog);
/* Recreate each node named in devnodes below the temporary directory. */
188 NULSTR_FOREACH(d, devnodes) {
189 _cleanup_free_ char *dn = NULL;
/* Entries that are neither block nor character devices get special
 * treatment. */
202 if (!S_ISBLK(st.st_mode) &&
203 !S_ISCHR(st.st_mode)) {
/* d is appended directly to temporary_mount to form the target path. */
211 dn = strappend(temporary_mount, d);
/* The node is created with the same mode and device number as recorded in
 * st, bracketed by the SELinux file-creation prepare/clear calls. */
217 mac_selinux_create_file_prepare(d, st.st_mode);
218 r = mknod(dn, st.st_mode, st.st_rdev);
219 mac_selinux_create_file_clear();
227 dev_setup(temporary_mount);
/* Move the finished tree over the real /dev. */
229 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
/* The temporary directory is removed once the tree has been moved. */
235 rmdir(temporary_mount);
/* NOTE(review): the remaining calls look like the failure clean-up path
 * (undoing mounts, then removing the temporary directory) -- confirm. */
247 umount(devhugepages);
254 rmdir(temporary_mount);
/* Builds a small tmpfs in a temporary directory under /tmp that contains a
 * single node "bus", bind-mounts m->path onto that node and then moves the
 * tmpfs onto the directory that contains m->path. */
259 static int mount_kdbus(BindMount *m) {
261 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
262 _cleanup_free_ char *basepath = NULL;
263 _cleanup_umask_ mode_t u;
264 char *busnode = NULL, *root;
/* NOTE(review): the log message below reads "Failed create temp dir"; it is
 * probably missing a "to". */
272 if (!mkdtemp(temporary_mount))
273 return log_error_errno(errno, "Failed create temp dir: %m");
/* NOTE(review): this tmpfs is mounted with mode=777, whereas mount_dev() uses
 * mode=755 for its tmpfs -- confirm the wider mode is intended. */
275 root = strjoina(temporary_mount, "/kdbus");
276 (void)mkdir(root, 0755);
277 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
282 /* create a new /dev/null dev node copy so we have some fodder to
283 * bind-mount the custom endpoint over. */
284 if (stat("/dev/null", &st) < 0) {
285 log_error_errno(errno, "Failed to stat /dev/null: %m");
/* The copy keeps /dev/null's file type and device number but replaces the
 * permission bits with 0600. */
290 busnode = strjoina(root, "/bus");
291 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
292 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
/* Bind the endpoint given in m->path over the freshly created node. */
297 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
299 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
/* The tmpfs is moved onto the parent directory of m->path. */
304 basepath = dirname_malloc(m->path);
/* NOTE(review): this is an MS_MOVE, but the failure is logged as
 * "bind mount of ... failed". */
310 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
311 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
316 rmdir(temporary_mount);
/* NOTE(review): the second rmdir() presumably belongs to the failure
 * clean-up path -- confirm. */
327 rmdir(temporary_mount);
/* Establishes the mount for one BindMount entry according to its mode.
 * Most modes only pick a source path ("what") that is then recursively
 * bind-mounted onto m->path; PRIVATE_BUS_ENDPOINT is delegated to
 * mount_kdbus(). */
332 static int apply_mount(
335 const char *var_tmp_dir) {
346 /* First, get rid of everything that is below if there
347 * is anything... Then, overmount it with an
348 * inaccessible directory. */
349 umount_recursive(m->path, 0);
351 what = "/run/systemd/inaccessible";
356 /* Nothing to mount here, we just later toggle the
357 * MS_RDONLY bit for the mount point */
364 case PRIVATE_VAR_TMP:
371 case PRIVATE_BUS_ENDPOINT:
/* The bus endpoint has its own setup routine and returns directly. */
372 return mount_kdbus(m);
375 assert_not_reached("Unknown mode");
/* Common tail: recursive bind mount of the chosen source onto m->path. */
380 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
382 log_debug("Successfully mounted %s to %s", what, m->path);
/* A missing path (ENOENT) is special-cased for entries flagged as ignorable. */
383 else if (m->ignore && errno == ENOENT)
/* Second pass over a mount entry: calls bind_remount_recursive() on m->path
 * with a flag chosen by the entry's mode.
 * NOTE(review): the flag presumably selects read-only (true) versus writable
 * (false) -- confirm against bind_remount_recursive(). */
389 static int make_read_only(BindMount *m) {
394 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
395 r = bind_remount_recursive(m->path, true);
/* PRIVATE_BUS_ENDPOINT appears in neither set, so neither call is made for
 * such entries. */
396 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
397 r = bind_remount_recursive(m->path, false);
/* -ENOENT is special-cased for entries flagged as ignorable. */
401 if (m->ignore && r == -ENOENT)
/* The three string vectors list paths to be mounted READWRITE, READONLY and
 * INACCESSIBLE respectively. */
408 char** read_write_dirs,
409 char** read_only_dirs,
410 char** inaccessible_dirs,
/* Optional; each non-NULL value contributes one mount entry. */
412 const char* var_tmp_dir,
413 const char* bus_endpoint_path,
415 ProtectHome protect_home,
416 ProtectSystem protect_system,
/* Flags for the final remount of "/"; 0 selects MS_SHARED. */
417 unsigned long mount_flags) {
419 BindMount *m, *mounts = NULL;
423 if (mount_flags == 0)
424 mount_flags = MS_SHARED;
/* Everything below happens in a new mount namespace. */
426 if (unshare(CLONE_NEWNS) < 0)
/* Count the entries that will be appended below. The constants 3 and 2 (+1
 * for PROTECT_SYSTEM_FULL) must match the number of strings in the STRV_MAKE()
 * lists further down; the assert after the last append checks the total. */
429 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
430 strv_length(read_write_dirs) +
431 strv_length(read_only_dirs) +
432 strv_length(inaccessible_dirs) +
434 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
435 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
436 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* NOTE(review): the array is allocated with alloca0(), so its size grows with
 * the length of the caller-supplied path lists. m is the append cursor,
 * mounts the start of the array. */
439 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
440 r = append_mounts(&m, read_write_dirs, READWRITE);
444 r = append_mounts(&m, read_only_dirs, READONLY);
448 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
454 m->mode = PRIVATE_TMP;
459 m->path = "/var/tmp";
460 m->mode = PRIVATE_VAR_TMP;
466 m->mode = PRIVATE_DEV;
470 if (bus_endpoint_path) {
471 m->path = bus_endpoint_path;
472 m->mode = PRIVATE_BUS_ENDPOINT;
/* Home protection covers /home, /run/user and /root, all with a leading '-';
 * they become READONLY for PROTECT_HOME_READ_ONLY and INACCESSIBLE otherwise. */
476 if (protect_home != PROTECT_HOME_NO) {
477 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
/* System protection makes /usr and /boot read-only; PROTECT_SYSTEM_FULL adds
 * /etc. */
482 if (protect_system != PROTECT_SYSTEM_NO) {
483 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
/* The cursor must have advanced by exactly the precomputed count. */
488 assert(mounts + n == m);
/* Sort, then drop duplicates; drop_duplicates() receives n by pointer. */
490 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
491 drop_duplicates(mounts, &n);
495 /* Remount / as SLAVE so that nothing now mounted in the namespace
496 shows up in the parent */
497 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two passes over the sorted entries: first establish every mount, then
 * adjust the read-only state of each. */
500 for (m = mounts; m < mounts + n; ++m) {
501 r = apply_mount(m, tmp_dir, var_tmp_dir);
506 for (m = mounts; m < mounts + n; ++m) {
507 r = make_read_only(m);
513 /* Remount / as the desired mode. Note that this will not
514 * reestablish propagation from our side to the host, since
515 * what's disconnected is disconnected. */
516 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* Lazily detach the entries again.
 * NOTE(review): presumably the failure path -- confirm. */
525 for (m = mounts; m < mounts + n; ++m)
527 umount2(m->path, MNT_DETACH);
/* Prepares one private temporary directory below prefix.
 * id is embedded in the directory name; path is an output parameter.
 * NOTE(review): presumably *path receives the created directory -- confirm. */
533 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
534 _cleanup_free_ char *x = NULL;
535 char bid[SD_ID128_STRING_MAX];
543 /* We include the boot id in the directory so that after a
544 * reboot we can easily identify obsolete directories. */
546 r = sd_id128_get_boot(&boot_id);
/* Template: <prefix>/systemd-private-<boot id>-<id>-XXXXXX */
550 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* The inner "tmp" directory is created with mode 0777 plus the sticky bit
 * while the umask is 0000, so the requested mode is not masked. */
558 RUN_WITH_UMASK(0000) {
561 y = strjoina(x, "/tmp");
563 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Sets up a pair of private temporary directories for id, one below /tmp and
 * one below /var/tmp, via setup_one_tmp_dir(). tmp_dir and var_tmp_dir are
 * output parameters.
 * NOTE(review): presumably they receive the two directory paths on success --
 * confirm. */
573 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
581 r = setup_one_tmp_dir(id, "/tmp", &a);
585 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* Path of the inner "tmp" directory of the first result. */
589 t = strjoina(a, "/tmp");
/* Makes the calling process a member of a network namespace that is shared
 * through netns_storage_socket, a pair of socket fds that must both be valid.
 * If no namespace fd is queued on the pair yet, a new network namespace is
 * created; otherwise the queued one is joined. In both cases an fd referring
 * to the namespace is written to the pair again. */
603 int setup_netns(int netns_storage_socket[2]) {
604 _cleanup_close_ int netns = -1;
/* Control buffer sized for exactly one file descriptor. */
606 struct cmsghdr cmsghdr;
607 uint8_t buf[CMSG_SPACE(sizeof(int))];
610 .msg_control = &control,
611 .msg_controllen = sizeof(control),
613 struct cmsghdr *cmsg;
616 assert(netns_storage_socket);
617 assert(netns_storage_socket[0] >= 0);
618 assert(netns_storage_socket[1] >= 0);
620 /* We use the passed socketpair as a storage buffer for our
621 * namespace reference fd. Whatever process runs this first
622 * shall create a new namespace, all others should just join
623 * it. To serialize that we use a file lock on the socket
 * pair (taken on netns_storage_socket[0]).
 *
626 * It's a bit crazy, but hey, works great! */
628 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking read: EAGAIN means that nothing is queued yet, any other
 * error is treated separately. */
631 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
632 if (errno != EAGAIN) {
637 /* Nothing stored yet, so let's create a new namespace */
639 if (unshare(CLONE_NEWNET) < 0) {
/* Keep an fd that refers to the namespace we are now in. */
646 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
654 /* Yay, found something, so let's join the namespace */
/* Look for the SCM_RIGHTS control message that carries the fd.
 * NOTE(review): the payload is read through a cast of CMSG_DATA(); the
 * sending side below uses memcpy() instead. */
656 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
657 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
658 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
659 netns = *(int*) CMSG_DATA(cmsg);
663 if (setns(netns, CLONE_NEWNET) < 0) {
/* Queue the namespace fd on the pair again as a single SCM_RIGHTS message. */
671 cmsg = CMSG_FIRSTHDR(&mh);
672 cmsg->cmsg_level = SOL_SOCKET;
673 cmsg->cmsg_type = SCM_RIGHTS;
674 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
675 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
676 mh.msg_controllen = cmsg->cmsg_len;
678 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken above; the result of the unlock is not checked. */
684 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names of the ProtectHome values, indexed by enum value.
 * NOTE(review): DEFINE_STRING_TABLE_LOOKUP presumably generates the
 * string <-> enum conversion helpers from this table -- confirm. */
689 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
690 [PROTECT_HOME_NO] = "no",
691 [PROTECT_HOME_YES] = "yes",
692 [PROTECT_HOME_READ_ONLY] = "read-only",
695 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* String names of the ProtectSystem values, indexed by enum value.
 * NOTE(review): DEFINE_STRING_TABLE_LOOKUP presumably generates the
 * string <-> enum conversion helpers from this table -- confirm. */
697 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
698 [PROTECT_SYSTEM_NO] = "no",
699 [PROTECT_SYSTEM_YES] = "yes",
700 [PROTECT_SYSTEM_FULL] = "full",
703 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);