1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
33 #include "path-util.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
/* Kind of mount operation to apply to a path. Values referenced elsewhere in this file
 * are INACCESSIBLE, READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV and
 * PRIVATE_BUS_ENDPOINT. mount_path_compare() compares these values numerically, so the
 * declaration order matters. */
40 typedef enum MountMode {
41 /* This is ordered by priority! */
/* One entry of the mount list built and processed in this file. Code in this file
 * accesses its `path`, `mode` and `ignore` members; the member declarations themselves
 * are not visible in this excerpt. */
51 typedef struct BindMount {
/* Processes every string in `strv` for the given `mode`. Callers in this file pass the
 * address of their write cursor as `p` and store the int result in `r`.
 * NOTE(review): only the loop header and two conditions are visible in this excerpt;
 * what is written through `p` and what is returned must be confirmed in the full file. */
58 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
63 STRV_FOREACH(i, strv) {
/* A leading '-' is special-cased for these three modes; presumably it marks the entry
 * as optional (cf. the `ignore` member tested elsewhere in this file) -- TODO confirm. */
68 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Relative paths are singled out here; the action taken is not visible in this excerpt. */
73 if (!path_is_absolute(*i))
/* Ordering callback handed to qsort() when the mount list is sorted. `a` and `b` point
 * at BindMount entries. Entries with equal paths are ordered by `mode`; for differing
 * paths, a prefix relationship between the two paths decides the order.
 * NOTE(review): the concrete return values sit on lines not visible in this excerpt. */
84 static int mount_path_compare(const void *a, const void *b) {
85 const BindMount *p = a, *q = b;
87 if (path_equal(p->path, q->path)) {
89 /* If the paths are equal, check the mode */
90 if (p->mode < q->mode)
93 if (p->mode > q->mode)
99 /* If the paths are not equal, then order prefixes first */
/* True when q->path is a prefix of p->path. */
100 if (path_startswith(p->path, q->path))
/* True when p->path is a prefix of q->path. */
103 if (path_startswith(q->path, p->path))
/* Scans the array `m` of `*n` entries and detects entries whose path equals that of the
 * previously considered entry. The caller in this file invokes it right after qsort(),
 * so equal paths are adjacent at that point.
 * `f` is the read cursor; `t` starts at the same position and is presumably the write
 * cursor used for in-place compaction, with `*n` updated afterwards -- those lines are
 * not visible in this excerpt, TODO confirm. */
109 static void drop_duplicates(BindMount *m, unsigned *n) {
110 BindMount *f, *t, *previous;
115 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
117 /* The first one wins */
118 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement device directory inside a temporary directory created from
 * the "/tmp/namespace-dev-XXXXXX" template and finally moves it onto "/dev/" using
 * MS_MOVE. Visible steps: a tmpfs for the new tree, bind mounts of /dev/pts and
 * /dev/shm, best-effort bind mounts of /dev/mqueue and /dev/hugepages, symlinks for
 * ptmx and log, and mknod() copies of the nodes named in `devnodes`.
 * NOTE(review): error branches and the return statements are on lines not visible in
 * this excerpt. */
131 static int mount_dev(BindMount *m) {
132 static const char devnodes[] =
140 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
141 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
142 _cleanup_umask_ mode_t u;
/* mkdtemp() replaces the XXXXXX suffix of the template in place. */
149 if (!mkdtemp(temporary_mount))
152 dev = strjoina(temporary_mount, "/dev");
153 (void) mkdir(dev, 0755);
/* Fresh tmpfs that will become the new device directory: nosuid, strict atime, mode 0755. */
154 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
159 devpts = strjoina(temporary_mount, "/dev/pts");
160 (void) mkdir(devpts, 0755);
161 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
166 devptmx = strjoina(temporary_mount, "/dev/ptmx");
/* ptmx is a relative symlink into the bind-mounted pts directory created above. */
167 if (symlink("pts/ptmx", devptmx) < 0) {
172 devshm = strjoina(temporary_mount, "/dev/shm");
/* 01777: world-writable with the sticky bit set. */
173 (void) mkdir(devshm, 01777);
174 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* For mqueue and hugepages the mkdir() and mount() results are explicitly discarded. */
180 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
181 (void) mkdir(devmqueue, 0755);
182 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
184 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
185 (void) mkdir(devhugepages, 0755);
186 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
188 devlog = strjoina(temporary_mount, "/dev/log");
/* The log symlink is best-effort as well; its result is discarded. */
189 (void) symlink("/run/systemd/journal/dev-log", devlog);
/* Iterate over the NUL-separated node names in `devnodes`. */
191 NULSTR_FOREACH(d, devnodes) {
192 _cleanup_free_ char *dn = NULL;
/* Branch for entries that are neither block nor character devices; its body is not
 * visible in this excerpt. */
205 if (!S_ISBLK(st.st_mode) &&
206 !S_ISCHR(st.st_mode)) {
/* Target path of the copy: the temporary directory followed by the node name. */
214 dn = strappend(temporary_mount, d);
/* mknod() reuses the mode and device number from `st`; it is bracketed by the
 * mac_selinux_create_file_prepare()/clear() pair (presumably to label the new node --
 * TODO confirm). */
220 mac_selinux_create_file_prepare(d, st.st_mode);
221 r = mknod(dn, st.st_mode, st.st_rdev);
222 mac_selinux_create_file_clear();
230 dev_setup(temporary_mount);
/* Move the assembled tree onto the real location. */
232 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
/* NOTE(review): rmdir(temporary_mount) appears twice below with an umount() in
 * between; presumably the first belongs to the success path and the rest to failure
 * cleanup -- the separating lines are not visible in this excerpt. */
238 rmdir(temporary_mount);
250 umount(devhugepages);
257 rmdir(temporary_mount);
/* Sets up a private bus node for the endpoint named by m->path. Visible steps: create a
 * temporary directory from the "/tmp/kdbus-dev-XXXXXX" template, mount a tmpfs on its
 * "kdbus" subdirectory, create a "bus" device node there modelled on /dev/null,
 * bind-mount m->path over that node, and move the tmpfs onto the parent directory of
 * m->path.
 * NOTE(review): the return statements and cleanup labels are on lines not visible in
 * this excerpt. */
262 static int mount_kdbus(BindMount *m) {
264 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
265 _cleanup_free_ char *basepath = NULL;
266 _cleanup_umask_ mode_t u;
267 char *busnode = NULL, *root;
275 if (!mkdtemp(temporary_mount))
/* NOTE(review): the message reads "Failed create temp dir" -- likely missing "to". */
276 return log_error_errno(errno, "Failed create temp dir: %m");
278 root = strjoina(temporary_mount, "/kdbus");
279 (void) mkdir(root, 0755);
280 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
285 /* create a new /dev/null dev node copy so we have some fodder to
286 * bind-mount the custom endpoint over. */
287 if (stat("/dev/null", &st) < 0) {
288 log_error_errno(errno, "Failed to stat /dev/null: %m");
293 busnode = strjoina(root, "/bus");
/* Keep the file-type bits of /dev/null but replace the permission bits with 0600. */
294 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
295 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
300 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
302 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
/* Parent directory of the endpoint path; this is the target of the move below. */
307 basepath = dirname_malloc(m->path);
313 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
/* NOTE(review): this reports a failed MS_MOVE as "bind mount ... failed" -- confirm
 * whether the wording is intended. */
314 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
/* NOTE(review): rmdir(temporary_mount) appears twice; presumably once for the success
 * path and once for failure cleanup -- the separating lines are not visible. */
319 rmdir(temporary_mount);
330 rmdir(temporary_mount);
/* Carries out the mount for a single list entry, selecting the bind-mount source from
 * the entry's mode. PRIVATE_BUS_ENDPOINT entries are delegated to mount_kdbus(); the
 * other visible paths end in a recursive bind mount of `what` onto m->path.
 * NOTE(review): the first parameters, the switch header and several case labels are on
 * lines not visible in this excerpt. */
335 static int apply_mount(
338 const char *var_tmp_dir) {
349 /* First, get rid of everything that is below if there
350 * is anything... Then, overmount it with an
351 * inaccessible directory. */
352 umount_recursive(m->path, 0);
354 what = "/run/systemd/inaccessible";
359 /* Nothing to mount here, we just later toggle the
360 * MS_RDONLY bit for the mount point */
367 case PRIVATE_VAR_TMP:
374 case PRIVATE_BUS_ENDPOINT:
375 return mount_kdbus(m);
378 assert_not_reached("Unknown mode");
/* Recursive bind mount of the selected source onto the entry's path. */
383 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
385 log_debug("Successfully mounted %s to %s", what, m->path);
/* A missing path (ENOENT) is treated separately for entries flagged `ignore`; the
 * action taken is not visible in this excerpt. */
386 else if (m->ignore && errno == ENOENT)
/* Applies the final read-only state to one list entry via bind_remount_recursive():
 * INACCESSIBLE and READONLY entries pass `true`, READWRITE and the PRIVATE_TMP,
 * PRIVATE_VAR_TMP and PRIVATE_DEV entries pass `false`. PRIVATE_BUS_ENDPOINT appears in
 * neither visible branch.
 * NOTE(review): the exact meaning of the boolean argument is defined by
 * bind_remount_recursive(), which is not part of this excerpt. */
392 static int make_read_only(BindMount *m) {
397 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
398 r = bind_remount_recursive(m->path, true);
399 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
400 r = bind_remount_recursive(m->path, false);
/* -ENOENT is treated separately for entries flagged `ignore`; the action taken is not
 * visible in this excerpt. */
404 if (m->ignore && r == -ENOENT)
/* Parameter list and body of the mount-namespace setup routine (the line carrying the
 * function name is not visible in this excerpt). Visible flow: unshare the mount
 * namespace, size and fill a list of BindMount entries from the arguments, sort and
 * de-duplicate it, mark "/" as MS_SLAVE recursively, apply every mount, adjust the
 * read-only state of every entry, and finally remount "/" with `mount_flags`. */
411 char** read_write_dirs,
412 char** read_only_dirs,
413 char** inaccessible_dirs,
415 const char* var_tmp_dir,
416 const char* bus_endpoint_path,
418 ProtectHome protect_home,
419 ProtectSystem protect_system,
420 unsigned long mount_flags) {
422 BindMount *m, *mounts = NULL;
/* Zero means "no explicit choice"; MS_SHARED is used in that case. */
426 if (mount_flags == 0)
427 mount_flags = MS_SHARED;
429 if (unshare(CLONE_NEWNS) < 0)
/* Count the list slots needed. The constants mirror the path lists appended further
 * down: three paths for protect_home, two for protect_system plus one more when it is
 * PROTECT_SYSTEM_FULL. */
432 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
433 strv_length(read_write_dirs) +
434 strv_length(read_only_dirs) +
435 strv_length(inaccessible_dirs) +
437 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
438 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
439 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* `mounts` keeps the start of the array while `m` serves as the write cursor
 * (alloca0 is presumably a zero-initialising stack allocation -- TODO confirm). */
442 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
443 r = append_mounts(&m, read_write_dirs, READWRITE);
447 r = append_mounts(&m, read_only_dirs, READONLY);
451 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
457 m->mode = PRIVATE_TMP;
462 m->path = "/var/tmp";
463 m->mode = PRIVATE_VAR_TMP;
469 m->mode = PRIVATE_DEV;
473 if (bus_endpoint_path) {
474 m->path = bus_endpoint_path;
475 m->mode = PRIVATE_BUS_ENDPOINT;
479 if (protect_home != PROTECT_HOME_NO) {
480 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
485 if (protect_system != PROTECT_SYSTEM_NO) {
486 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
/* Sanity check: the cursor must sit exactly at the end of the array, i.e. every slot
 * counted in `n` has been filled. */
491 assert(mounts + n == m);
/* Sort with mount_path_compare(), then let drop_duplicates() process the sorted list
 * (it receives `&n`). */
493 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
494 drop_duplicates(mounts, &n);
498 /* Remount / as SLAVE so that nothing now mounted in the namespace
499 shows up in the parent */
500 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* First pass: establish every mount. */
503 for (m = mounts; m < mounts + n; ++m) {
504 r = apply_mount(m, tmp_dir, var_tmp_dir);
/* Second pass: set the read-only state of every entry. */
509 for (m = mounts; m < mounts + n; ++m) {
510 r = make_read_only(m);
516 /* Remount / as the desired mode. Note that this will not
517 * reestablish propagation from our side to the host, since
518 * what's disconnected is disconnected. */
519 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* Lazily detach the listed paths; presumably this loop is the failure-path cleanup --
 * the surrounding label and condition are not visible in this excerpt. */
528 for (m = mounts; m < mounts + n; ++m)
530 (void) umount2(m->path, MNT_DETACH);
/* Prepares one private temporary directory below `prefix`. The directory name template
 * is "<prefix>/systemd-private-<boot id>-<id>-XXXXXX", and a "tmp" subdirectory is
 * created inside it.
 * NOTE(review): the call that materialises the template and the assignment to `*path`
 * are on lines not visible in this excerpt. */
536 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
537 _cleanup_free_ char *x = NULL;
538 char bid[SD_ID128_STRING_MAX];
546 /* We include the boot id in the directory so that after a
547 * reboot we can easily identify obsolete directories. */
549 r = sd_id128_get_boot(&boot_id);
553 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* With the umask cleared, the mode passed to mkdir() below is applied unmasked:
 * world-writable plus the sticky bit. */
561 RUN_WITH_UMASK(0000) {
564 y = strjoina(x, "/tmp");
566 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Creates the pair of private temporary directories for `id`: one below "/tmp" and one
 * below "/var/tmp", each via setup_one_tmp_dir().
 * NOTE(review): how `*tmp_dir` and `*var_tmp_dir` are assigned, and what happens when
 * the second call fails, is on lines not visible in this excerpt. */
576 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
584 r = setup_one_tmp_dir(id, "/tmp", &a);
588 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* Path of the "tmp" subdirectory inside the first directory; its use is not visible
 * here (presumably cleanup after a failure of the second call -- TODO confirm). */
592 t = strjoina(a, "/tmp");
/* Joins, or creates and publishes, a network namespace shared through the socket pair
 * `netns_storage_socket`. Both descriptors must be valid (asserted below). If a
 * namespace fd can be received from the pair it is joined with setns(); otherwise a
 * new namespace is created with unshare() and a descriptor for it is opened. The
 * descriptor is then sent into the pair as an SCM_RIGHTS message.
 * NOTE(review): return values and the exact branch structure are on lines not visible
 * in this excerpt. */
606 int setup_netns(int netns_storage_socket[2]) {
607 _cleanup_close_ int netns = -1;
/* Control-message buffer sized for exactly one file descriptor. */
609 struct cmsghdr cmsghdr;
610 uint8_t buf[CMSG_SPACE(sizeof(int))];
613 .msg_control = &control,
614 .msg_controllen = sizeof(control),
616 struct cmsghdr *cmsg;
619 assert(netns_storage_socket);
620 assert(netns_storage_socket[0] >= 0);
621 assert(netns_storage_socket[1] >= 0);
623 /* We use the passed socketpair as a storage buffer for our
624 * namespace reference fd. Whatever process runs this first
625 * shall create a new namespace, all others should just join
626 * it. To serialize that we use a file lock on the socket
629 * It's a bit crazy, but hey, works great! */
631 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking receive: EAGAIN means nothing has been stored in the pair yet. */
634 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
635 if (errno != EAGAIN) {
640 /* Nothing stored yet, so let's create a new namespace */
642 if (unshare(CLONE_NEWNET) < 0) {
649 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
657 /* Yay, found something, so let's join the namespace */
659 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
660 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
/* NOTE(review): cmsg_len comes from the received message; a mismatch aborts via
 * assert() rather than being reported as an error -- confirm this is intended. */
661 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
662 netns = *(int*) CMSG_DATA(cmsg);
666 if (setns(netns, CLONE_NEWNET) < 0) {
/* Build an SCM_RIGHTS control message carrying `netns` and send it into the pair. */
674 cmsg = CMSG_FIRSTHDR(&mh);
675 cmsg->cmsg_level = SOL_SOCKET;
676 cmsg->cmsg_type = SCM_RIGHTS;
677 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
678 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
679 mh.msg_controllen = cmsg->cmsg_len;
681 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken at the top; the result of this call is not checked. */
687 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names for the ProtectHome values, indexed by enum value. The table is handed
 * to DEFINE_STRING_TABLE_LOOKUP, which presumably generates the string<->enum
 * conversion helpers for this type (macro definition not part of this excerpt). */
692 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
693 [PROTECT_HOME_NO] = "no",
694 [PROTECT_HOME_YES] = "yes",
695 [PROTECT_HOME_READ_ONLY] = "read-only",
698 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* String names for the ProtectSystem values, indexed by enum value. The table is handed
 * to DEFINE_STRING_TABLE_LOOKUP, which presumably generates the string<->enum
 * conversion helpers for this type (macro definition not part of this excerpt). */
700 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
701 [PROTECT_SYSTEM_NO] = "no",
702 [PROTECT_SYSTEM_YES] = "yes",
703 [PROTECT_SYSTEM_FULL] = "full",
706 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);