1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
47 typedef enum MountMode {
48 /* This is ordered by priority! */
58 typedef struct BindMount {
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
70 STRV_FOREACH(i, strv) {
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
80 if (!path_is_absolute(*i))
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
94 if (path_equal(p->path, q->path)) {
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
100 if (p->mode > q->mode)
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
110 if (path_startswith(q->path, p->path))
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
156 if (!mkdtemp(temporary_mount))
159 dev = strappenda(temporary_mount, "/dev");
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
166 devpts = strappenda(temporary_mount, "/dev/pts");
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
196 devlog = strappenda(temporary_mount, "/dev/log");
197 symlink("/run/systemd/journal/dev-log", devlog);
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
222 dn = strappend(temporary_mount, d);
228 label_context_set(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 label_context_clear();
238 dev_setup(temporary_mount);
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
246 rmdir(temporary_mount);
261 umount(devhugepages);
271 rmdir(temporary_mount);
276 static int mount_kdbus(BindMount *m) {
278 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
279 _cleanup_free_ char *basepath = NULL;
280 _cleanup_umask_ mode_t u;
281 char *busnode = NULL, *root;
289 if (!mkdtemp(temporary_mount)) {
290 log_error("Failed create temp dir: %m");
294 root = strappenda(temporary_mount, "/kdbus");
296 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
301 /* create a new /dev/null dev node copy so we have some fodder to
302 * bind-mount the custom endpoint over. */
303 if (stat("/dev/null", &st) < 0) {
304 log_error("Failed to stat /dev/null: %m");
309 busnode = strappenda(root, "/bus");
310 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
311 log_error("mknod() for %s failed: %m", busnode);
316 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
318 log_error("bind mount of %s failed: %m", m->path);
323 basepath = dirname_malloc(m->path);
329 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
330 log_error("bind mount of %s failed: %m", basepath);
335 rmdir(temporary_mount);
346 rmdir(temporary_mount);
351 static int apply_mount(
354 const char *var_tmp_dir) {
365 /* First, get rid of everything that is below if there
366 * is anything... Then, overmount it with an
367 * inaccessible directory. */
368 umount_recursive(m->path, 0);
370 what = "/run/systemd/inaccessible";
375 /* Nothing to mount here, we just later toggle the
376 * MS_RDONLY bit for the mount point */
383 case PRIVATE_VAR_TMP:
390 case PRIVATE_BUS_ENDPOINT:
391 return mount_kdbus(m);
394 assert_not_reached("Unknown mode");
399 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
401 log_debug("Successfully mounted %s to %s", what, m->path);
402 else if (m->ignore && errno == ENOENT)
408 static int make_read_only(BindMount *m) {
413 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
414 r = bind_remount_recursive(m->path, true);
415 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
416 r = bind_remount_recursive(m->path, false);
420 if (m->ignore && r == -ENOENT)
427 char** read_write_dirs,
428 char** read_only_dirs,
429 char** inaccessible_dirs,
432 char* bus_endpoint_path,
434 ProtectHome protect_home,
435 ProtectSystem protect_system,
436 unsigned mount_flags) {
438 BindMount *m, *mounts = NULL;
442 if (mount_flags == 0)
443 mount_flags = MS_SHARED;
445 if (unshare(CLONE_NEWNS) < 0)
448 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
449 strv_length(read_write_dirs) +
450 strv_length(read_only_dirs) +
451 strv_length(inaccessible_dirs) +
453 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
454 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
455 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
458 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
459 r = append_mounts(&m, read_write_dirs, READWRITE);
463 r = append_mounts(&m, read_only_dirs, READONLY);
467 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
473 m->mode = PRIVATE_TMP;
478 m->path = "/var/tmp";
479 m->mode = PRIVATE_VAR_TMP;
485 m->mode = PRIVATE_DEV;
489 if (bus_endpoint_path) {
490 m->path = bus_endpoint_path;
491 m->mode = PRIVATE_BUS_ENDPOINT;
495 if (protect_home != PROTECT_HOME_NO) {
496 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
501 if (protect_system != PROTECT_SYSTEM_NO) {
502 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
507 assert(mounts + n == m);
509 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
510 drop_duplicates(mounts, &n);
514 /* Remount / as SLAVE so that nothing now mounted in the namespace
515 shows up in the parent */
516 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
519 for (m = mounts; m < mounts + n; ++m) {
520 r = apply_mount(m, tmp_dir, var_tmp_dir);
525 for (m = mounts; m < mounts + n; ++m) {
526 r = make_read_only(m);
532 /* Remount / as the desired mode. Note that this will not
533 * reestablish propagation from our side to the host, since
534 * what's disconnected is disconnected. */
535 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
544 for (m = mounts; m < mounts + n; ++m)
546 umount2(m->path, MNT_DETACH);
552 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
553 _cleanup_free_ char *x = NULL;
554 char bid[SD_ID128_STRING_MAX];
562 /* We include the boot id in the directory so that after a
563 * reboot we can easily identify obsolete directories. */
565 r = sd_id128_get_boot(&boot_id);
569 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
577 RUN_WITH_UMASK(0000) {
580 y = strappenda(x, "/tmp");
582 if (mkdir(y, 0777 | S_ISVTX) < 0)
592 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
600 r = setup_one_tmp_dir(id, "/tmp", &a);
604 r = setup_one_tmp_dir(id, "/var/tmp", &b);
608 t = strappenda(a, "/tmp");
622 int setup_netns(int netns_storage_socket[2]) {
623 _cleanup_close_ int netns = -1;
625 struct cmsghdr cmsghdr;
626 uint8_t buf[CMSG_SPACE(sizeof(int))];
629 .msg_control = &control,
630 .msg_controllen = sizeof(control),
632 struct cmsghdr *cmsg;
635 assert(netns_storage_socket);
636 assert(netns_storage_socket[0] >= 0);
637 assert(netns_storage_socket[1] >= 0);
639 /* We use the passed socketpair as a storage buffer for our
640 * namespace reference fd. Whatever process runs this first
641 * shall create a new namespace, all others should just join
642 * it. To serialize that we use a file lock on the socket
645 * It's a bit crazy, but hey, works great! */
647 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
650 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
651 if (errno != EAGAIN) {
656 /* Nothing stored yet, so let's create a new namespace */
658 if (unshare(CLONE_NEWNET) < 0) {
665 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
673 /* Yay, found something, so let's join the namespace */
675 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
676 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
677 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
678 netns = *(int*) CMSG_DATA(cmsg);
682 if (setns(netns, CLONE_NEWNET) < 0) {
690 cmsg = CMSG_FIRSTHDR(&mh);
691 cmsg->cmsg_level = SOL_SOCKET;
692 cmsg->cmsg_type = SCM_RIGHTS;
693 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
694 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
695 mh.msg_controllen = cmsg->cmsg_len;
697 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
703 lockf(netns_storage_socket[0], F_ULOCK, 0);
708 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
709 [PROTECT_HOME_NO] = "no",
710 [PROTECT_HOME_YES] = "yes",
711 [PROTECT_HOME_READ_ONLY] = "read-only",
714 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
716 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
717 [PROTECT_SYSTEM_NO] = "no",
718 [PROTECT_SYSTEM_YES] = "yes",
719 [PROTECT_SYSTEM_FULL] = "full",
722 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);