1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Kind of mount requested by a BindMount entry. Only the head of the enum is
 * visible in this excerpt; the enumerators referenced elsewhere in this file
 * are INACCESSIBLE, READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP,
 * PRIVATE_DEV and PRIVATE_BUS_ENDPOINT. */
47 typedef enum MountMode {
48 /* This is ordered by priority! */
/* mount_path_compare() compares the numeric mode values of two entries whose
 * paths are equal, so the declaration order of the enumerators matters. */
/* One entry of the mount table processed in this file. The members are not
 * visible in this excerpt; the code below accesses ->path, ->mode and
 * ->ignore. */
58 typedef struct BindMount {
/* Walks the string vector strv and handles each path for the given mode,
 * writing through the cursor *p. Callers pass &m and afterwards assert that
 * the cursor advanced by the expected number of entries.
 *
 * NOTE(review): most of the body is missing from this excerpt; what is done
 * with each entry and which error codes are returned cannot be confirmed
 * from here. */
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
70 STRV_FOREACH(i, strv) {
/* A leading '-' is special-cased for these three modes. Presumably it marks
 * the entry as "ignore if missing" (see the ->ignore checks in apply_mount()
 * and make_read_only()) -- TODO confirm against the missing lines. */
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Relative paths get separate treatment; the branch body is not visible. */
80 if (!path_is_absolute(*i))
/* qsort() comparator over BindMount elements. Two cases are visible:
 *  - equal paths: the entries are ordered by comparing their mode values;
 *  - different paths: the prefix relationship between the two paths decides.
 *
 * NOTE(review): the return statements are missing from this excerpt, so the
 * sign returned by each branch cannot be confirmed from here. */
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
94 if (path_equal(p->path, q->path)) {
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
100 if (p->mode > q->mode)
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
110 if (path_startswith(q->path, p->path))
/* Scans the array m of *n entries with a read cursor (f) and a second cursor
 * (t) that starts at the same position, comparing each entry's path with that
 * of "previous". The caller sorts the array with mount_path_compare() first,
 * so entries with equal paths are adjacent.
 *
 * NOTE(review): the statements that advance t/previous and update *n are not
 * visible in this excerpt. */
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement /dev tree inside a temporary directory and then
 * moves it over "/dev/".
 *
 * Visible steps:
 *  1. mkdtemp() a scratch directory below /tmp.
 *  2. Mount a tmpfs on <scratch>/dev.
 *  3. Bind-mount the existing /dev/pts, /dev/shm, /dev/mqueue and
 *     /dev/hugepages into it, and create the ptmx and log symlinks.
 *  4. mknod() a copy of every entry of devnodes that is a block or character
 *     device.
 *  5. Call dev_setup() on the scratch directory and MS_MOVE the new tree onto
 *     "/dev/".
 *
 * NOTE(review): error handling, the cleanup path and the return statements
 * are mostly missing from this excerpt. The parameter m is not used in any
 * visible line. */
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
156 if (!mkdtemp(temporary_mount))
159 dev = strappenda(temporary_mount, "/dev");
160 (void)mkdir(dev, 0755);
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
166 devpts = strappenda(temporary_mount, "/dev/pts");
167 (void)mkdir(devpts, 0755);
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
/* Relative link target: ptmx resolves to the pts/ directory next to it.
 * The return value of symlink() is not checked. */
174 symlink("pts/ptmx", devptmx);
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 (void)mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 (void)mkdir(devmqueue, 0755);
/* The results of the mqueue and hugepages bind mounts are discarded. */
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
189 (void)mkdir(devhugepages, 0755);
190 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
192 devlog = strappenda(temporary_mount, "/dev/log");
193 symlink("/run/systemd/journal/dev-log", devlog);
195 NULSTR_FOREACH(d, devnodes) {
196 _cleanup_free_ char *dn = NULL;
/* Only block and character devices are candidates for copying; what happens
 * to other file types is in the missing branch body. */
209 if (!S_ISBLK(st.st_mode) &&
210 !S_ISCHR(st.st_mode)) {
/* d is appended directly to the scratch directory, so the devnodes entries
 * are presumably absolute paths -- TODO confirm against the (missing) list. */
218 dn = strappend(temporary_mount, d);
/* The mknod() call is bracketed by the SELinux file-creation helpers. */
224 mac_selinux_create_file_prepare(d, st.st_mode);
225 r = mknod(dn, st.st_mode, st.st_rdev);
226 mac_selinux_create_file_clear();
234 dev_setup(temporary_mount);
236 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
242 rmdir(temporary_mount);
/* NOTE(review): the lines below appear to belong to a failure/cleanup path;
 * its label and the remaining umount calls are not visible. */
254 umount(devhugepages);
261 rmdir(temporary_mount);
/* Sets up a private bus node for the endpoint at m->path.
 *
 * Visible steps:
 *  1. mkdtemp() a scratch directory and mount a tmpfs on <scratch>/kdbus.
 *  2. Create <scratch>/kdbus/bus as a device node with the same file type and
 *     device number as /dev/null, but with permission bits 0600.
 *  3. Bind-mount m->path over that node.
 *  4. MS_MOVE the tmpfs onto the directory that contains m->path.
 *
 * NOTE(review): the cleanup path and the return statements are mostly missing
 * from this excerpt. */
266 static int mount_kdbus(BindMount *m) {
268 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
269 _cleanup_free_ char *basepath = NULL;
270 _cleanup_umask_ mode_t u;
271 char *busnode = NULL, *root;
279 if (!mkdtemp(temporary_mount))
/* NOTE(review): the message below reads "Failed create", probably meant as
 * "Failed to create". */
280 return log_error_errno(errno, "Failed create temp dir: %m");
282 root = strappenda(temporary_mount, "/kdbus");
283 (void)mkdir(root, 0755);
284 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
289 /* create a new /dev/null dev node copy so we have some fodder to
290 * bind-mount the custom endpoint over. */
291 if (stat("/dev/null", &st) < 0) {
292 log_error_errno(errno, "Failed to stat /dev/null: %m");
297 busnode = strappenda(root, "/bus");
/* Keep the file-type bits of /dev/null, replace the permission bits by 0600. */
298 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
299 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
304 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
306 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
311 basepath = dirname_malloc(m->path);
/* NOTE(review): this is an MS_MOVE, yet the message logged on failure below
 * still says "bind mount". */
317 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
318 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
323 rmdir(temporary_mount);
334 rmdir(temporary_mount);
/* Applies a single mount-table entry. From the visible lines: an entry is
 * dispatched on its mode; PRIVATE_BUS_ENDPOINT is delegated to mount_kdbus(),
 * and the common tail recursively bind-mounts a source ("what") onto m->path.
 * The caller passes (m, tmp_dir, var_tmp_dir).
 *
 * NOTE(review): the first parameters, the switch header, several case labels
 * and the return statements are missing from this excerpt. */
339 static int apply_mount(
342 const char *var_tmp_dir) {
353 /* First, get rid of everything that is below if there
354 * is anything... Then, overmount it with an
355 * inaccessible directory. */
356 umount_recursive(m->path, 0);
358 what = "/run/systemd/inaccessible";
363 /* Nothing to mount here, we just later toggle the
364 * MS_RDONLY bit for the mount point */
371 case PRIVATE_VAR_TMP:
378 case PRIVATE_BUS_ENDPOINT:
379 return mount_kdbus(m);
382 assert_not_reached("Unknown mode");
/* Common tail: recursive bind mount of the selected source onto the target. */
387 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
389 log_debug("Successfully mounted %s to %s", what, m->path);
/* A missing path (ENOENT) is special-cased for entries flagged ->ignore; the
 * branch body is not visible here. */
390 else if (m->ignore && errno == ENOENT)
/* Second pass over a mount-table entry: calls bind_remount_recursive() on
 * m->path with true for INACCESSIBLE and READONLY entries and with false for
 * READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV entries. Judging by
 * the function name, the boolean selects read-only -- TODO confirm against
 * bind_remount_recursive(). Other modes are not touched by the visible lines.
 *
 * NOTE(review): the return statements are missing from this excerpt. */
396 static int make_read_only(BindMount *m) {
401 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
402 r = bind_remount_recursive(m->path, true);
403 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
404 r = bind_remount_recursive(m->path, false);
/* -ENOENT is special-cased for entries flagged ->ignore; the branch body is
 * not visible here. */
408 if (m->ignore && r == -ENOENT)
/* Parameter list and body of the namespace setup entry point. The line
 * carrying the function name is not part of this excerpt (presumably
 * setup_namespace() -- TODO confirm).
 *
 * Visible flow:
 *  1. unshare(CLONE_NEWNS).
 *  2. Count the required mount-table entries and allocate the table.
 *  3. Fill the table from the directory lists, the private tmp/dev/bus
 *     settings and the protect_home / protect_system policies.
 *  4. Sort the table and drop duplicate paths.
 *  5. Remount "/" as MS_SLAVE, apply every entry, then run make_read_only()
 *     on every entry.
 *  6. Remount "/" with the requested propagation flags.
 *
 * NOTE(review): error checks after the individual calls and the return
 * statements are missing from this excerpt. */
415 char** read_write_dirs,
416 char** read_only_dirs,
417 char** inaccessible_dirs,
419 const char* var_tmp_dir,
420 const char* bus_endpoint_path,
422 ProtectHome protect_home,
423 ProtectSystem protect_system,
424 unsigned mount_flags) {
426 BindMount *m, *mounts = NULL;
/* A mount_flags value of 0 is replaced by MS_SHARED. */
430 if (mount_flags == 0)
431 mount_flags = MS_SHARED;
433 if (unshare(CLONE_NEWNS) < 0)
/* The constants 3, 2 and 1 below correspond to the number of paths in the
 * STRV_MAKE() lists appended further down; the assert on "mounts + n == m"
 * checks that the count and the fill code agree. */
436 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
437 strv_length(read_write_dirs) +
438 strv_length(read_only_dirs) +
439 strv_length(inaccessible_dirs) +
441 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
442 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
443 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* The table comes from alloca0(), i.e. presumably zeroed stack memory, so it
 * needs no explicit free -- TODO confirm the alloca0() definition. */
446 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
447 r = append_mounts(&m, read_write_dirs, READWRITE);
451 r = append_mounts(&m, read_only_dirs, READONLY);
455 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
461 m->mode = PRIVATE_TMP;
466 m->path = "/var/tmp";
467 m->mode = PRIVATE_VAR_TMP;
473 m->mode = PRIVATE_DEV;
477 if (bus_endpoint_path) {
478 m->path = bus_endpoint_path;
479 m->mode = PRIVATE_BUS_ENDPOINT;
483 if (protect_home != PROTECT_HOME_NO) {
484 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
489 if (protect_system != PROTECT_SYSTEM_NO) {
490 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
495 assert(mounts + n == m);
/* Sorting puts entries with equal paths next to each other, which is what
 * drop_duplicates() relies on when it compares each entry with the previous
 * one. */
497 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
498 drop_duplicates(mounts, &n);
502 /* Remount / as SLAVE so that nothing now mounted in the namespace
503 shows up in the parent */
504 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two separate passes: first establish all mounts, then adjust the read-only
 * state of each one. */
507 for (m = mounts; m < mounts + n; ++m) {
508 r = apply_mount(m, tmp_dir, var_tmp_dir);
513 for (m = mounts; m < mounts + n; ++m) {
514 r = make_read_only(m);
520 /* Remount / as the desired mode. Note that this will not
521 * reestablish propagation from our side to the host, since
522 * what's disconnected is disconnected. */
523 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* NOTE(review): this loop appears to be a failure path that lazily detaches
 * the mounts; its label and any condition inside the loop are not visible. */
532 for (m = mounts; m < mounts + n; ++m)
534 umount2(m->path, MNT_DETACH);
/* Creates one private temporary directory below prefix. The directory name
 * template is "<prefix>/systemd-private-<boot id>-<id>-XXXXXX", and a "tmp"
 * subdirectory with mode 0777 plus the sticky bit is created inside it while
 * the umask is 0000.
 *
 * NOTE(review): the call that expands the XXXXXX template, the assignment to
 * *path and the return statements are missing from this excerpt. */
540 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
541 _cleanup_free_ char *x = NULL;
542 char bid[SD_ID128_STRING_MAX];
550 /* We include the boot id in the directory so that after a
551 * reboot we can easily identify obsolete directories. */
553 r = sd_id128_get_boot(&boot_id);
557 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* The umask is cleared for this block so the mode passed to mkdir() below is
 * applied unmodified. */
565 RUN_WITH_UMASK(0000) {
568 y = strappenda(x, "/tmp");
570 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Creates the pair of private temporary directories for id: one below /tmp
 * and one below /var/tmp, each via setup_one_tmp_dir().
 *
 * NOTE(review): the error handling between the two calls, the assignments to
 * *tmp_dir / *var_tmp_dir and the return statements are missing from this
 * excerpt; the strappenda() below presumably belongs to the cleanup of the
 * first directory when the second call fails -- TODO confirm. */
580 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
588 r = setup_one_tmp_dir(id, "/tmp", &a);
592 r = setup_one_tmp_dir(id, "/var/tmp", &b);
596 t = strappenda(a, "/tmp");
/* Creates or joins a shared network namespace, using the socket pair
 * netns_storage_socket as storage for the namespace file descriptor.
 *
 * Visible flow:
 *  1. Take a lockf() lock on netns_storage_socket[0].
 *  2. Try a non-blocking recvmsg() on netns_storage_socket[0].
 *     - EAGAIN (nothing stored): unshare(CLONE_NEWNET) and open
 *       /proc/self/ns/net to obtain a reference to the new namespace.
 *     - Otherwise: take the fd out of the SCM_RIGHTS control message and
 *       setns() into it.
 *  3. Send the fd back through netns_storage_socket[1] as SCM_RIGHTS.
 *  4. Release the lock.
 *
 * Both descriptors of the pair must be valid (>= 0); this is asserted.
 *
 * NOTE(review): the declaration of the control buffer and msghdr, the error
 * paths and the return statements are only partially visible in this
 * excerpt. */
610 int setup_netns(int netns_storage_socket[2]) {
611 _cleanup_close_ int netns = -1;
613 struct cmsghdr cmsghdr;
614 uint8_t buf[CMSG_SPACE(sizeof(int))];
617 .msg_control = &control,
618 .msg_controllen = sizeof(control),
620 struct cmsghdr *cmsg;
623 assert(netns_storage_socket);
624 assert(netns_storage_socket[0] >= 0);
625 assert(netns_storage_socket[1] >= 0);
627 /* We use the passed socketpair as a storage buffer for our
628 * namespace reference fd. Whatever process runs this first
629 * shall create a new namespace, all others should just join
630 * it. To serialize that we use a file lock on the socket
633 * It's a bit crazy, but hey, works great! */
635 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* MSG_DONTWAIT makes an empty queue show up as EAGAIN instead of blocking;
 * MSG_CMSG_CLOEXEC sets close-on-exec on a received fd. */
638 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
639 if (errno != EAGAIN) {
644 /* Nothing stored yet, so let's create a new namespace */
646 if (unshare(CLONE_NEWNET) < 0) {
653 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
661 /* Yay, found something, so let's join the namespace */
663 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
664 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
/* Exactly one file descriptor is expected in the control message. */
665 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
666 netns = *(int*) CMSG_DATA(cmsg);
670 if (setns(netns, CLONE_NEWNET) < 0) {
/* Put the namespace fd (back) into the socket so that it can be received by
 * whoever runs this function next. */
678 cmsg = CMSG_FIRSTHDR(&mh);
679 cmsg->cmsg_level = SOL_SOCKET;
680 cmsg->cmsg_type = SCM_RIGHTS;
681 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
682 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
683 mh.msg_controllen = cmsg->cmsg_len;
685 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* The result of the unlock call is not checked. */
691 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names of the ProtectHome values, indexed by enumerator:
 * "no", "yes" and "read-only". */
696 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
697 [PROTECT_HOME_NO] = "no",
698 [PROTECT_HOME_YES] = "yes",
699 [PROTECT_HOME_READ_ONLY] = "read-only",
/* Presumably expands to the string <-> enum lookup helpers for ProtectHome
 * based on the table above -- TODO confirm against the macro definition. */
702 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* String names of the ProtectSystem values, indexed by enumerator:
 * "no", "yes" and "full". */
704 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
705 [PROTECT_SYSTEM_NO] = "no",
706 [PROTECT_SYSTEM_YES] = "yes",
707 [PROTECT_SYSTEM_FULL] = "full",
/* Presumably expands to the string <-> enum lookup helpers for ProtectSystem
 * based on the table above -- TODO confirm against the macro definition. */
710 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);