1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Kind of treatment applied to one mount entry. The values used in this
 * file are INACCESSIBLE, READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP,
 * PRIVATE_DEV and PRIVATE_BUS_ENDPOINT. mount_path_compare() compares modes
 * with < and >, so the declaration order of the members is significant. */
47 typedef enum MountMode {
48 /* This is ordered by priority! */
/* One entry of the mount list built for the namespace. Code in this file
 * accesses its path, mode and ignore members. */
58 typedef struct BindMount {
/* Iterates over the string list strv and processes each entry for the given
 * mode; the caller passes the address of its BindMount cursor in p.
 *
 * For INACCESSIBLE, READONLY and READWRITE a leading dash on an entry is
 * detected and handled specially (presumably it marks the entry as ignorable
 * when the path is missing -- TODO confirm against the full body). Each entry
 * is also checked with path_is_absolute(). */
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
70 STRV_FOREACH(i, strv) {
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
80 if (!path_is_absolute(*i))
/* Comparison callback with the qsort() signature; a and b point to BindMount
 * entries. Entries with equal paths are ordered by mode; otherwise an entry
 * whose path is a prefix relative of the other is ordered according to the
 * prefix relationship (see the inline comments). */
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
94 if (path_equal(p->path, q->path)) {
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
100 if (p->mode > q->mode)
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
110 if (path_startswith(q->path, p->path))
/* Scans the array m of *n entries and detects entries whose path equals the
 * path of the previously kept entry. The comparison is only against the
 * previous entry, so equal paths must be adjacent in the array
 * (presumably guaranteed by sorting beforehand -- verify against caller). */
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
/* f is the read position; t and previous start at m and NULL respectively. */
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement device directory inside a temporary directory
 * below /tmp and then moves it onto /dev/ with MS_MOVE.
 *
 * Visible steps: a tmpfs is mounted at <tmp>/dev; /dev/pts, /dev/shm,
 * /dev/mqueue and /dev/hugepages are bind-mounted into it; ptmx and log are
 * created as symlinks; the entries of devnodes are recreated with mknod();
 * dev_setup() is called on the temporary directory. */
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
/* Template consumed by mkdtemp() below. */
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
156 if (!mkdtemp(temporary_mount))
159 dev = strappenda(temporary_mount, "/dev");
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
166 devpts = strappenda(temporary_mount, "/dev/pts");
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
/* ptmx is a relative symlink into the bind-mounted pts directory.
 * The return value of symlink() is not checked here. */
174 symlink("pts/ptmx", devptmx);
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* For mqueue and hugepages the results of mkdir() and mount() are not
 * checked on these lines. */
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
189 mkdir(devhugepages, 0755);
190 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
192 devlog = strappenda(temporary_mount, "/dev/log");
193 symlink("/run/systemd/journal/dev-log", devlog);
/* Recreate each node named in devnodes below the temporary directory. */
195 NULSTR_FOREACH(d, devnodes) {
196 _cleanup_free_ char *dn = NULL;
/* Entries that are neither block nor character devices take a separate
 * branch. NOTE(review): st is presumably filled by a stat() of d on a line
 * not visible here -- confirm. */
209 if (!S_ISBLK(st.st_mode) &&
210 !S_ISCHR(st.st_mode)) {
218 dn = strappend(temporary_mount, d);
/* The new node copies mode and device number from st; the mknod() call is
 * bracketed by the mac_selinux_create_file_prepare() and
 * mac_selinux_create_file_clear() calls. */
224 mac_selinux_create_file_prepare(d, st.st_mode);
225 r = mknod(dn, st.st_mode, st.st_rdev);
226 mac_selinux_create_file_clear();
234 dev_setup(temporary_mount);
/* Move the assembled tmpfs over the real device directory. */
236 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
242 rmdir(temporary_mount);
/* NOTE(review): the umount() and second rmdir() below presumably belong to
 * a failure cleanup path -- confirm against the full body. */
254 umount(devhugepages);
261 rmdir(temporary_mount);
/* Sets up a bus endpoint for the entry m: a tmpfs is mounted at
 * <tmp>/kdbus inside a fresh temporary directory, a device node named bus is
 * created in it, m->path is bind-mounted over that node, and finally the
 * tmpfs is moved (MS_MOVE) onto the directory obtained from
 * dirname_malloc(m->path). */
266 static int mount_kdbus(BindMount *m) {
268 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
269 _cleanup_free_ char *basepath = NULL;
270 _cleanup_umask_ mode_t u;
271 char *busnode = NULL, *root;
279 if (!mkdtemp(temporary_mount)) {
/* NOTE(review): message text reads "Failed create"; probably meant
 * "Failed to create". Left unchanged because it is a runtime string. */
280 log_error("Failed create temp dir: %m");
284 root = strappenda(temporary_mount, "/kdbus");
286 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
291 /* create a new /dev/null dev node copy so we have some fodder to
292 * bind-mount the custom endpoint over. */
293 if (stat("/dev/null", &st) < 0) {
294 log_error("Failed to stat /dev/null: %m");
299 busnode = strappenda(root, "/bus");
/* Keep the non-permission bits of the /dev/null mode and its device number,
 * but replace the low 07777 bits with 0600. */
300 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
301 log_error("mknod() for %s failed: %m", busnode);
306 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
308 log_error("bind mount of %s failed: %m", m->path);
313 basepath = dirname_malloc(m->path);
/* NOTE(review): this is an MS_MOVE, yet the failure message below calls it
 * a bind mount. */
319 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
320 log_error("bind mount of %s failed: %m", basepath);
325 rmdir(temporary_mount);
336 rmdir(temporary_mount);
/* Establishes the mount for a single entry m. Depending on the mode a source
 * path is chosen in the variable what, which is then bind-mounted
 * recursively (MS_BIND|MS_REC) onto m->path. PRIVATE_BUS_ENDPOINT entries
 * are delegated to mount_kdbus() instead. */
341 static int apply_mount(
344 const char *var_tmp_dir) {
355 /* First, get rid of everything that is below if there
356 * is anything... Then, overmount it with an
357 * inaccessible directory. */
358 umount_recursive(m->path, 0);
360 what = "/run/systemd/inaccessible";
365 /* Nothing to mount here, we just later toggle the
366 * MS_RDONLY bit for the mount point */
373 case PRIVATE_VAR_TMP:
380 case PRIVATE_BUS_ENDPOINT:
381 return mount_kdbus(m);
384 assert_not_reached("Unknown mode");
389 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
391 log_debug("Successfully mounted %s to %s", what, m->path);
/* A failure with ENOENT on an entry flagged ignore takes its own branch. */
392 else if (m->ignore && errno == ENOENT)
/* Adjusts the read-only state of one entry via bind_remount_recursive():
 * INACCESSIBLE and READONLY entries pass true, READWRITE, PRIVATE_TMP,
 * PRIVATE_VAR_TMP and PRIVATE_DEV entries pass false. Modes in neither set
 * (for example PRIVATE_BUS_ENDPOINT) are not remounted by these lines. */
398 static int make_read_only(BindMount *m) {
403 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
404 r = bind_remount_recursive(m->path, true);
405 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
406 r = bind_remount_recursive(m->path, false);
/* A -ENOENT result on an entry flagged ignore is handled separately. */
410 if (m->ignore && r == -ENOENT)
417 char** read_write_dirs,
418 char** read_only_dirs,
419 char** inaccessible_dirs,
421 const char* var_tmp_dir,
422 const char* bus_endpoint_path,
424 ProtectHome protect_home,
425 ProtectSystem protect_system,
426 unsigned mount_flags) {
428 BindMount *m, *mounts = NULL;
/* A mount_flags value of zero is replaced by MS_SHARED. */
432 if (mount_flags == 0)
433 mount_flags = MS_SHARED;
/* Enter a new mount namespace before touching any mounts. */
435 if (unshare(CLONE_NEWNS) < 0)
/* Number of BindMount entries needed: one per non-NULL special path, one per
 * listed directory, and a fixed count for the protect_home and
 * protect_system string lists appended below (3, and 2 or 3 entries). */
438 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
439 strv_length(read_write_dirs) +
440 strv_length(read_only_dirs) +
441 strv_length(inaccessible_dirs) +
443 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
444 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
445 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* mounts stays at the start of the array; m is the cursor used for filling. */
448 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
449 r = append_mounts(&m, read_write_dirs, READWRITE);
453 r = append_mounts(&m, read_only_dirs, READONLY);
457 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
463 m->mode = PRIVATE_TMP;
468 m->path = "/var/tmp";
469 m->mode = PRIVATE_VAR_TMP;
475 m->mode = PRIVATE_DEV;
479 if (bus_endpoint_path) {
480 m->path = bus_endpoint_path;
481 m->mode = PRIVATE_BUS_ENDPOINT;
/* protect_home covers three fixed paths, read-only or inaccessible depending
 * on the setting. */
485 if (protect_home != PROTECT_HOME_NO) {
486 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
/* protect_system makes /usr and /boot read-only; PROTECT_SYSTEM_FULL adds
 * /etc. */
491 if (protect_system != PROTECT_SYSTEM_NO) {
492 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
/* The cursor must have advanced by exactly the precomputed count. */
497 assert(mounts + n == m);
/* Sort with mount_path_compare(), then let drop_duplicates() process the
 * sorted array; n is passed by pointer. */
499 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
500 drop_duplicates(mounts, &n);
504 /* Remount / as SLAVE so that nothing now mounted in the namespace
505 shows up in the parent */
506 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two passes over the list: first establish every mount, then apply the
 * read-only adjustments. */
509 for (m = mounts; m < mounts + n; ++m) {
510 r = apply_mount(m, tmp_dir, var_tmp_dir);
515 for (m = mounts; m < mounts + n; ++m) {
516 r = make_read_only(m);
522 /* Remount / as the desired mode. Note that this will not
523 * reestablish propagation from our side to the host, since
524 * what's disconnected is disconnected. */
525 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* NOTE(review): this loop lazily detaches every listed path; presumably it
 * is the failure cleanup path -- confirm against the full body. */
534 for (m = mounts; m < mounts + n; ++m)
536 umount2(m->path, MNT_DETACH);
/* Builds a directory name of the form
 * <prefix>/systemd-private-<boot id>-<id>-XXXXXX and, with the umask set to
 * 0000, creates a tmp subdirectory inside it with mode 0777 plus the sticky
 * bit. The path parameter is an output pointer (its assignment is not
 * visible in these lines). */
542 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
543 _cleanup_free_ char *x = NULL;
544 char bid[SD_ID128_STRING_MAX];
552 /* We include the boot id in the directory so that after a
553 * reboot we can easily identify obsolete directories. */
555 r = sd_id128_get_boot(&boot_id);
559 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* Creation happens with umask 0000 so the requested mode bits are applied
 * unmodified. */
567 RUN_WITH_UMASK(0000) {
570 y = strappenda(x, "/tmp");
572 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Creates the two private temporary directories for the given id by calling
 * setup_one_tmp_dir() once with the prefix /tmp and once with /var/tmp.
 * tmp_dir and var_tmp_dir are output pointers (their assignment is not
 * visible in these lines). */
582 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
590 r = setup_one_tmp_dir(id, "/tmp", &a);
594 r = setup_one_tmp_dir(id, "/var/tmp", &b);
598 t = strappenda(a, "/tmp");
/* Joins or creates a network namespace that is shared through the socket
 * pair netns_storage_socket.
 *
 * Under a lockf() lock on netns_storage_socket[0] the function tries a
 * non-blocking recvmsg(). If nothing is queued (EAGAIN) it unshares a new
 * network namespace and opens /proc/self/ns/net; otherwise it takes the file
 * descriptor from the SCM_RIGHTS control message and calls setns() on it.
 * The descriptor is then sent on netns_storage_socket[1] and the lock is
 * released. */
612 int setup_netns(int netns_storage_socket[2]) {
613 _cleanup_close_ int netns = -1;
/* Control buffer sized for a single int file descriptor. */
615 struct cmsghdr cmsghdr;
616 uint8_t buf[CMSG_SPACE(sizeof(int))];
619 .msg_control = &control,
620 .msg_controllen = sizeof(control),
622 struct cmsghdr *cmsg;
625 assert(netns_storage_socket);
626 assert(netns_storage_socket[0] >= 0);
627 assert(netns_storage_socket[1] >= 0);
629 /* We use the passed socketpair as a storage buffer for our
630 * namespace reference fd. Whatever process runs this first
631 * shall create a new namespace, all others should just join
632 * it. To serialize that we use a file lock on the socket
635 * It's a bit crazy, but hey, works great! */
637 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* MSG_DONTWAIT: an empty queue shows up as EAGAIN instead of blocking. */
640 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
641 if (errno != EAGAIN) {
646 /* Nothing stored yet, so let's create a new namespace */
648 if (unshare(CLONE_NEWNET) < 0) {
655 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
663 /* Yay, found something, so let's join the namespace */
665 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
666 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
667 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
668 netns = *(int*) CMSG_DATA(cmsg);
672 if (setns(netns, CLONE_NEWNET) < 0) {
/* Build an SCM_RIGHTS message carrying the namespace fd and send it on
 * netns_storage_socket[1]. */
680 cmsg = CMSG_FIRSTHDR(&mh);
681 cmsg->cmsg_level = SOL_SOCKET;
682 cmsg->cmsg_type = SCM_RIGHTS;
683 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
684 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
685 mh.msg_controllen = cmsg->cmsg_len;
687 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken at the top; the return value is not checked. */
693 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* Maps each ProtectHome value to its textual name. The table is consumed by
 * DEFINE_STRING_TABLE_LOOKUP (presumably generating the string conversion
 * helpers for protect_home -- confirm against the macro definition). */
698 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
699 [PROTECT_HOME_NO] = "no",
700 [PROTECT_HOME_YES] = "yes",
701 [PROTECT_HOME_READ_ONLY] = "read-only",
704 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* Maps each ProtectSystem value to its textual name. The table is consumed
 * by DEFINE_STRING_TABLE_LOOKUP (presumably generating the string conversion
 * helpers for protect_system -- confirm against the macro definition). */
706 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
707 [PROTECT_SYSTEM_NO] = "no",
708 [PROTECT_SYSTEM_YES] = "yes",
709 [PROTECT_SYSTEM_FULL] = "full",
712 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);