1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
33 #include "path-util.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
/* Kinds of mount operation that can be requested for a path. The values
 * referenced in this file are INACCESSIBLE, READONLY, READWRITE,
 * PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV and PRIVATE_BUS_ENDPOINT.
 * mount_path_compare() relies on the numeric order of the enumerators to
 * order entries that refer to the same path. */
40 typedef enum MountMode {
41 /* This is ordered by priority! */
/* One entry in the list of mounts to establish. Code in this file reads
 * and writes the members path, mode and ignore. */
51 typedef struct BindMount {
/* Adds one BindMount entry per string in strv, tagged with the given mode,
 * to the array that *p points into. Callers pass the address of their
 * cursor and later assert that the cursor reached the end of the array, so
 * *p is presumably advanced past the entries written here (the update is
 * not visible in this excerpt -- confirm). Returns an int status that
 * callers store in r. */
58 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
63 STRV_FOREACH(i, strv) {
/* A leading '-' is only given special meaning for the three directory
 * modes. NOTE(review): presumably it marks the entry as ignorable when the
 * path does not exist (cf. the ignore member checked elsewhere) -- confirm. */
68 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Relative paths are singled out; presumably they are rejected -- confirm. */
73 if (!path_is_absolute(*i))
/* qsort() comparator for BindMount entries. The primary key is the path,
 * compared with path_compare(); entries with the same path are further
 * ordered by their mode value. */
84 static int mount_path_compare(const void *a, const void *b) {
85 const BindMount *p = a, *q = b;
88 d = path_compare(p->path, q->path);
91 /* If the paths are equal, check the mode */
92 if (p->mode < q->mode)
95 if (p->mode > q->mode)
101 /* If the paths are not equal, then order prefixes first */
/* Scans the array m of *n entries and detects entries whose path equals
 * the path of the entry tracked in previous. It is called right after the
 * array has been sorted, so equal paths are adjacent. n is passed by
 * pointer, presumably so the reduced element count can be written back
 * (not visible in this excerpt -- confirm). */
105 static void drop_duplicates(BindMount *m, unsigned *n) {
106 BindMount *f, *t, *previous;
/* f visits every entry; t presumably is the write position for entries
 * that are kept -- confirm. */
111 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
113 /* The first one wins */
114 if (previous && path_equal(f->path, previous->path))
/* Builds a replacement /dev hierarchy below a freshly created temporary
 * directory and then moves it over "/dev/". The replacement is a tmpfs that
 * receives bind mounts of /dev/pts, /dev/shm, /dev/mqueue and
 * /dev/hugepages, a ptmx and a log symlink, and copies of the device nodes
 * listed in devnodes. */
127 static int mount_dev(BindMount *m) {
128 static const char devnodes[] =
136 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
137 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
138 _cleanup_umask_ mode_t u;
/* mkdtemp() replaces the XXXXXX suffix of the template in place. */
145 if (!mkdtemp(temporary_mount))
/* The new tree lives in a tmpfs mounted at <temporary dir>/dev. */
148 dev = strjoina(temporary_mount, "/dev");
149 (void) mkdir(dev, 0755);
150 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
/* The pts bind mount and the ptmx symlink are checked for failure. */
155 devpts = strjoina(temporary_mount, "/dev/pts");
156 (void) mkdir(devpts, 0755);
157 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
162 devptmx = strjoina(temporary_mount, "/dev/ptmx");
163 if (symlink("pts/ptmx", devptmx) < 0) {
168 devshm = strjoina(temporary_mount, "/dev/shm");
169 (void) mkdir(devshm, 01777);
170 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* mqueue, hugepages and the log symlink are best effort: their results
 * are explicitly discarded. */
176 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
177 (void) mkdir(devmqueue, 0755);
178 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
180 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
181 (void) mkdir(devhugepages, 0755);
182 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
184 devlog = strjoina(temporary_mount, "/dev/log");
185 (void) symlink("/run/systemd/journal/dev-log", devlog);
/* Recreate each listed device node inside the temporary tree, reusing the
 * mode and device number held in st. NOTE(review): st is presumably filled
 * by a stat() of the original node on lines not shown here -- confirm. */
187 NULSTR_FOREACH(d, devnodes) {
188 _cleanup_free_ char *dn = NULL;
/* Only block and character devices are considered. */
201 if (!S_ISBLK(st.st_mode) &&
202 !S_ISCHR(st.st_mode)) {
/* d is appended verbatim to the temporary directory path, so the entries
 * of devnodes presumably are absolute paths below /dev -- confirm. */
210 dn = strappend(temporary_mount, d);
/* The SELinux file creation context is set up around the mknod() call
 * only. */
216 mac_selinux_create_file_prepare(d, st.st_mode);
217 r = mknod(dn, st.st_mode, st.st_rdev);
218 mac_selinux_create_file_clear();
226 dev_setup(temporary_mount);
/* Move the finished tree onto /dev/, then remove the now unused temporary
 * directory. */
228 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
234 rmdir(temporary_mount);
/* NOTE(review): the remaining statements look like the failure cleanup
 * path (unmount what was mounted, remove the directory) -- confirm against
 * the full source. */
246 umount(devhugepages);
253 rmdir(temporary_mount);
/* Sets up a private bus endpoint: creates a tmpfs below a temporary
 * directory, creates a device node named "bus" in it as a bind mount
 * target, bind-mounts m->path onto that node and finally moves the tmpfs
 * over the directory that contains m->path. */
258 static int mount_kdbus(BindMount *m) {
260 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
261 _cleanup_free_ char *basepath = NULL;
262 _cleanup_umask_ mode_t u;
263 char *busnode = NULL, *root;
271 if (!mkdtemp(temporary_mount))
/* NOTE(review): the message below reads "Failed create", the word "to"
 * seems to be missing. */
272 return log_error_errno(errno, "Failed create temp dir: %m");
274 root = strjoina(temporary_mount, "/kdbus");
275 (void) mkdir(root, 0755);
276 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
281 /* create a new /dev/null dev node copy so we have some fodder to
282 * bind-mount the custom endpoint over. */
283 if (stat("/dev/null", &st) < 0) {
284 log_error_errno(errno, "Failed to stat /dev/null: %m");
289 busnode = strjoina(root, "/bus");
/* Keep the file type bits of /dev/null but replace all permission bits
 * with 0600. */
290 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
291 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
296 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
298 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
/* The tmpfs is moved over the parent directory of m->path. */
303 basepath = dirname_malloc(m->path);
309 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
/* NOTE(review): this is the failure branch of an MS_MOVE, yet the message
 * says "bind mount" -- the wording looks copied from the call above. */
310 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
315 rmdir(temporary_mount);
/* NOTE(review): presumably the second rmdir() belongs to the failure
 * cleanup path -- confirm against the full source. */
326 rmdir(temporary_mount);
/* Establishes the mount described by one BindMount entry. The caller
 * invokes it as apply_mount(m, tmp_dir, var_tmp_dir). Depending on the
 * mode a source path ("what") is selected and recursively bind-mounted
 * onto m->path; PRIVATE_BUS_ENDPOINT is delegated to mount_kdbus(). */
331 static int apply_mount(
334 const char *var_tmp_dir) {
345 /* First, get rid of everything that is below if there
346 * is anything... Then, overmount it with an
347 * inaccessible directory. */
348 umount_recursive(m->path, 0);
350 what = "/run/systemd/inaccessible";
355 /* Nothing to mount here, we just later toggle the
356 * MS_RDONLY bit for the mount point */
363 case PRIVATE_VAR_TMP:
370 case PRIVATE_BUS_ENDPOINT:
371 return mount_kdbus(m);
374 assert_not_reached("Unknown mode");
/* Recursive bind mount of the selected source onto the target path. */
379 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
381 log_debug("Successfully mounted %s to %s", what, m->path);
382 else if (m->ignore && errno == ENOENT)
/* Second pass over a BindMount entry: recursively remounts m->path with
 * the read-only flag set for INACCESSIBLE and READONLY entries, and with it
 * cleared for READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV
 * entries. Other modes are not remounted by the visible code. */
388 static int make_read_only(BindMount *m) {
393 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
394 r = bind_remount_recursive(m->path, true);
395 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
396 r = bind_remount_recursive(m->path, false);
/* -ENOENT gets special treatment for entries flagged as ignorable;
 * presumably it is not reported as an error -- confirm. */
400 if (m->ignore && r == -ENOENT)
/* Parameter list and body of the mount namespace setup routine (its
 * opening line is not part of this excerpt). It unshares the mount
 * namespace, collects all requested mounts in one array, sorts and
 * de-duplicates them, applies them, adjusts their read-only state and
 * finally sets the propagation mode of "/" to mount_flags. */
407 char** read_write_dirs,
408 char** read_only_dirs,
409 char** inaccessible_dirs,
411 const char* var_tmp_dir,
412 const char* bus_endpoint_path,
414 ProtectHome protect_home,
415 ProtectSystem protect_system,
416 unsigned long mount_flags) {
418 BindMount *m, *mounts = NULL;
/* Without explicit flags, shared propagation is used for the final
 * remount of "/". */
422 if (mount_flags == 0)
423 mount_flags = MS_SHARED;
425 if (unshare(CLONE_NEWNS) < 0)
/* Number of array slots needed. The constants match the fixed lists used
 * further down: three paths for protect_home, two for protect_system plus
 * one more ("/etc") in PROTECT_SYSTEM_FULL mode. */
428 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
429 strv_length(read_write_dirs) +
430 strv_length(read_only_dirs) +
431 strv_length(inaccessible_dirs) +
433 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
434 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
435 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* NOTE(review): the array is allocated with alloca0(), i.e. its size is
 * driven by the lengths of the caller supplied lists. */
438 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
439 r = append_mounts(&m, read_write_dirs, READWRITE);
443 r = append_mounts(&m, read_only_dirs, READONLY);
447 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
453 m->mode = PRIVATE_TMP;
458 m->path = "/var/tmp";
459 m->mode = PRIVATE_VAR_TMP;
465 m->mode = PRIVATE_DEV;
469 if (bus_endpoint_path) {
470 m->path = bus_endpoint_path;
471 m->mode = PRIVATE_BUS_ENDPOINT;
/* All three home paths carry the '-' prefix that append_mounts() treats
 * specially. */
475 if (protect_home != PROTECT_HOME_NO) {
476 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
481 if (protect_system != PROTECT_SYSTEM_NO) {
482 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
/* Every slot counted above must have been filled. */
487 assert(mounts + n == m);
/* Sort so that entries for the same path become adjacent, then drop the
 * duplicates; n is passed by address to drop_duplicates(). */
489 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
490 drop_duplicates(mounts, &n);
494 /* Remount / as SLAVE so that nothing now mounted in the namespace
495 shows up in the parent */
496 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two passes: first establish every mount, then adjust the read-only state
 * of each mount point. */
499 for (m = mounts; m < mounts + n; ++m) {
500 r = apply_mount(m, tmp_dir, var_tmp_dir);
505 for (m = mounts; m < mounts + n; ++m) {
506 r = make_read_only(m);
512 /* Remount / as the desired mode. Note that this will not
513 * reestablish propagation from our side to the host, since
514 * what's disconnected is disconnected. */
515 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* NOTE(review): this loop looks like the failure path, lazily detaching
 * the mounts again -- confirm against the full source. */
524 for (m = mounts; m < mounts + n; ++m)
526 (void) umount2(m->path, MNT_DETACH);
/* Creates one private temporary directory below prefix. The directory name
 * template is "<prefix>/systemd-private-<boot id>-<id>-XXXXXX", and a "tmp"
 * subdirectory is created inside it. The result is presumably handed back
 * through *path (the assignment is not visible in this excerpt --
 * confirm). */
532 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
533 _cleanup_free_ char *x = NULL;
534 char bid[SD_ID128_STRING_MAX];
542 /* We include the boot id in the directory so that after a
543 * reboot we can easily identify obsolete directories. */
545 r = sd_id128_get_boot(&boot_id);
549 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* The umask is cleared for this section so that the mode passed to mkdir()
 * below is applied unmodified. */
557 RUN_WITH_UMASK(0000) {
560 y = strjoina(x, "/tmp");
/* World-writable with the sticky bit set. */
562 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp, by calling setup_one_tmp_dir() for each. The
 * results are presumably returned through *tmp_dir and *var_tmp_dir (not
 * visible in this excerpt -- confirm). */
572 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
580 r = setup_one_tmp_dir(id, "/tmp", &a);
584 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* NOTE(review): the "tmp" subdirectory of the first result is addressed
 * here, presumably to undo its creation when the second call failed --
 * confirm. */
588 t = strjoina(a, "/tmp");
/* Creates or joins a shared network namespace. The socket pair passed in
 * is used to store a file descriptor referring to the namespace: if no
 * descriptor can be received, a new namespace is unshared and a descriptor
 * for /proc/self/ns/net is opened; otherwise the received descriptor is
 * used with setns(). The descriptor is then sent into the socket pair
 * again. The sequence is bracketed by lockf() on netns_storage_socket[0]. */
602 int setup_netns(int netns_storage_socket[2]) {
603 _cleanup_close_ int netns = -1;
605 struct cmsghdr cmsghdr;
/* Control buffer sized for exactly one file descriptor. */
606 uint8_t buf[CMSG_SPACE(sizeof(int))];
609 .msg_control = &control,
610 .msg_controllen = sizeof(control),
612 struct cmsghdr *cmsg;
615 assert(netns_storage_socket);
616 assert(netns_storage_socket[0] >= 0);
617 assert(netns_storage_socket[1] >= 0);
619 /* We use the passed socketpair as a storage buffer for our
620 * namespace reference fd. Whatever process runs this first
621 * shall create a new namespace, all others should just join
622 * it. To serialize that we use a file lock on the socket
625 * It's a bit crazy, but hey, works great! */
627 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking receive: EAGAIN is the only errno that is not treated as a
 * failure and leads to the "create" branch below. */
630 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
631 if (errno != EAGAIN) {
636 /* Nothing stored yet, so let's create a new namespace */
638 if (unshare(CLONE_NEWNET) < 0) {
645 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
653 /* Yay, found something, so let's join the namespace */
/* Pick the descriptor out of the SCM_RIGHTS control message. */
655 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
656 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
657 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
658 netns = *(int*) CMSG_DATA(cmsg);
662 if (setns(netns, CLONE_NEWNET) < 0) {
/* Put the descriptor into the socket pair again as a single SCM_RIGHTS
 * message. */
670 cmsg = CMSG_FIRSTHDR(&mh);
671 cmsg->cmsg_level = SOL_SOCKET;
672 cmsg->cmsg_type = SCM_RIGHTS;
673 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
674 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
675 mh.msg_controllen = cmsg->cmsg_len;
677 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* The result of the unlock call is not checked. */
683 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* Names of the ProtectHome values, indexed by enumerator. The macro below
 * presumably generates the string <-> ProtectHome conversion helpers from
 * this table -- confirm against its definition. */
688 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
689 [PROTECT_HOME_NO] = "no",
690 [PROTECT_HOME_YES] = "yes",
691 [PROTECT_HOME_READ_ONLY] = "read-only",
694 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* Names of the ProtectSystem values, indexed by enumerator. The macro
 * below presumably generates the string <-> ProtectSystem conversion
 * helpers from this table -- confirm against its definition. */
696 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
697 [PROTECT_SYSTEM_NO] = "no",
698 [PROTECT_SYSTEM_YES] = "yes",
699 [PROTECT_SYSTEM_FULL] = "full",
702 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);