1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
40 #include "loopback-setup.h"
42 #include "dev-setup.h"
45 #include "selinux-util.h"
46 #include "namespace.h"
48 typedef enum MountMode {
49 /* This is ordered by priority! */
59 typedef struct BindMount {
66 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
71 STRV_FOREACH(i, strv) {
76 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
81 if (!path_is_absolute(*i))
92 static int mount_path_compare(const void *a, const void *b) {
93 const BindMount *p = a, *q = b;
95 if (path_equal(p->path, q->path)) {
97 /* If the paths are equal, check the mode */
98 if (p->mode < q->mode)
101 if (p->mode > q->mode)
107 /* If the paths are not equal, then order prefixes first */
108 if (path_startswith(p->path, q->path))
111 if (path_startswith(q->path, p->path))
117 static void drop_duplicates(BindMount *m, unsigned *n) {
118 BindMount *f, *t, *previous;
123 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
125 /* The first one wins */
126 if (previous && path_equal(f->path, previous->path))
139 static int mount_dev(BindMount *m) {
140 static const char devnodes[] =
148 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
149 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
150 _cleanup_umask_ mode_t u;
157 if (!mkdtemp(temporary_mount))
160 dev = strjoina(temporary_mount, "/dev");
161 (void)mkdir(dev, 0755);
162 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
167 devpts = strjoina(temporary_mount, "/dev/pts");
168 (void)mkdir(devpts, 0755);
169 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
174 devptmx = strjoina(temporary_mount, "/dev/ptmx");
175 symlink("pts/ptmx", devptmx);
177 devshm = strjoina(temporary_mount, "/dev/shm");
178 (void)mkdir(devshm, 01777);
179 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
185 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
186 (void)mkdir(devmqueue, 0755);
187 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
189 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
190 (void)mkdir(devhugepages, 0755);
191 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
193 devlog = strjoina(temporary_mount, "/dev/log");
194 symlink("/run/systemd/journal/dev-log", devlog);
196 NULSTR_FOREACH(d, devnodes) {
197 _cleanup_free_ char *dn = NULL;
210 if (!S_ISBLK(st.st_mode) &&
211 !S_ISCHR(st.st_mode)) {
219 dn = strappend(temporary_mount, d);
225 mac_selinux_create_file_prepare(d, st.st_mode);
226 r = mknod(dn, st.st_mode, st.st_rdev);
227 mac_selinux_create_file_clear();
235 dev_setup(temporary_mount);
237 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
243 rmdir(temporary_mount);
255 umount(devhugepages);
262 rmdir(temporary_mount);
267 static int mount_kdbus(BindMount *m) {
269 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
270 _cleanup_free_ char *basepath = NULL;
271 _cleanup_umask_ mode_t u;
272 char *busnode = NULL, *root;
280 if (!mkdtemp(temporary_mount))
281 return log_error_errno(errno, "Failed create temp dir: %m");
283 root = strjoina(temporary_mount, "/kdbus");
284 (void)mkdir(root, 0755);
285 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
290 /* create a new /dev/null dev node copy so we have some fodder to
291 * bind-mount the custom endpoint over. */
292 if (stat("/dev/null", &st) < 0) {
293 log_error_errno(errno, "Failed to stat /dev/null: %m");
298 busnode = strjoina(root, "/bus");
299 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
300 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
305 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
307 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
312 basepath = dirname_malloc(m->path);
318 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
319 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
324 rmdir(temporary_mount);
335 rmdir(temporary_mount);
340 static int apply_mount(
343 const char *var_tmp_dir) {
354 /* First, get rid of everything that is below if there
355 * is anything... Then, overmount it with an
356 * inaccessible directory. */
357 umount_recursive(m->path, 0);
359 what = "/run/systemd/inaccessible";
364 /* Nothing to mount here, we just later toggle the
365 * MS_RDONLY bit for the mount point */
372 case PRIVATE_VAR_TMP:
379 case PRIVATE_BUS_ENDPOINT:
380 return mount_kdbus(m);
383 assert_not_reached("Unknown mode");
388 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
390 log_debug("Successfully mounted %s to %s", what, m->path);
391 else if (m->ignore && errno == ENOENT)
397 static int make_read_only(BindMount *m) {
402 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
403 r = bind_remount_recursive(m->path, true);
404 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
405 r = bind_remount_recursive(m->path, false);
409 if (m->ignore && r == -ENOENT)
416 char** read_write_dirs,
417 char** read_only_dirs,
418 char** inaccessible_dirs,
420 const char* var_tmp_dir,
421 const char* bus_endpoint_path,
423 ProtectHome protect_home,
424 ProtectSystem protect_system,
425 unsigned long mount_flags) {
427 BindMount *m, *mounts = NULL;
431 if (mount_flags == 0)
432 mount_flags = MS_SHARED;
434 if (unshare(CLONE_NEWNS) < 0)
437 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
438 strv_length(read_write_dirs) +
439 strv_length(read_only_dirs) +
440 strv_length(inaccessible_dirs) +
442 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
443 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
444 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
447 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
448 r = append_mounts(&m, read_write_dirs, READWRITE);
452 r = append_mounts(&m, read_only_dirs, READONLY);
456 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
462 m->mode = PRIVATE_TMP;
467 m->path = "/var/tmp";
468 m->mode = PRIVATE_VAR_TMP;
474 m->mode = PRIVATE_DEV;
478 if (bus_endpoint_path) {
479 m->path = bus_endpoint_path;
480 m->mode = PRIVATE_BUS_ENDPOINT;
484 if (protect_home != PROTECT_HOME_NO) {
485 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
490 if (protect_system != PROTECT_SYSTEM_NO) {
491 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
496 assert(mounts + n == m);
498 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
499 drop_duplicates(mounts, &n);
503 /* Remount / as SLAVE so that nothing now mounted in the namespace
504 shows up in the parent */
505 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
508 for (m = mounts; m < mounts + n; ++m) {
509 r = apply_mount(m, tmp_dir, var_tmp_dir);
514 for (m = mounts; m < mounts + n; ++m) {
515 r = make_read_only(m);
521 /* Remount / as the desired mode. Note that this will not
522 * reestablish propagation from our side to the host, since
523 * what's disconnected is disconnected. */
524 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
533 for (m = mounts; m < mounts + n; ++m)
535 umount2(m->path, MNT_DETACH);
541 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
542 _cleanup_free_ char *x = NULL;
543 char bid[SD_ID128_STRING_MAX];
551 /* We include the boot id in the directory so that after a
552 * reboot we can easily identify obsolete directories. */
554 r = sd_id128_get_boot(&boot_id);
558 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
566 RUN_WITH_UMASK(0000) {
569 y = strjoina(x, "/tmp");
571 if (mkdir(y, 0777 | S_ISVTX) < 0)
581 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
589 r = setup_one_tmp_dir(id, "/tmp", &a);
593 r = setup_one_tmp_dir(id, "/var/tmp", &b);
597 t = strjoina(a, "/tmp");
611 int setup_netns(int netns_storage_socket[2]) {
612 _cleanup_close_ int netns = -1;
614 struct cmsghdr cmsghdr;
615 uint8_t buf[CMSG_SPACE(sizeof(int))];
618 .msg_control = &control,
619 .msg_controllen = sizeof(control),
621 struct cmsghdr *cmsg;
624 assert(netns_storage_socket);
625 assert(netns_storage_socket[0] >= 0);
626 assert(netns_storage_socket[1] >= 0);
628 /* We use the passed socketpair as a storage buffer for our
629 * namespace reference fd. Whatever process runs this first
630 * shall create a new namespace, all others should just join
631 * it. To serialize that we use a file lock on the socket
634 * It's a bit crazy, but hey, works great! */
636 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
639 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
640 if (errno != EAGAIN) {
645 /* Nothing stored yet, so let's create a new namespace */
647 if (unshare(CLONE_NEWNET) < 0) {
654 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
662 /* Yay, found something, so let's join the namespace */
664 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
665 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
666 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
667 netns = *(int*) CMSG_DATA(cmsg);
671 if (setns(netns, CLONE_NEWNET) < 0) {
679 cmsg = CMSG_FIRSTHDR(&mh);
680 cmsg->cmsg_level = SOL_SOCKET;
681 cmsg->cmsg_type = SCM_RIGHTS;
682 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
683 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
684 mh.msg_controllen = cmsg->cmsg_len;
686 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
692 lockf(netns_storage_socket[0], F_ULOCK, 0);
697 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
698 [PROTECT_HOME_NO] = "no",
699 [PROTECT_HOME_YES] = "yes",
700 [PROTECT_HOME_READ_ONLY] = "read-only",
703 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
705 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
706 [PROTECT_SYSTEM_NO] = "no",
707 [PROTECT_SYSTEM_YES] = "yes",
708 [PROTECT_SYSTEM_FULL] = "full",
711 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);