1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
47 typedef enum MountMode {
48 /* This is ordered by priority! */
57 typedef struct BindMount {
64 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
69 STRV_FOREACH(i, strv) {
74 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
79 if (!path_is_absolute(*i))
90 static int mount_path_compare(const void *a, const void *b) {
91 const BindMount *p = a, *q = b;
93 if (path_equal(p->path, q->path)) {
95 /* If the paths are equal, check the mode */
96 if (p->mode < q->mode)
99 if (p->mode > q->mode)
105 /* If the paths are not equal, then order prefixes first */
106 if (path_startswith(p->path, q->path))
109 if (path_startswith(q->path, p->path))
115 static void drop_duplicates(BindMount *m, unsigned *n) {
116 BindMount *f, *t, *previous;
121 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
123 /* The first one wins */
124 if (previous && path_equal(f->path, previous->path))
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
156 if (!mkdtemp(temporary_mount))
159 dev = strappenda(temporary_mount, "/dev");
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
166 devpts = strappenda(temporary_mount, "/dev/pts");
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
196 devlog = strappenda(temporary_mount, "/dev/log");
197 symlink("/run/systemd/journal/dev-log", devlog);
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
222 dn = strappend(temporary_mount, d);
228 label_context_set(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 label_context_clear();
238 dev_setup(temporary_mount);
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
246 rmdir(temporary_mount);
261 umount(devhugepages);
271 rmdir(temporary_mount);
276 static int apply_mount(
279 const char *var_tmp_dir) {
290 /* First, get rid of everything that is below if there
291 * is anything... Then, overmount it with an
292 * inaccessible directory. */
293 umount_recursive(m->path, 0);
295 what = "/run/systemd/inaccessible";
300 /* Nothing to mount here, we just later toggle the
301 * MS_RDONLY bit for the mount point */
308 case PRIVATE_VAR_TMP:
316 assert_not_reached("Unknown mode");
321 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
323 log_debug("Successfully mounted %s to %s", what, m->path);
324 else if (m->ignore && errno == ENOENT)
330 static int make_read_only(BindMount *m) {
335 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
336 r = bind_remount_recursive(m->path, true);
337 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
338 r = bind_remount_recursive(m->path, false);
342 if (m->ignore && r == -ENOENT)
349 char** read_write_dirs,
350 char** read_only_dirs,
351 char** inaccessible_dirs,
355 ProtectHome protect_home,
356 ProtectSystem protect_system,
357 unsigned mount_flags) {
359 BindMount *m, *mounts = NULL;
363 if (mount_flags == 0)
364 mount_flags = MS_SHARED;
366 if (unshare(CLONE_NEWNS) < 0)
369 n = !!tmp_dir + !!var_tmp_dir +
370 strv_length(read_write_dirs) +
371 strv_length(read_only_dirs) +
372 strv_length(inaccessible_dirs) +
374 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
375 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
376 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
379 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
380 r = append_mounts(&m, read_write_dirs, READWRITE);
384 r = append_mounts(&m, read_only_dirs, READONLY);
388 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
394 m->mode = PRIVATE_TMP;
399 m->path = "/var/tmp";
400 m->mode = PRIVATE_VAR_TMP;
406 m->mode = PRIVATE_DEV;
410 if (protect_home != PROTECT_HOME_NO) {
411 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
416 if (protect_system != PROTECT_SYSTEM_NO) {
417 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
422 assert(mounts + n == m);
424 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
425 drop_duplicates(mounts, &n);
429 /* Remount / as SLAVE so that nothing now mounted in the namespace
430 shows up in the parent */
431 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
434 for (m = mounts; m < mounts + n; ++m) {
435 r = apply_mount(m, tmp_dir, var_tmp_dir);
440 for (m = mounts; m < mounts + n; ++m) {
441 r = make_read_only(m);
447 /* Remount / as the desired mode. Note that this will not
448 * reestablish propagation from our side to the host, since
449 * what's disconnected is disconnected. */
450 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
459 for (m = mounts; m < mounts + n; ++m)
461 umount2(m->path, MNT_DETACH);
467 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
468 _cleanup_free_ char *x = NULL;
469 char bid[SD_ID128_STRING_MAX];
477 /* We include the boot id in the directory so that after a
478 * reboot we can easily identify obsolete directories. */
480 r = sd_id128_get_boot(&boot_id);
484 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
492 RUN_WITH_UMASK(0000) {
495 y = strappenda(x, "/tmp");
497 if (mkdir(y, 0777 | S_ISVTX) < 0)
507 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
515 r = setup_one_tmp_dir(id, "/tmp", &a);
519 r = setup_one_tmp_dir(id, "/var/tmp", &b);
523 t = strappenda(a, "/tmp");
537 int setup_netns(int netns_storage_socket[2]) {
538 _cleanup_close_ int netns = -1;
540 struct cmsghdr cmsghdr;
541 uint8_t buf[CMSG_SPACE(sizeof(int))];
544 .msg_control = &control,
545 .msg_controllen = sizeof(control),
547 struct cmsghdr *cmsg;
550 assert(netns_storage_socket);
551 assert(netns_storage_socket[0] >= 0);
552 assert(netns_storage_socket[1] >= 0);
554 /* We use the passed socketpair as a storage buffer for our
555 * namespace reference fd. Whatever process runs this first
556 * shall create a new namespace, all others should just join
557 * it. To serialize that we use a file lock on the socket
560 * It's a bit crazy, but hey, works great! */
562 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
565 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
566 if (errno != EAGAIN) {
571 /* Nothing stored yet, so let's create a new namespace */
573 if (unshare(CLONE_NEWNET) < 0) {
580 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
588 /* Yay, found something, so let's join the namespace */
590 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
591 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
592 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
593 netns = *(int*) CMSG_DATA(cmsg);
597 if (setns(netns, CLONE_NEWNET) < 0) {
605 cmsg = CMSG_FIRSTHDR(&mh);
606 cmsg->cmsg_level = SOL_SOCKET;
607 cmsg->cmsg_type = SCM_RIGHTS;
608 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
609 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
610 mh.msg_controllen = cmsg->cmsg_len;
612 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
618 lockf(netns_storage_socket[0], F_ULOCK, 0);
623 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
624 [PROTECT_HOME_NO] = "no",
625 [PROTECT_HOME_YES] = "yes",
626 [PROTECT_HOME_READ_ONLY] = "read-only",
629 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
631 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
632 [PROTECT_SYSTEM_NO] = "no",
633 [PROTECT_SYSTEM_YES] = "yes",
634 [PROTECT_SYSTEM_FULL] = "full",
637 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);