1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Kind of mount to apply to a path. The enumerators themselves are not
 * visible in this excerpt; other visible code refers to INACCESSIBLE,
 * READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV.
 * The numeric order matters: mount_path_compare() compares modes with
 * '<' and '>' when two entries have the same path. */
46 typedef enum MountMode {
47 /* This is ordered by priority! */
/* One entry of the mount table. The member list is not visible in this
 * excerpt; visible users access ->path, ->mode and ->ignore. */
56 typedef struct BindMount {
/* Iterates over the string vector `strv` and handles each entry for the
 * given `mode`. NOTE(review): presumably one BindMount is written per
 * entry through the `*p` cursor -- the lines doing so are not visible in
 * this excerpt; confirm against the full body. */
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
68 STRV_FOREACH(i, strv) {
/* For the three directory modes a leading '-' on the entry is special.
 * NOTE(review): presumably it marks the path as optional (the `ignore`
 * flag tested elsewhere) -- the branch body is not visible here. */
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Non-absolute entries take a separate branch (body not visible here). */
77 if (!path_is_absolute(*i))
/* qsort()-style comparison of two BindMount entries; it is the callback
 * handed to qsort() where the mount table is sorted.
 *
 * Entries with equal paths are ordered by their mode. Entries with
 * different paths are ordered by prefix relationship, so that a path
 * sorts relative to the paths nested below it. The concrete return
 * values of each branch are not visible in this excerpt. */
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
91 if (path_equal(p->path, q->path)) {
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
97 if (p->mode > q->mode)
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
107 if (path_startswith(q->path, p->path))
/* Scans the *n entries starting at `m` and detects entries whose path
 * equals that of the previously seen entry; per the comment below, the
 * first of such a run wins.
 *
 * Only adjacent entries are compared, so equal paths must already be
 * next to each other; the visible caller sorts the table with
 * mount_path_compare() immediately before calling this.
 *
 * NOTE(review): `t` looks like the write cursor used to compact the array
 * and `*n` is presumably updated to the new count -- neither is visible
 * in this excerpt. */
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
/* Assembles a replacement /dev tree below a temporary directory and then
 * moves it over "/dev/".
 *
 * Visible steps: create the temporary directory with mkdtemp(), mount a
 * tmpfs at <tmp>/dev, bind-mount or symlink a fixed set of entries into
 * it (pts, ptmx, shm, mqueue, kdbus, hugepages, log), create device nodes
 * for the names listed in `devnodes`, run dev_setup(), and finally
 * MS_MOVE the tree to "/dev/".
 *
 * Several lines of the body, including all error branches, are not
 * visible in this excerpt. */
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
147 _cleanup_umask_ mode_t u;
/* mkdtemp() rewrites the XXXXXX suffix of the template in place, which
 * is why temporary_mount is a writable array and not a string literal. */
154 if (!mkdtemp(temporary_mount))
157 dev = strappenda(temporary_mount, "/dev");
/* Fresh tmpfs that becomes the root of the new /dev tree. */
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
164 devpts = strappenda(temporary_mount, "/dev/pts");
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
/* ptmx is a relative symlink into the bind-mounted pts directory.
 * NOTE(review): the symlink() result is not checked in the visible
 * lines. */
171 devptmx = strappenda(temporary_mount, "/dev/ptmx");
172 symlink("pts/ptmx", devptmx);
/* shm: the mkdir() result is discarded, the bind mount result is kept
 * in `r` (how `r` is handled is not visible here). */
174 devshm = strappenda(temporary_mount, "/dev/shm");
175 mkdir(devshm, 01777);
176 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* mqueue, kdbus and hugepages: in the visible lines neither the mkdir()
 * nor the mount() results are checked.
 * NOTE(review): this looks like deliberate best-effort for entries that
 * may not exist on the host -- confirm before adding error handling. */
182 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
183 mkdir(devmqueue, 0755);
184 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
186 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
187 mkdir(devkdbus, 0755);
188 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
190 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
191 mkdir(devhugepages, 0755);
192 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
/* /dev/log becomes a symlink to the journal socket path; the symlink()
 * result is not checked in the visible lines. */
194 devlog = strappenda(temporary_mount, "/dev/log");
195 symlink("/run/systemd/journal/dev-log", devlog);
/* For every name in the NUL-separated `devnodes` list: only block and
 * character devices are considered (see the S_ISBLK/S_ISCHR test), and
 * a node with the same mode and device number is created below the
 * temporary directory. `st` is presumably filled by a stat() of the
 * host node on a line not visible here. `dn` is heap-allocated by
 * strappend() and released automatically via _cleanup_free_. */
197 NULSTR_FOREACH(d, devnodes) {
198 _cleanup_free_ char *dn = NULL;
211 if (!S_ISBLK(st.st_mode) &&
212 !S_ISCHR(st.st_mode)) {
220 dn = strappend(temporary_mount, d);
226 r = mknod(dn, st.st_mode, st.st_rdev);
233 dev_setup(temporary_mount);
/* Atomically relocate the assembled tree onto the real mount point. */
235 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
241 rmdir(temporary_mount);
/* NOTE(review): the lines from here on look like the failure-path
 * cleanup (unmount what was set up, then remove the temporary
 * directory); the surrounding label and the other umount calls are not
 * visible in this excerpt. */
256 umount(devhugepages);
266 rmdir(temporary_mount);
/* Establishes the mount for one table entry `m` by selecting a source
 * (`what`) according to m->mode and recursively bind-mounting it onto
 * m->path.
 *
 * Visible behavior: for the inaccessible case everything below m->path is
 * unmounted first and the source is "/run/systemd/inaccessible"; one case
 * mounts nothing here because only the read-only bit is toggled later;
 * an unknown mode trips assert_not_reached(). If the bind mount fails
 * with ENOENT and m->ignore is set, a dedicated branch handles it (its
 * body is not visible here).
 *
 * Only the last parameter, `var_tmp_dir`, is visible in this excerpt; the
 * caller passes (m, tmp_dir, var_tmp_dir). */
271 static int apply_mount(
274 const char *var_tmp_dir) {
285 /* First, get rid of everything that is below if there
286 * is anything... Then, overmount it with an
287 * inaccessible directory. */
288 umount_recursive(m->path, 0);
290 what = "/run/systemd/inaccessible";
295 /* Nothing to mount here, we just later toggle the
296 * MS_RDONLY bit for the mount point */
303 case PRIVATE_VAR_TMP:
311 assert_not_reached("Unknown mode");
/* MS_REC makes the bind mount include submounts of `what`. */
316 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
318 log_debug("Successfully mounted %s to %s", what, m->path);
319 else if (m->ignore && errno == ENOENT)
/* Sets the read-only state of the mount at m->path according to m->mode:
 * INACCESSIBLE and READONLY entries are remounted with the second
 * argument of bind_remount_recursive() set to true, READWRITE entries
 * with false. Other modes do not reach either call in the visible lines.
 *
 * A result of -ENOENT combined with m->ignore is handled by a dedicated
 * branch (its body is not visible in this excerpt). */
325 static int make_read_only(BindMount *m) {
330 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
331 r = bind_remount_recursive(m->path, true);
332 else if (m->mode == READWRITE)
333 r = bind_remount_recursive(m->path, false);
337 if (m->ignore && r == -ENOENT)
/* Parameter list and body of the function that builds the mount table and
 * applies it inside a new mount namespace.
 * NOTE(review): presumably setup_namespace() -- the line carrying the
 * function name, and some parameters (tmp_dir and var_tmp_dir are used
 * below), are not visible in this excerpt.
 *
 * Visible flow: default the propagation flags, unshare the mount
 * namespace, size and fill a BindMount table from the three directory
 * lists plus the protect_home/protect_system settings, sort and
 * de-duplicate it, detach propagation from the parent, apply every
 * mount, apply the read-only state, and finally set the requested
 * propagation mode on "/". */
344 char** read_write_dirs,
345 char** read_only_dirs,
346 char** inaccessible_dirs,
350 ProtectHome protect_home,
351 ProtectSystem protect_system,
352 unsigned mount_flags) {
354 BindMount *m, *mounts = NULL;
/* No explicit propagation mode requested: fall back to MS_SHARED. */
358 if (mount_flags == 0)
359 mount_flags = MS_SHARED;
361 if (unshare(CLONE_NEWNS) < 0)
/* Number of table slots needed. The per-setting constants must match the
 * number of paths appended further down: three for protect_home, two for
 * protect_system plus one more for PROTECT_SYSTEM_FULL. The assert after
 * the appends checks that exactly n slots were consumed. */
364 n = !!tmp_dir + !!var_tmp_dir +
365 strv_length(read_write_dirs) +
366 strv_length(read_only_dirs) +
367 strv_length(inaccessible_dirs) +
369 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
370 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
371 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
/* NOTE(review): the table lives on the stack via alloca(), and n grows
 * with the length of the caller-supplied string vectors -- confirm the
 * callers bound those lengths. */
374 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
375 r = append_mounts(&m, read_write_dirs, READWRITE);
379 r = append_mounts(&m, read_only_dirs, READONLY);
383 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
389 m->mode = PRIVATE_TMP;
394 m->path = "/var/tmp";
395 m->mode = PRIVATE_VAR_TMP;
401 m->mode = PRIVATE_DEV;
/* The built-in paths with a leading '-' use the same '-' prefix that
 * append_mounts() special-cases. */
405 if (protect_home != PROTECT_HOME_NO) {
406 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
411 if (protect_system != PROTECT_SYSTEM_NO) {
412 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
/* The write cursor must have advanced over exactly n entries. */
417 assert(mounts + n == m);
/* Sort first so that entries with equal paths become adjacent, then drop
 * the duplicates; n is passed by pointer to drop_duplicates(). */
419 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
420 drop_duplicates(mounts, &n);
424 /* Remount / as SLAVE so that nothing now mounted in the namespace
425 shows up in the parent */
426 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two separate passes: establish all mounts first, then apply the
 * read-only state to each entry. */
429 for (m = mounts; m < mounts + n; ++m) {
430 r = apply_mount(m, tmp_dir, var_tmp_dir);
435 for (m = mounts; m < mounts + n; ++m) {
436 r = make_read_only(m);
442 /* Remount / as the desired mode. Note that this will not
443 * reestablish propagation from our side to the host, since
444 * what's disconnected is disconnected. */
445 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* NOTE(review): this loop looks like the failure path, lazily detaching
 * (MNT_DETACH) the table's mount points; a condition line between the
 * `for` and the umount2() call is not visible in this excerpt. */
454 for (m = mounts; m < mounts + n; ++m)
456 umount2(m->path, MNT_DETACH);
/* Prepares one private temporary directory below `prefix`.
 *
 * The directory name template is
 *   <prefix>/systemd-private-<boot id>-<id>-XXXXXX
 * and a "tmp" subdirectory is created inside it with mode 0777 plus the
 * sticky bit, under a umask of 0000 so that the mode is applied as given.
 *
 * NOTE(review): the XXXXXX suffix suggests the directory itself is created
 * with mkdtemp(), and the result is presumably handed back through
 * `*path` -- neither line is visible in this excerpt. */
462 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
463 _cleanup_free_ char *x = NULL;
464 char bid[SD_ID128_STRING_MAX];
472 /* We include the boot id in the directory so that after a
473 * reboot we can easily identify obsolete directories. */
475 r = sd_id128_get_boot(&boot_id);
/* `bid` is the caller-provided buffer for the textual boot id. */
479 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
487 RUN_WITH_UMASK(0000) {
490 y = strappenda(x, "/tmp");
492 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Sets up the pair of private temporary directories for `id`: one below
 * "/tmp" and one below "/var/tmp", each via setup_one_tmp_dir().
 *
 * NOTE(review): the results are presumably returned through `*tmp_dir`
 * and `*var_tmp_dir`, and the path built from `a` + "/tmp" looks like it
 * is used to undo the first directory when the second setup fails --
 * those lines are not visible in this excerpt. */
502 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
510 r = setup_one_tmp_dir(id, "/tmp", &a);
514 r = setup_one_tmp_dir(id, "/var/tmp", &b);
518 t = strappenda(a, "/tmp");
/* Joins, or creates and publishes, a shared network namespace, using the
 * socket pair `netns_storage_socket` as a one-slot store for a namespace
 * file descriptor.
 *
 * Visible flow: take a file lock on socket [0]; try a non-blocking receive
 * on it. If nothing is queued (EAGAIN), unshare a new network namespace
 * and open /proc/self/ns/net to obtain its fd. Otherwise extract the fd
 * from the SCM_RIGHTS control message and setns() into it. The fd is then
 * sent as SCM_RIGHTS on socket [1], and the lock is released.
 *
 * `netns` is closed automatically on return (_cleanup_close_). Both
 * socket fds must be valid (asserted). Error branches and the return
 * value are not visible in this excerpt. */
532 int setup_netns(int netns_storage_socket[2]) {
533 _cleanup_close_ int netns = -1;
535 struct cmsghdr cmsghdr;
536 uint8_t buf[CMSG_SPACE(sizeof(int))];
539 .msg_control = &control,
540 .msg_controllen = sizeof(control),
542 struct cmsghdr *cmsg;
545 assert(netns_storage_socket);
546 assert(netns_storage_socket[0] >= 0);
547 assert(netns_storage_socket[1] >= 0);
549 /* We use the passed socketpair as a storage buffer for our
550 * namespace reference fd. Whatever process runs this first
551 * shall create a new namespace, all others should just join
552 * it. To serialize that we use a file lock on the socket
555 * It's a bit crazy, but hey, works great! */
/* Serializes concurrent callers; released by the F_ULOCK call below. */
557 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* MSG_DONTWAIT: an empty store shows up as EAGAIN instead of blocking.
 * MSG_CMSG_CLOEXEC: a received fd is marked close-on-exec. */
560 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
561 if (errno != EAGAIN) {
566 /* Nothing stored yet, so let's create a new namespace */
568 if (unshare(CLONE_NEWNET) < 0) {
575 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
583 /* Yay, found something, so let's join the namespace */
/* NOTE(review): the received cmsg_len is validated only by assert(), and
 * the fd is read through a cast pointer, whereas the send side below
 * uses memcpy() -- consider whether both should be hardened. */
585 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
586 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
587 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
588 netns = *(int*) CMSG_DATA(cmsg);
592 if (setns(netns, CLONE_NEWNET) < 0) {
/* Build a single SCM_RIGHTS control message carrying `netns` and queue it
 * on the other end of the pair so a later receive on [0] finds it. */
600 cmsg = CMSG_FIRSTHDR(&mh);
601 cmsg->cmsg_level = SOL_SOCKET;
602 cmsg->cmsg_type = SCM_RIGHTS;
603 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
604 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
605 mh.msg_controllen = cmsg->cmsg_len;
607 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
613 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* Textual names of the ProtectHome values, indexed by enumerator. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below is given this table's
 * name prefix and the enum type (NOTE(review): presumably it generates
 * the string <-> enum conversion helpers; the macro is defined outside
 * this excerpt). */
618 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
619 [PROTECT_HOME_NO] = "no",
620 [PROTECT_HOME_YES] = "yes",
621 [PROTECT_HOME_READ_ONLY] = "read-only",
624 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* Textual names of the ProtectSystem values, indexed by enumerator. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below is given this table's
 * name prefix and the enum type (NOTE(review): presumably it generates
 * the string <-> enum conversion helpers; the macro is defined outside
 * this excerpt). */
626 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
627 [PROTECT_SYSTEM_NO] = "no",
628 [PROTECT_SYSTEM_YES] = "yes",
629 [PROTECT_SYSTEM_FULL] = "full",
632 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);