1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Kinds of mount handled in this file; READWRITE, READONLY, INACCESSIBLE,
 * PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV are the values used below.
 * mount_path_compare() uses the mode as the tie-breaker when two entries
 * name the same path. */
46 typedef enum MountMode {
47 /* This is ordered by priority! */
/* One mount to set up. The code below reads its path, mode and ignore
 * members; ignore is consulted together with ENOENT when a mount call
 * fails. */
56 typedef struct BindMount {
/* Walks the strings in strv on behalf of the mount list cursor *p, using
 * the given mode for all of them. Callers pass the address of their fill
 * cursor and later assert that it advanced by the expected count. */
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
68 STRV_FOREACH(i, strv) {
/* A leading '-' is only recognized for these three modes. */
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Paths that are not absolute are singled out here. */
77 if (!path_is_absolute(*i))
/* Comparison callback passed to qsort() for the BindMount array. Entries
 * with equal paths are ordered by their mode; for different paths the
 * prefix relationship between the two is checked in both directions. */
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
91 if (path_equal(p->path, q->path)) {
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
97 if (p->mode > q->mode)
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
107 if (path_startswith(q->path, p->path))
/* Drops entries with a repeated path from the array m, which holds *n
 * elements. An entry is only compared against "previous", so equal paths
 * have to be adjacent; the caller runs qsort() with mount_path_compare()
 * right before calling this. */
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
/* f scans every element; t and previous both start out relative to the
 * beginning of the array (presumably write position and last kept
 * entry -- TODO confirm). */
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
/* Builds a replacement /dev: a tmpfs is mounted below a temporary
 * directory, populated with bind mounts of selected /dev subtrees, a
 * /dev/log symlink and a list of device nodes, and is finally moved onto
 * /dev/. */
136 static int mount_dev(BindMount *m) {
/* Device node list walked with NULSTR_FOREACH further down. */
137 static const char devnodes[] =
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL;
147 _cleanup_umask_ mode_t u;
/* mkdtemp() fills in the XXXXXX part of the template and creates the
 * directory. */
154 if (!mkdtemp(temporary_mount))
157 dev = strappenda(temporary_mount, "/dev");
/* The new /dev is a tmpfs with mode 755, mounted nosuid. */
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
164 devpts = strappenda(temporary_mount, "/dev/pts");
/* The result of the /dev/pts bind mount is checked. */
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
171 devshm = strappenda(temporary_mount, "/dev/shm");
/* Mount point is world-writable with the sticky bit; the mount result is
 * kept in r. */
172 mkdir(devshm, 01777);
173 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* The return values of the following mkdir()/mount() pairs and of the
 * symlink() are not looked at, so a failure here does not stop the
 * setup. */
179 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
180 mkdir(devmqueue, 0755);
181 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
183 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
184 mkdir(devkdbus, 0755);
185 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
187 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
188 mkdir(devhugepages, 0755);
189 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
191 devlog = strappenda(temporary_mount, "/dev/log");
192 symlink("/run/systemd/journal/dev-log", devlog);
/* Recreate the listed nodes below the temporary directory with the same
 * mode and device number (st presumably holds the stat() result of the
 * existing node -- TODO confirm). */
194 NULSTR_FOREACH(d, devnodes) {
195 _cleanup_free_ char *dn = NULL;
/* Entries that are neither block nor character devices take this
 * branch. */
208 if (!S_ISBLK(st.st_mode) &&
209 !S_ISCHR(st.st_mode)) {
217 dn = strappend(temporary_mount, d);
223 r = mknod(dn, st.st_mode, st.st_rdev);
230 dev_setup(temporary_mount);
/* Move the finished tree over /dev/, then remove the now unused
 * temporary directory. */
232 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
238 rmdir(temporary_mount);
/* NOTE(review): the unmount and second rmdir() below look like cleanup
 * for the failure case -- confirm against the surrounding control
 * flow. */
253 umount(devhugepages);
263 rmdir(temporary_mount);
/* Performs the bind mount for a single entry: a source ("what") is picked
 * according to the entry's mode and then recursively bind-mounted onto
 * m->path. */
268 static int apply_mount(
271 const char *var_tmp_dir) {
284 what = "/run/systemd/inaccessible";
296 case PRIVATE_VAR_TMP:
301 assert_not_reached("Unknown mode");
/* MS_REC: submounts below the source are included in the bind mount. */
306 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
308 log_debug("Successfully mounted %s to %s", what, m->path);
/* ENOENT gets its own branch when the entry is marked ignore. */
309 else if (m->ignore && errno == ENOENT)
/* Remounts m->path read-only via a recursive bind remount. The remount
 * concerns entries whose mode is INACCESSIBLE or READONLY; all other
 * modes take the branch of the first check (presumably an early
 * return -- TODO confirm). */
315 static int make_read_only(BindMount *m) {
320 if (m->mode != INACCESSIBLE && m->mode != READONLY)
323 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
/* A failure counts unless it is ENOENT on an entry marked ignore. */
324 if (r < 0 && !(m->ignore && errno == ENOENT))
331 char** read_write_dirs,
332 char** read_only_dirs,
333 char** inaccessible_dirs,
337 ProtectedHome protected_home,
338 bool read_only_system,
339 unsigned mount_flags) {
341 BindMount *m, *mounts = NULL;
/* No propagation flags given: fall back to MS_SHARED. */
345 if (mount_flags == 0)
346 mount_flags = MS_SHARED;
/* Switch to a mount namespace of our own before touching any mounts. */
348 if (unshare(CLONE_NEWNS) < 0)
/* Number of entries the array below has to hold. The "2"s match the
 * two-element path lists appended further down for protected_home and
 * read_only_system. */
351 n = !!tmp_dir + !!var_tmp_dir +
352 strv_length(read_write_dirs) +
353 strv_length(read_only_dirs) +
354 strv_length(inaccessible_dirs) +
356 (protected_home != PROTECTED_HOME_NO ? 2 : 0) +
357 (read_only_system ? 2 : 0);
/* The array is allocated with alloca(); m is the fill cursor that is
 * handed to append_mounts() and advanced as entries are added. */
360 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
361 r = append_mounts(&m, read_write_dirs, READWRITE);
365 r = append_mounts(&m, read_only_dirs, READONLY);
369 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
375 m->mode = PRIVATE_TMP;
380 m->path = "/var/tmp";
381 m->mode = PRIVATE_VAR_TMP;
387 m->mode = PRIVATE_DEV;
/* Both paths carry the '-' prefix; the mode depends on whether home is
 * protected read-only or made inaccessible. */
391 if (protected_home != PROTECTED_HOME_NO) {
392 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protected_home == PROTECTED_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
397 if (read_only_system) {
398 r = append_mounts(&m, STRV_MAKE("/usr", "-/boot"), READONLY);
/* Every slot counted above must have been filled by now. */
403 assert(mounts + n == m);
/* Sort the entries, then drop those that repeat a path. */
405 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
406 drop_duplicates(mounts, &n);
410 /* Remount / as SLAVE so that nothing now mounted in the namespace
411 shows up in the parent */
412 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* Two passes over the list: the first establishes the bind mounts, the
 * second applies the read-only remounts. */
415 for (m = mounts; m < mounts + n; ++m) {
416 r = apply_mount(m, tmp_dir, var_tmp_dir);
421 for (m = mounts; m < mounts + n; ++m) {
422 r = make_read_only(m);
428 /* Remount / as the desired mode. Note that this will not
429 * reestablish propagation from our side to the host, since
430 * what's disconnected is disconnected. */
431 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* Lazily detach (MNT_DETACH) each path in the list. NOTE(review): this
 * looks like the rollback after a failure above -- confirm. */
440 for (m = mounts; m < mounts + n; ++m)
442 umount2(m->path, MNT_DETACH);
/* Sets up one private temporary directory tree below prefix. The
 * directory name is built from the template
 * "<prefix>/systemd-private-<boot id>-<id>-XXXXXX", and a "tmp"
 * subdirectory is created inside it. */
448 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
449 _cleanup_free_ char *x = NULL;
450 char bid[SD_ID128_STRING_MAX];
458 /* We include the boot id in the directory so that after a
459 * reboot we can easily identify obsolete directories. */
461 r = sd_id128_get_boot(&boot_id);
/* bid is the buffer the boot id is formatted into. */
465 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* With a umask of 0000 the mode passed to mkdir() below is not
 * reduced. */
473 RUN_WITH_UMASK(0000) {
476 y = strappenda(x, "/tmp");
/* World-writable with the sticky bit set. */
478 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Sets up the private directories for /tmp and /var/tmp by calling
 * setup_one_tmp_dir() once per prefix with the same id. */
488 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
496 r = setup_one_tmp_dir(id, "/tmp", &a);
500 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* Path of the "tmp" subdirectory inside the first directory. */
504 t = strappenda(a, "/tmp");
/* Places the calling process in a network namespace shared through the
 * given socketpair: if no namespace fd is stored in the socketpair yet, a
 * new namespace is created, otherwise the stored one is joined. The
 * namespace fd is passed through the socketpair as an SCM_RIGHTS
 * message. */
518 int setup_netns(int netns_storage_socket[2]) {
519 _cleanup_close_ int netns = -1;
/* Control-message storage sized for a single int, i.e. one fd. */
521 struct cmsghdr cmsghdr;
522 uint8_t buf[CMSG_SPACE(sizeof(int))];
525 .msg_control = &control,
526 .msg_controllen = sizeof(control),
528 struct cmsghdr *cmsg;
531 assert(netns_storage_socket);
532 assert(netns_storage_socket[0] >= 0);
533 assert(netns_storage_socket[1] >= 0);
535 /* We use the passed socketpair as a storage buffer for our
536 * namespace reference fd. Whatever process runs this first
537 * shall create a new namespace, all others should just join
538 * it. To serialize that we use a file lock on the socket
541 * It's a bit crazy, but hey, works great! */
543 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking receive: EAGAIN means no fd is stored yet; any other
 * errno is handled separately. */
546 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
547 if (errno != EAGAIN) {
552 /* Nothing stored yet, so let's create a new namespace */
554 if (unshare(CLONE_NEWNET) < 0) {
/* Open a reference fd to the network namespace we are in now. */
561 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
569 /* Yay, found something, so let's join the namespace */
/* Pick the fd out of the SCM_RIGHTS control message; exactly one int is
 * expected. */
571 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
572 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
573 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
574 netns = *(int*) CMSG_DATA(cmsg);
578 if (setns(netns, CLONE_NEWNET) < 0) {
/* Queue the namespace fd in the socketpair as an SCM_RIGHTS message. */
586 cmsg = CMSG_FIRSTHDR(&mh);
587 cmsg->cmsg_level = SOL_SOCKET;
588 cmsg->cmsg_type = SCM_RIGHTS;
589 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
590 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
/* Only the one message filled in above is sent. */
591 mh.msg_controllen = cmsg->cmsg_len;
593 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken at the top. */
599 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names for the ProtectedHome values, indexed by enum value. */
604 static const char *const protected_home_table[_PROTECTED_HOME_MAX] = {
605 [PROTECTED_HOME_NO] = "no",
606 [PROTECTED_HOME_YES] = "yes",
607 [PROTECTED_HOME_READ_ONLY] = "read-only",
/* NOTE(review): presumably expands to the string <-> enum conversion
 * helpers for ProtectedHome built on the table above -- confirm against
 * the macro definition. */
610 DEFINE_STRING_TABLE_LOOKUP(protected_home, ProtectedHome);