1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
/* Kind of mount entry handled in this file. The enumerator list is not
 * visible in this excerpt; the values referenced by the visible code are
 * INACCESSIBLE, READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and
 * PRIVATE_DEV. mount_path_compare() compares these values numerically when
 * two entries have the same path. */
46 typedef enum MountMode {
47 /* This is ordered by priority! */
/* One mount entry to be applied in the new namespace. The member list is
 * not visible in this excerpt; the code in this file accesses the members
 * path, mode and ignore. */
56 typedef struct BindMount {
/* Walks the string list strv for the given mode, working on the entry
 * cursor passed via p. Callers in this file pass the address of a cursor
 * into a preallocated BindMount array and afterwards assert that the cursor
 * reached the end of that array.
 * NOTE(review): most of the body (how entries are filled in, how *p is
 * advanced, the return values) is not visible in this excerpt. */
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
68 STRV_FOREACH(i, strv) {
/* A leading '-' on the string is special-cased for these three modes
 * (branch body not visible; presumably it sets the entry's ignore flag
 * and skips the '-' -- confirm against the full source). */
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Non-absolute paths take the branch below (body not visible;
 * presumably they are rejected -- confirm). */
77 if (!path_is_absolute(*i))
/* Comparator over two BindMount entries; it is the comparison function
 * handed to qsort() in this file. Entries with equal paths are ordered by
 * their mode value; otherwise the prefix relationship between the two paths
 * decides.
 * NOTE(review): the return statements are not visible in this excerpt, so
 * the exact sign returned in each branch should be confirmed. */
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
91 if (path_equal(p->path, q->path)) {
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
97 if (p->mode > q->mode)
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
107 if (path_startswith(q->path, p->path))
/* Scans the array m of *n entries and compares each entry's path with the
 * previously kept one; as the comment below states, the first entry for a
 * given path wins. It is called right after the array has been sorted with
 * qsort(), so entries with equal paths are expected to be adjacent.
 * NOTE(review): the lines that compact the array and update *n are not
 * visible in this excerpt. */
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
/* f iterates over every entry; t and previous are initialised here and
 * maintained by lines that are not visible (presumably the write position
 * and the last entry kept -- confirm). */
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
/* Stages a replacement /dev below a temporary directory (a tmpfs plus
 * bind mounts and device nodes) and finally moves it onto "/dev/".
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (the contents of devnodes, the declarations of r and st, the
 * error branches and the cleanup labels); the comments below describe only
 * the visible statements. */
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL;
147 _cleanup_umask_ mode_t u;
/* Create the staging directory from the XXXXXX template above. */
154 if (!mkdtemp(temporary_mount))
/* The staged /dev is a tmpfs mounted with MS_NOSUID|MS_STRICTATIME and
 * mode=755. */
157 dev = strappenda(temporary_mount, "/dev");
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
/* Bind-mount /dev/pts into the staged tree; a failure of this mount is
 * checked. */
164 devpts = strappenda(temporary_mount, "/dev/pts");
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
/* /dev/shm: the mkdir() result is not checked; the mount() result is
 * stored in r (its handling is not visible here). */
171 devshm = strappenda(temporary_mount, "/dev/shm");
172 mkdir(devshm, 01777);
173 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* For the next three bind mounts (mqueue, kdbus, hugepages) the visible
 * code checks neither the mkdir() nor the mount() return value. */
179 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
180 mkdir(devmqueue, 0755);
181 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
183 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
184 mkdir(devkdbus, 0755);
185 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
187 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
188 mkdir(devhugepages, 0755);
189 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
/* Iterate over the entries of devnodes. */
191 NULSTR_FOREACH(d, devnodes) {
192 _cleanup_free_ char *dn = NULL;
/* Entries whose st_mode is neither a block nor a character device take
 * the branch below (body not visible). */
205 if (!S_ISBLK(st.st_mode) &&
206 !S_ISCHR(st.st_mode)) {
/* dn is temporary_mount concatenated with the current devnodes entry. */
214 dn = strappend(temporary_mount, d);
/* Create the node at dn with the mode and device number held in st. */
220 r = mknod(dn, st.st_mode, st.st_rdev);
227 dev_setup(temporary_mount);
/* Move the staged tree onto "/dev/", then remove the staging directory. */
229 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
235 rmdir(temporary_mount);
/* NOTE(review): the two calls below look like part of a failure cleanup
 * path (unmount, then remove the staging directory); the surrounding
 * labels and checks are not visible -- confirm against the full source. */
250 umount(devhugepages);
260 rmdir(temporary_mount);
/* Applies one BindMount entry m by bind-mounting a source path ("what")
 * onto m->path.
 * NOTE(review): part of the parameter list and most of the mode switch are
 * not visible in this excerpt. */
265 static int apply_mount(
268 const char *var_tmp_dir) {
/* The source path "what" is selected per mode in a switch; only this
 * assignment, one case label and the unreachable default are visible
 * here. */
281 what = "/run/systemd/inaccessible";
293 case PRIVATE_VAR_TMP:
298 assert_not_reached("Unknown mode");
/* Recursive bind mount (MS_BIND|MS_REC) of "what" onto m->path. On
 * failure, the combination "entry marked ignore" and errno == ENOENT is
 * handled by a separate branch whose body is not visible (presumably it is
 * treated as non-fatal -- confirm). */
303 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
305 log_debug("Successfully mounted %s to %s", what, m->path);
306 else if (m->ignore && errno == ENOENT)
/* Remounts m->path read-only for entries whose mode is INACCESSIBLE or
 * READONLY.
 * NOTE(review): the return statements are not visible in this excerpt. */
312 static int make_read_only(BindMount *m) {
/* Entries of any other mode take the branch below (body not visible;
 * presumably an early return -- confirm). */
317 if (m->mode != INACCESSIBLE && m->mode != READONLY)
/* Recursive read-only bind remount of the path itself. */
320 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
/* A failure enters the branch below unless the entry is marked ignore and
 * errno is ENOENT. */
321 if (r < 0 && !(m->ignore && errno == ENOENT))
328 char** read_write_dirs,
329 char** read_only_dirs,
330 char** inaccessible_dirs,
334 ProtectedHome protected_home,
335 bool read_only_system,
336 unsigned mount_flags) {
/* Overview of the visible steps: unshare the mount namespace, collect all
 * requested mounts into one array, sort and de-duplicate it, apply the
 * mounts, make the relevant ones read-only, and finally remount "/" with
 * mount_flags.
 * NOTE(review): the first line of the signature, some parameters, local
 * declarations and the error branches are not visible in this excerpt. */
338 BindMount *m, *mounts = NULL;
/* With no flags given, MS_SHARED is used for the final remount of "/". */
342 if (mount_flags == 0)
343 mount_flags = MS_SHARED;
345 if (unshare(CLONE_NEWNS) < 0)
/* n is the number of array slots required. It has to match exactly the
 * number of entries filled in below, as checked by the assert further
 * down; the "2" terms correspond to the two-element STRV_MAKE() lists. */
348 n = !!tmp_dir + !!var_tmp_dir +
349 strv_length(read_write_dirs) +
350 strv_length(read_only_dirs) +
351 strv_length(inaccessible_dirs) +
353 (protected_home != PROTECTED_HOME_NO ? 2 : 0) +
354 (read_only_system ? 2 : 0);
/* The entry array is allocated with alloca(); m is the fill cursor that
 * append_mounts() receives by address. */
357 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
358 r = append_mounts(&m, read_write_dirs, READWRITE);
362 r = append_mounts(&m, read_only_dirs, READONLY);
366 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
/* The private tmp / var tmp / dev entries are filled in directly through
 * the cursor (the surrounding conditions are not visible). */
372 m->mode = PRIVATE_TMP;
377 m->path = "/var/tmp";
378 m->mode = PRIVATE_VAR_TMP;
384 m->mode = PRIVATE_DEV;
/* Both paths carry the '-' prefix; the mode is READONLY for
 * PROTECTED_HOME_READ_ONLY and INACCESSIBLE otherwise. */
388 if (protected_home != PROTECTED_HOME_NO) {
389 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protected_home == PROTECTED_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
394 if (read_only_system) {
395 r = append_mounts(&m, STRV_MAKE("/usr", "-/boot"), READONLY);
/* Every counted slot must have been filled. */
400 assert(mounts + n == m);
/* Sort with mount_path_compare(), then let drop_duplicates() reduce n. */
402 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
403 drop_duplicates(mounts, &n);
407 /* Remount / as SLAVE so that nothing now mounted in the namespace
408 shows up in the parent */
409 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
/* First pass: apply every entry. */
412 for (m = mounts; m < mounts + n; ++m) {
413 r = apply_mount(m, tmp_dir, var_tmp_dir);
/* Second pass, after all entries have been applied: read-only remounts. */
418 for (m = mounts; m < mounts + n; ++m) {
419 r = make_read_only(m);
425 /* Remount / as the desired mode. Note that this will not
426 * reestablish propagation from our side to the host, since
427 * what's disconnected is disconnected. */
428 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
/* Lazily detach (MNT_DETACH) the path of each entry. NOTE(review): this
 * looks like the failure cleanup path; the label and any condition inside
 * the loop are not visible -- confirm against the full source. */
437 for (m = mounts; m < mounts + n; ++m)
439 umount2(m->path, MNT_DETACH);
/* Sets up one private temporary directory below prefix for the given id.
 * NOTE(review): the directory creation from the template, the error
 * handling and the hand-over of the result through *path are not visible
 * in this excerpt. */
445 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
446 _cleanup_free_ char *x = NULL;
447 char bid[SD_ID128_STRING_MAX];
455 /* We include the boot id in the directory so that after a
456 * reboot we can easily identify obsolete directories. */
458 r = sd_id128_get_boot(&boot_id);
/* x has the form <prefix>/systemd-private-<boot id>-<id>-XXXXXX. */
462 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* Inside the RUN_WITH_UMASK(0000) block a "tmp" subdirectory of x is
 * created with mode 0777 plus the sticky bit (presumably the zero umask is
 * what keeps those mode bits from being masked -- confirm the macro's
 * semantics). */
470 RUN_WITH_UMASK(0000) {
473 y = strappenda(x, "/tmp");
475 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Sets up the private directories for "/tmp" and "/var/tmp" for the given
 * id by calling setup_one_tmp_dir() once per prefix.
 * NOTE(review): the declarations, the error handling (including what
 * happens to a when the second call fails) and the assignment of the
 * output parameters are not visible in this excerpt. */
485 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
493 r = setup_one_tmp_dir(id, "/tmp", &a);
497 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* t names the "tmp" subdirectory of the first result (its use is not
 * visible here). */
501 t = strappenda(a, "/tmp");
/* Creates or joins a network namespace whose reference fd is kept inside
 * the socket pair netns_storage_socket (see the comment in the body).
 * NOTE(review): the error branches, several declarations and the exact
 * nesting of the branches are not visible in this excerpt. */
515 int setup_netns(int netns_storage_socket[2]) {
516 _cleanup_close_ int netns = -1;
518 struct cmsghdr cmsghdr;
519 uint8_t buf[CMSG_SPACE(sizeof(int))];
522 .msg_control = &control,
523 .msg_controllen = sizeof(control),
525 struct cmsghdr *cmsg;
/* Both ends of the socket pair must be valid fds. */
528 assert(netns_storage_socket);
529 assert(netns_storage_socket[0] >= 0);
530 assert(netns_storage_socket[1] >= 0);
532 /* We use the passed socketpair as a storage buffer for our
533 * namespace reference fd. Whatever process runs this first
534 * shall create a new namespace, all others should just join
535 * it. To serialize that we use a file lock on the socket
538 * It's a bit crazy, but hey, works great! */
540 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* Non-blocking receive: EAGAIN means nothing has been stored yet; any
 * other errno takes the branch below (body not visible). */
543 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
544 if (errno != EAGAIN) {
549 /* Nothing stored yet, so let's create a new namespace */
551 if (unshare(CLONE_NEWNET) < 0) {
/* Keep a reference to the current network namespace as an fd. */
558 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
566 /* Yay, found something, so let's join the namespace */
/* Take the fd out of the SCM_RIGHTS control message. */
568 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
569 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
570 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
571 netns = *(int*) CMSG_DATA(cmsg);
575 if (setns(netns, CLONE_NEWNET) < 0) {
/* Pack the namespace fd into an SCM_RIGHTS control message and send it on
 * the other end of the pair. */
583 cmsg = CMSG_FIRSTHDR(&mh);
584 cmsg->cmsg_level = SOL_SOCKET;
585 cmsg->cmsg_type = SCM_RIGHTS;
586 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
587 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
588 mh.msg_controllen = cmsg->cmsg_len;
590 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the file lock taken at the start; the result is not checked. */
596 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names for the ProtectedHome values, indexed by the enum value. */
601 static const char *const protected_home_table[_PROTECTED_HOME_MAX] = {
602 [PROTECTED_HOME_NO] = "no",
603 [PROTECTED_HOME_YES] = "yes",
604 [PROTECTED_HOME_READ_ONLY] = "read-only",
/* NOTE(review): presumably this macro generates the string <-> enum lookup
 * helpers for ProtectedHome from the table above; its definition is not
 * part of this file excerpt -- confirm. */
607 DEFINE_STRING_TABLE_LOOKUP(protected_home, ProtectedHome);