1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include <systemd/sd-daemon.h>
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
/* Shared memory segment used to exchange pids with the readahead replay
 * process: main_collect() publishes our pid in shared->collect, and the
 * event loop in collect() consults shared->replay to ignore opens done
 * by the replayer. */
74 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; used to
 * assign each recorded file to a coarse access-time bin. */
75 static usec_t starttime;
77 /* Avoid collisions with the NULL pointer */
/* Map a sector number to/from a pointer value (presumably for storage
 * as a hashmap value — usage not visible in this view); the +1 offset
 * keeps sector 0 from becoming a NULL pointer. */
78 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
79 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Request an online defragmentation of the file referred to by fd,
 * via the btrfs BTRFS_IOC_DEFRAG ioctl.
 *
 * Returns: 0 on success, -1 with errno set on failure (ioctl()
 * semantics; e.g. EBADF for an invalid fd, ENOTTY on non-btrfs). */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/* Append one file's readahead record to the open pack file: the file's
 * inode number followed by the list of page ranges currently resident
 * in the page cache (determined with mincore()), terminated by an
 * empty range.
 *
 * NOTE(review): this span has lines elided (jumps in the embedded line
 * numbers); declarations, several braces and error/cleanup branches
 * are not visible here, so comments below only describe visible code. */
87 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
89 void *start = MAP_FAILED;
95 int r = 0, fd = -1, k;
/* O_NOFOLLOW: don't follow symlinks; O_NOATIME: don't perturb the
 * access time we are trying to observe. */
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* EPERM/EACCES are expected for files we may not read — presumably
 * skipped without a warning (branch body elided). */
106 if (errno == EPERM || errno == EACCES)
109 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the file (size limit etc.) before mapping it. */
114 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file read-only so mincore() can report which of its
 * pages are currently in core. */
123 l = PAGE_ALIGN(st.st_size);
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
126 log_warning("mmap(%s) failed: %m", fn);
/* One vector byte per page; bit 0 set == page resident in core. */
131 pages = l / page_size();
132 vec = alloca0(pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the residency vector, emitting each maximal run [b, c) of
 * resident pages as a start/end pair. */
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
150 if (!mapped && new_mapped)
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
156 log_debug("%s: page %u to %u", fn, b, c);
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
167 log_debug("%s: page %u to %u", fn, b, c);
/* Terminate the range list with an empty range (same value written
 * twice) — presumably b is reset before this point; intervening lines
 * are elided, confirm against the full source. */
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if mapped (munmap call elided) and close the fd. */
176 if (start != MAP_FAILED)
180 close_nointr_nofail(fd);
/* Return the physical (on-device) byte offset of the file's first
 * extent, as reported by the FS_IOC_FIEMAP ioctl. Used to sort files
 * by disk location on rotating media. The return statements of the
 * three error branches below are elided from this view — presumably
 * they return 0.
 *
 * NOTE(review): the two declarations below look like the members of a
 * single on-stack buffer ("data": a struct fiemap immediately followed
 * by room for one struct fiemap_extent), initialized with the
 * designated initializers at lines 190-191; the wrapper struct
 * definition itself is elided. Confirm against the full source. */
185 static unsigned long fd_first_block(int fd) {
187 struct fiemap fiemap;
188 struct fiemap_extent extent;
/* Map the entire file (length ~0) but request at most one extent. */
190 .fiemap.fm_length = ~0ULL,
191 .fiemap.fm_extent_count = 1,
194 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
/* No extents mapped: nothing to sort by. */
197 if (data.fiemap.fm_mapped_extents <= 0)
/* An extent whose physical location the filesystem cannot report is
 * useless for disk-order sorting. */
200 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
203 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: order primarily by access-time
 * bin (the bin comparison lines are elided from this view), then by
 * physical block (disk location), and finally by path so the order is
 * total and deterministic.
 * NOTE(review): the assignments of i/j from a/b are elided here. */
212 static int qsort_compare(const void *a, const void *b) {
213 const struct item *i, *j;
218 /* sort by bin first */
224 /* then sort by sector */
225 if (i->block < j->block)
227 if (i->block > j->block)
/* Same bin and same block: fall back to lexicographic path order. */
230 return strcmp(i->path, j->path);
/* Watch the file system below "root" with fanotify for the collection
 * phase of boot (bounded by arg_timeout), record which files get
 * opened (and where their data lives on disk), then write the result
 * as a pack file "<root>/.readahead" for later replay. Collection can
 * be ended early via SIGINT/SIGTERM, or by creating "cancel"/"done"
 * in /run/systemd/readahead (observed via inotify). Returns 0 on
 * success, negative on error (error paths largely elided).
 *
 * NOTE(review): this span has many lines elided (jumps in the embedded
 * line numbers): enum and variable declarations, loop constructs,
 * braces and goto labels are not all visible. Comments describe only
 * what the visible lines establish. */
233 static int collect(const char *root) {
/* Indices into pollfd[] below; presumably an enum ending in _FD_MAX,
 * with FD_SIGNAL declared between these two (its line is elided). */
235 FD_FANOTIFY, /* Get the actual fs events */
237 FD_INOTIFY, /* We get notifications to quit early via this fd */
240 struct pollfd pollfd[_FD_MAX] = {};
241 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
/* Map of path -> struct item for every interesting file we saw opened. */
243 Hashmap *files = NULL;
248 char *pack_fn_new = NULL, *pack_fn = NULL;
249 bool on_ssd, on_btrfs;
252 uint64_t previous_block_readahead;
253 bool previous_block_readahead_set = false;
257 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
262 starttime = now(CLOCK_MONOTONIC);
264 /* If there's no pack file yet we lower the kernel readahead
265 * so that mincore() is accurate. If there is a pack file
266 * already we assume it is accurate enough so that kernel
267 * readahead is never triggered. */
268 previous_block_readahead_set =
269 access(pack_fn, F_OK) < 0 &&
270 block_get_readahead(root, &previous_block_readahead) >= 0 &&
271 block_set_readahead(root, 8*1024) >= 0;
/* Run with idle I/O priority so collection doesn't compete with boot. */
273 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
274 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and take them via a signalfd instead, so they
 * can be multiplexed in the poll() loop below. */
276 assert_se(sigemptyset(&mask) == 0);
277 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
278 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
281 log_error("signalfd(): %m");
286 files = hashmap_new(string_hash_func, string_compare_func);
288 log_error("Failed to allocate set.");
/* Watch every open() on the whole mount containing root. */
293 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
294 if (fanotify_fd < 0) {
295 log_error("Failed to create fanotify object: %m");
300 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
301 log_error("Failed to mark %s: %m", root);
/* inotify watch for the early cancel/done request files (see below). */
306 inotify_fd = open_inotify();
307 if (inotify_fd < 0) {
/* Hard deadline for the collection phase. */
312 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
316 pollfd[FD_FANOTIFY].fd = fanotify_fd;
317 pollfd[FD_FANOTIFY].events = POLLIN;
318 pollfd[FD_SIGNAL].fd = signal_fd;
319 pollfd[FD_SIGNAL].events = POLLIN;
320 pollfd[FD_INOTIFY].fd = inotify_fd;
321 pollfd[FD_INOTIFY].events = POLLIN;
/* Presumably the tail of an sd_notify() status call (leading
 * arguments elided from this view). */
325 "STATUS=Collecting readahead data");
327 log_debug("Collecting...");
/* Catch requests that were filed before the inotify watch existed. */
329 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
330 log_debug("Collection canceled");
335 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
336 log_debug("Got termination request");
/* Main event loop (the loop construct itself is elided). The buffer
 * below presumably holds one fanotify read's worth of events. */
342 struct fanotify_event_metadata metadata;
346 struct fanotify_event_metadata *m;
/* Stop once we tracked enough files... */
350 if (hashmap_size(files) > arg_files_max) {
351 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ...or once the deadline passed. */
355 t = now(CLOCK_MONOTONIC);
356 if (t >= not_after) {
357 log_debug("Reached maximum collection time, ending collection.");
/* Wait for events, but never beyond the deadline. */
361 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
366 log_error("poll(): %m");
372 log_debug("Reached maximum collection time, ending collection.");
376 if (pollfd[FD_SIGNAL].revents) {
377 log_debug("Got signal.");
/* Handle "cancel"/"done" files created in /run/systemd/readahead. */
381 if (pollfd[FD_INOTIFY].revents) {
382 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
383 struct inotify_event *e;
385 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
386 if (errno == EINTR || errno == EAGAIN)
389 log_error("Failed to read inotify event: %m");
/* Walk all events packed into the buffer (loop condition elided). */
394 e = (struct inotify_event*) inotify_buffer;
398 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
399 log_debug("Collection canceled");
404 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
405 log_debug("Got termination request");
409 step = sizeof(struct inotify_event) + e->len;
410 assert(step <= (size_t) n);
412 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain the fanotify event queue. */
417 n = read(fanotify_fd, &data, sizeof(data));
420 if (errno == EINTR || errno == EAGAIN)
423 /* fanotify sometimes returns EACCES on read()
424 * where it shouldn't. For now let's just
425 * ignore it here (which is safe), but
426 * eventually this should be
427 * dropped when the kernel is fixed.
429 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
433 log_error("Failed to read event: %m");
438 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
439 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
/* Ignore opens performed by ourselves... */
445 if (m->pid == my_pid)
/* ...and by the replay process, whose pid is published in the shared
 * memory segment; the barrier orders the read against its writer. */
448 __sync_synchronize();
449 if (m->pid == shared->replay)
/* Resolve the fd fanotify handed us to a path via /proc/self/fd. */
452 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
453 k = readlink_malloc(fn, &p);
/* Skip temporaries, already-deleted files and duplicates. */
455 if (startswith(p, "/tmp") ||
456 endswith(p, " (deleted)") ||
457 hashmap_get(files, p))
458 /* Not interesting, or
466 entry = new0(struct item, 1);
/* Record the physical location of the file's first block and a coarse
 * time bin (2-second granularity) for later sort ordering. */
472 ul = fd_first_block(m->fd);
474 entrytime = now(CLOCK_MONOTONIC);
477 entry->path = strdup(p);
483 entry->bin = (entrytime - starttime) / 2000000;
485 k = hashmap_put(files, p, entry);
487 log_warning("hashmap_put() failed: %s", strerror(-k));
493 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the event fd, or we would leak one per event. */
497 close_nointr_nofail(m->fd);
/* Collection finished — stop listening before writing the pack file,
 * so our own writes don't show up as events. */
502 if (fanotify_fd >= 0) {
503 close_nointr_nofail(fanotify_fd);
507 log_debug("Writing Pack File...");
509 on_ssd = fs_on_ssd(root) > 0;
510 log_debug("On SSD: %s", yes_no(on_ssd));
512 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
513 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary ".readahead.new" first and rename into place
 * only when complete, so readers never see a half-written pack file. */
515 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
520 pack = fopen(pack_fn_new, "we");
522 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus a one-byte medium tag,
 * 'S'olid-state or 'R'otating. */
527 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
528 putc(on_ssd ? 'S' : 'R', pack);
530 if (on_ssd || on_btrfs) {
532 /* On SSD or on btrfs, just write things out in the
533 * order the files were accessed. */
535 HASHMAP_FOREACH_KEY(q, p, files, i)
536 pack_file(pack, p, on_btrfs);
540 /* On rotating media, order things by the block
543 log_debug("Ordering...");
545 n = hashmap_size(files);
547 _cleanup_free_ struct item *ordered;
551 ordered = new(struct item, n);
/* Flatten the hashmap into a contiguous array for qsort(). */
558 HASHMAP_FOREACH_KEY(q, p, files, i) {
559 memcpy(j, q, sizeof(struct item));
563 assert(ordered + n == j);
565 qsort(ordered, n, sizeof(struct item), qsort_compare);
567 for (k = 0; k < n; k++)
568 pack_file(pack, ordered[k].path, on_btrfs);
570 log_warning("No pack files");
573 log_debug("Finalizing...");
578 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
583 if (rename(pack_fn_new, pack_fn) < 0) {
584 log_error("Failed to rename readahead file: %m");
/* Cleanup (goto labels elided): close remaining fds and free the
 * hashmap contents. */
595 if (fanotify_fd >= 0)
596 close_nointr_nofail(fanotify_fd);
599 close_nointr_nofail(signal_fd);
602 close_nointr_nofail(inotify_fd);
611 while ((p = hashmap_steal_first_key(files)))
616 if (previous_block_readahead_set) {
619 /* Restore the original kernel readahead setting if we
620 * changed it, and nobody has overwritten it since
622 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
623 block_set_readahead(root, previous_block_readahead);
/* Entry point for the "collect" verb: bail out in situations where
 * collection makes no sense (read-only media; a low-memory check is
 * partially visible at line 645, its condition elided), publish our
 * pid in the shared memory segment so the replay side can ignore our
 * opens, then run collect().
 * NOTE(review): the tail of this function lies beyond this view. */
629 int main_collect(const char *root) {
634 /* Skip this step on read-only media. Note that we check the
635 * underlying block device here, not the read-only flag of the
636 * file system on top, since that one is most likely mounted
637 * read-only anyway at boot, even if the underlying block
638 * device is theoretically writable. */
639 if (fs_on_read_only(root) > 0) {
640 log_info("Disabling readahead collector due to read-only media.");
645 log_info("Disabling readahead collector due to low memory.");
649 shared = shared_get();
/* Publish our pid before collection starts; the full memory barrier
 * makes the store visible to the replay process, which filters out
 * fanotify events caused by this pid. */
653 shared->collect = getpid();
654 __sync_synchronize();
656 if (collect(root) < 0)