1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include <systemd/sd-daemon.h>
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
74 static ReadaheadShared *shared = NULL;
75 static usec_t starttime;
77 /* Avoid collisions with the NULL pointer */
78 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
79 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd, so that a
 * later sequential read of it is cheap.  Returns the raw ioctl()
 * result: 0 on success, -1 with errno set on failure. */
81 static int btrfs_defrag(int fd) {
82 struct btrfs_ioctl_vol_args data = { .fd = fd };
84 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one record for the file "fn" to the pack stream: the file's
 * inode number followed by the list of page ranges currently resident
 * in the page cache (as reported by mincore()), terminated by an end
 * marker.  Returns 0 on success or a negative errno-style code in r.
 * NOTE(review): this view of the function is missing several lines
 * (declarations, error returns, munmap/close cleanup), so the comments
 * below cover only the visible logic. */
87 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
89 void *start = MAP_FAILED;
95 int r = 0, fd = -1, k;
/* Open without updating atime and refuse to follow symlinks */
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Files we are not allowed to read are silently skipped, not errors */
106 if (errno == EPERM || errno == EACCES)
109 log_warning("open(%s) failed: %m", fn);
114 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the entire file so mincore() can report its cache state */
123 l = PAGE_ALIGN(st.st_size);
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
126 log_warning("mmap(%s) failed: %m", fn);
/* One byte per page; bit 0 set means the page is resident in memory */
131 pages = l / page_size();
132 vec = alloca0(pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
/* Coalesce runs of consecutive resident pages into [b, c) ranges */
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
150 if (!mapped && new_mapped)
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
156 log_debug("%s: page %u to %u", fn, b, c);
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
167 log_debug("%s: page %u to %u", fn, b, c);
/* End marker terminating this file's range list.  NOTE(review): the
 * assignment giving b its marker value is not visible in this view. */
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: only unmap if the mapping actually succeeded */
176 if (start != MAP_FAILED)
/* Return the physical location (byte offset on the block device,
 * truncated to unsigned long) of the first extent of fd, so files can
 * be sorted by on-disk position for rotating media.  Returns 0 when
 * the location is unknown or on any error. */
184 static unsigned long fd_first_block(int fd) {
186 struct fiemap fiemap;
187 struct fiemap_extent extent;
/* Query the whole file, but ask the kernel for only the first extent */
189 .fiemap.fm_length = ~0ULL,
190 .fiemap.fm_extent_count = 1,
193 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
196 if (data.fiemap.fm_mapped_extents <= 0)
/* A location the kernel cannot vouch for is treated as "no location" */
199 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
202 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() callback ordering collected items for rotating media:
 * first by access-time bin, then by physical block location, and
 * finally by path so the ordering is total and deterministic. */
211 static int qsort_compare(const void *a, const void *b) {
212 const struct item *i, *j;
217 /* sort by bin first */
223 /* then sort by sector */
224 if (i->block < j->block)
226 if (i->block > j->block)
/* Fall back to the path as the final tie-breaker */
229 return strcmp(i->path, j->path);
/* Core of the readahead collector: watch the file system below "root"
 * via fanotify for files being opened during boot, record which pages
 * of each are in the page cache, and write the result out atomically
 * as "<root>/.readahead".  Collection ends on timeout, signal, file
 * count limit, or an external "cancel"/"done" control file.
 * Returns 0 on success or a negative errno-style code in r.
 * NOTE(review): many lines are missing from this view (declarations,
 * error paths, goto labels); comments cover only the visible logic. */
232 static int collect(const char *root) {
234 FD_FANOTIFY, /* Get the actual fs events */
236 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 struct pollfd pollfd[_FD_MAX] = {};
240 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
242 Hashmap *files = NULL;
247 char *pack_fn_new = NULL, *pack_fn = NULL;
248 bool on_ssd, on_btrfs;
251 uint64_t previous_block_readahead;
252 bool previous_block_readahead_set = false;
256 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
261 starttime = now(CLOCK_MONOTONIC);
263 /* If there's no pack file yet we lower the kernel readahead
264 * so that mincore() is accurate. If there is a pack file
265 * already we assume it is accurate enough so that kernel
266 * readahead is never triggered. */
267 previous_block_readahead_set =
268 access(pack_fn, F_OK) < 0 &&
269 block_get_readahead(root, &previous_block_readahead) >= 0 &&
270 block_set_readahead(root, 8*1024) >= 0;
/* Run with idle I/O priority so collection doesn't slow down boot */
272 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
273 log_warning("Failed to set IDLE IO priority class: %m");
/* Route SIGINT/SIGTERM through a signalfd so poll() can watch them */
275 assert_se(sigemptyset(&mask) == 0);
276 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
277 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
279 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
280 log_error("signalfd(): %m");
/* Maps file path -> struct item (access time bin, block location) */
285 files = hashmap_new(string_hash_func, string_compare_func);
287 log_error("Failed to allocate set.");
/* Get notified of every open() on the whole mount below root */
292 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
293 if (fanotify_fd < 0) {
294 log_error("Failed to create fanotify object: %m");
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
/* Used to notice "cancel"/"done" control files appearing at runtime */
305 inotify_fd = open_inotify();
306 if (inotify_fd < 0) {
/* Collection ends at the latest once arg_timeout has elapsed */
311 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
324 "STATUS=Collecting readahead data");
326 log_debug("Collecting...");
/* The control files may already exist from before inotify was set up */
328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
329 log_debug("Collection canceled");
334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
335 log_debug("Got termination request");
341 struct fanotify_event_metadata metadata;
345 struct fanotify_event_metadata *m;
/* Stop once we have collected enough files ... */
349 if (hashmap_size(files) > arg_files_max) {
350 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ... or once the collection deadline has passed */
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
365 log_error("poll(): %m");
371 log_debug("Reached maximum collection time, ending collection.");
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
388 log_error("Failed to read inotify event: %m");
/* Walk every event packed into the buffer; each may be a control file */
393 e = (struct inotify_event*) inotify_buffer;
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
411 e = (struct inotify_event*) ((uint8_t*) e + step);
416 n = read(fanotify_fd, &data, sizeof(data));
419 if (errno == EINTR || errno == EAGAIN)
422 /* fanotify sometimes returns EACCES on read()
423 * where it shouldn't. For now let's just
424 * ignore it here (which is safe), but
425 * eventually this should be
426 * dropped when the kernel is fixed.
428 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
432 log_error("Failed to read event: %m");
437 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
438 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
/* Ignore opens caused by ourselves and by the replay process */
444 if (m->pid == my_pid)
447 __sync_synchronize();
448 if (m->pid == shared->replay)
/* Resolve the event's fd back to a path via /proc/self/fd */
451 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
452 k = readlink_malloc(fn, &p);
454 if (startswith(p, "/tmp") ||
455 endswith(p, " (deleted)") ||
456 hashmap_get(files, p))
457 /* Not interesting, or
465 entry = new0(struct item, 1);
/* Remember where the file lives on disk, for sorting later */
471 ul = fd_first_block(m->fd);
473 entrytime = now(CLOCK_MONOTONIC);
476 entry->path = strdup(p);
/* Bucket files by time of first access, in 2s bins */
482 entry->bin = (entrytime - starttime) / 2000000;
484 k = hashmap_put(files, p, entry);
486 log_warning("hashmap_put() failed: %s", strerror(-k));
492 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Done watching; now turn the collected set into a pack file */
500 fanotify_fd = safe_close(fanotify_fd);
502 log_debug("Writing Pack File...");
504 on_ssd = fs_on_ssd(root) > 0;
505 log_debug("On SSD: %s", yes_no(on_ssd));
507 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
508 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary name first; renamed into place at the end */
510 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
515 pack = fopen(pack_fn_new, "we");
517 log_error("Failed to open pack file: %m");
/* Header: host/version string plus an 'S' (SSD) or 'R' (rotating) tag */
522 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
523 putc(on_ssd ? 'S' : 'R', pack);
525 if (on_ssd || on_btrfs) {
527 /* On SSD or on btrfs, just write things out in the
528 * order the files were accessed. */
530 HASHMAP_FOREACH_KEY(q, p, files, i)
531 pack_file(pack, p, on_btrfs);
535 /* On rotating media, order things by the block
538 log_debug("Ordering...");
540 n = hashmap_size(files);
542 _cleanup_free_ struct item *ordered;
546 ordered = new(struct item, n);
/* Flatten the hashmap into an array so it can be qsort()ed */
553 HASHMAP_FOREACH_KEY(q, p, files, i) {
554 memcpy(j, q, sizeof(struct item));
558 assert(ordered + n == j);
560 qsort(ordered, n, sizeof(struct item), qsort_compare);
562 for (k = 0; k < n; k++)
563 pack_file(pack, ordered[k].path, on_btrfs);
565 log_warning("No pack files");
568 log_debug("Finalizing...");
/* NOTE(review): presumably an fflush()/ferror() check precedes this
 * error message — the check itself is not visible in this view */
573 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file */
578 if (rename(pack_fn_new, pack_fn) < 0) {
579 log_error("Failed to rename readahead file: %m");
/* Cleanup path: close all fds and free the collected entries */
590 safe_close(fanotify_fd);
591 safe_close(signal_fd);
592 safe_close(inotify_fd);
601 while ((p = hashmap_steal_first_key(files)))
606 if (previous_block_readahead_set) {
609 /* Restore the original kernel readahead setting if we
610 * changed it, and nobody has overwritten it since
612 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
613 block_set_readahead(root, previous_block_readahead);
/* Entry point of the collection phase: bail out early on read-only
 * media or low memory, publish our PID in the shared area so the
 * replay process can ignore the opens we cause, then run collect().
 * NOTE(review): the function continues beyond this view. */
619 int main_collect(const char *root) {
624 /* Skip this step on read-only media. Note that we check the
625 * underlying block device here, not the read-only flag of the
626 * file system on top, since that one is most likely mounted
627 * read-only anyway at boot, even if the underlying block
628 * device is theoretically writable. */
629 if (fs_on_read_only(root) > 0) {
630 log_info("Disabling readahead collector due to read-only media.");
635 log_info("Disabling readahead collector due to low memory.");
/* Publish our PID for the replay process */
639 shared = shared_get();
643 shared->collect = getpid();
/* Make sure the PID store is visible before events start flowing */
644 __sync_synchronize();
646 if (collect(root) < 0)