1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <linux/btrfs.h>
42 #include <sys/ioctl.h>
45 #include <sys/inotify.h>
48 #ifdef HAVE_FANOTIFY_INIT
49 #include <sys/fanotify.h>
52 #include <systemd/sd-daemon.h>
58 #include "readahead-common.h"
63 * - detect ssd on btrfs/lvm...
64 * - read ahead directories
67 * - handle files where nothing is in mincore
68 * - does ioprio_set work with fadvise()?
/* Shared memory segment used to coordinate the readahead processes:
 * collect() skips fanotify events whose pid matches shared->replay,
 * and main_collect() publishes our own pid in shared->collect. */
71 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; collect()
 * derives each file's access-time "bin" from it. */
72 static usec_t starttime;
74 /* Avoid collisions with the NULL pointer */
/* Sector numbers are presumably stashed as pointer values elsewhere in
 * this file (not visible in this excerpt); the +1/-1 pair keeps sector 0
 * distinct from NULL -- TODO confirm against the full source. */
75 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
76 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd, so that its
 * extents become contiguous on disk and later readahead is sequential.
 * Returns the raw ioctl() result: 0 on success, -1 with errno set on
 * failure (e.g. when fd does not refer to a file on btrfs).
 *
 * NOTE(review): this excerpt had lost the function's closing brace and
 * carried pasted-in line-number prefixes; both repaired here. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/*
 * pack_file() -- append one file's page-cache profile to the pack file:
 * its inode number plus the list of page ranges currently resident in
 * memory (per mincore()), so that a later replay run can fault exactly
 * those ranges back in. Returns 0 on success (via r; most of the error
 * paths are missing from this excerpt).
 *
 * NOTE(review): the embedded original numbering jumps (84, 86, 92, 97,
 * ...), i.e. many lines are missing here -- among them the declarations
 * of st, l, vec, pages, b, c and mapped, several branch bodies and
 * closing braces. Recover the complete function from version control
 * before editing; the comments below describe only what is visible.
 */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
92 int r = 0, fd = -1, k;
/* O_NOATIME keeps profiling from perturbing access times; O_NOFOLLOW
 * refuses to be redirected through symlinks. */
97 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* EPERM/EACCES are expected for files we may not read -- the quiet-skip
 * branch body is missing from this excerpt. */
103 if (errno == EPERM || errno == EACCES)
106 log_warning("open(%s) failed: %m", fn);
111 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file (page-aligned length) read-only so mincore() can
 * report which of its pages are resident. */
120 l = PAGE_ALIGN(st.st_size);
121 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
122 if (start == MAP_FAILED) {
123 log_warning("mmap(%s) failed: %m", fn);
128 pages = l / page_size();
130 memset(vec, 0, pages);
131 if (mincore(start, l, vec) < 0) {
132 log_warning("mincore(%s) failed: %m", fn);
140 /* Store the inode, so that we notice when the file is deleted */
141 inode = (uint64_t) st.st_ino;
142 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the mincore() vector and emit one (begin, end) page pair per run
 * of resident pages; bit 0 of each vec entry means "resident". */
145 for (c = 0; c < pages; c++) {
146 bool new_mapped = !!(vec[c] & 1);
148 if (!mapped && new_mapped)
150 else if (mapped && !new_mapped) {
151 fwrite(&b, sizeof(b), 1, pack);
152 fwrite(&c, sizeof(c), 1, pack);
154 log_debug("%s: page %u to %u", fn, b, c);
160 /* We don't write any range data if we should read the entire file */
/* Flush the final run if the file ended while a run was still open. */
161 if (mapped && b > 0) {
162 fwrite(&b, sizeof(b), 1, pack);
163 fwrite(&c, sizeof(c), 1, pack);
165 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: the same value written twice (presumably b has
 * been reset to 0 by code missing from this excerpt -- verify upstream). */
170 fwrite(&b, sizeof(b), 1, pack);
171 fwrite(&b, sizeof(b), 1, pack);
174 if (start != MAP_FAILED)
178 close_nointr_nofail(fd);
/* Return the physical position (in bytes) of the first extent of the
 * file referred to by fd, used to order files by on-disk layout for
 * rotating media. Returns 0 when the position cannot be determined
 * (FIEMAP unsupported, no extents mapped, or extent location unknown);
 * such files simply sort first.
 *
 * NOTE(review): the anonymous struct wrapping one struct fiemap plus a
 * single inline struct fiemap_extent had been partially lost in this
 * excerpt; reconstructed from the visible initializers
 * (.fiemap.fm_length, .fiemap.fm_extent_count) and field accesses. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;       /* header; fm_extents[] overlays... */
                struct fiemap_extent extent; /* ...this single inline extent */
        } data = {
                .fiemap.fm_length = ~0ULL,   /* map the entire file */
                .fiemap.fm_extent_count = 1, /* we only need the first extent */
        };

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
210 static int qsort_compare(const void *a, const void *b) {
211 const struct item *i, *j;
216 /* sort by bin first */
222 /* then sort by sector */
223 if (i->block < j->block)
225 if (i->block > j->block)
228 return strcmp(i->path, j->path);
/*
 * collect() -- watch the mount point at root with fanotify for the
 * duration of collection and write a pack file ($root/.readahead)
 * recording every file opened (and which of its pages got cached), so
 * that a later replay run can prefetch them. Returns 0 on success,
 * negative on failure (via r; most error paths and gotos are missing
 * from this excerpt).
 *
 * NOTE(review): the embedded original numbering jumps throughout, i.e.
 * this excerpt is missing many lines -- the enum wrapper around
 * FD_FANOTIFY/FD_SIGNAL/FD_INOTIFY, declarations (mask, t, n, h, data,
 * fn, p, q, sfs, pack, my_pid, not_after, step, ...), loop headers,
 * most closing braces and all goto labels. Recover the complete
 * function from version control before editing; comments below describe
 * only what is visible.
 */
231 static int collect(const char *root) {
233 FD_FANOTIFY, /* Get the actual fs events */
235 FD_INOTIFY, /* We get notifications to quit early via this fd */
238 struct pollfd pollfd[_FD_MAX] = {};
239 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
241 Hashmap *files = NULL;
246 char *pack_fn_new = NULL, *pack_fn = NULL;
247 bool on_ssd, on_btrfs;
250 uint64_t previous_block_readahead;
251 bool previous_block_readahead_set = false;
255 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
260 starttime = now(CLOCK_MONOTONIC);
262 /* If there's no pack file yet we lower the kernel readahead
263 * so that mincore() is accurate. If there is a pack file
264 * already we assume it is accurate enough so that kernel
265 * readahead is never triggered. */
266 previous_block_readahead_set =
267 access(pack_fn, F_OK) < 0 &&
268 block_get_readahead(root, &previous_block_readahead) >= 0 &&
269 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so profiling does not compete with boot I/O. */
271 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
272 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them through a signalfd instead, so
 * they can be handled synchronously inside the poll() loop below. */
274 assert_se(sigemptyset(&mask) == 0);
275 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
276 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
278 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
279 log_error("signalfd(): %m");
284 files = hashmap_new(string_hash_func, string_compare_func);
286 log_error("Failed to allocate set.");
/* fanotify with FAN_OPEN marked on the whole mount reports every file
 * opened below root. */
291 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
292 if (fanotify_fd < 0) {
293 log_error("Failed to create fanotify object: %m");
298 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
299 log_error("Failed to mark %s: %m", root);
/* inotify watch through which "cancel"/"done" files under
 * /run/systemd/readahead end collection early (see below). */
304 inotify_fd = open_inotify();
305 if (inotify_fd < 0) {
/* Hard deadline for the whole collection run. */
310 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
314 pollfd[FD_FANOTIFY].fd = fanotify_fd;
315 pollfd[FD_FANOTIFY].events = POLLIN;
316 pollfd[FD_SIGNAL].fd = signal_fd;
317 pollfd[FD_SIGNAL].events = POLLIN;
318 pollfd[FD_INOTIFY].fd = inotify_fd;
319 pollfd[FD_INOTIFY].events = POLLIN;
323 "STATUS=Collecting readahead data");
325 log_debug("Collecting...");
/* Re-check the flag files on every iteration in case they were created
 * before the inotify watch became active. */
327 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
328 log_debug("Collection canceled");
333 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
334 log_debug("Got termination request");
340 struct fanotify_event_metadata metadata;
344 struct fanotify_event_metadata *m;
348 if (hashmap_size(files) > arg_files_max) {
349 log_debug("Reached maximum number of read ahead files, ending collection.");
353 t = now(CLOCK_MONOTONIC);
354 if (t >= not_after) {
355 log_debug("Reached maximum collection time, ending collection.");
/* Wait for any of the three fds, but never past the deadline. */
359 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
364 log_error("poll(): %m");
370 log_debug("Reached maximum collection time, ending collection.");
374 if (pollfd[FD_SIGNAL].revents) {
375 log_debug("Got signal.");
379 if (pollfd[FD_INOTIFY].revents) {
380 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
381 struct inotify_event *e;
383 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
384 if (errno == EINTR || errno == EAGAIN)
387 log_error("Failed to read inotify event: %m");
/* Walk the batch of variable-length inotify events. */
392 e = (struct inotify_event*) inotify_buffer;
396 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
397 log_debug("Collection canceled");
402 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
403 log_debug("Got termination request");
407 step = sizeof(struct inotify_event) + e->len;
408 assert(step <= (size_t) n);
410 e = (struct inotify_event*) ((uint8_t*) e + step);
415 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
417 if (errno == EINTR || errno == EAGAIN)
420 /* fanotify sometimes returns EACCES on read()
421 * where it shouldn't. For now let's just
422 * ignore it here (which is safe), but
423 * eventually this should be
424 * dropped when the kernel is fixed.
426 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
430 log_error("Failed to read event: %m");
/* Walk the batch of fanotify events. */
435 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens caused by ourselves or by the replay process, whose pid
 * is published in the shared memory segment. */
442 if (m->pid == my_pid)
445 __sync_synchronize();
446 if (m->pid == shared->replay)
/* Resolve the event fd to a path via /proc/self/fd. */
449 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
452 if ((k = readlink_malloc(fn, &p)) >= 0) {
453 if (startswith(p, "/tmp") ||
454 endswith(p, " (deleted)") ||
455 hashmap_get(files, p))
456 /* Not interesting, or
464 entry = new0(struct item, 1);
470 ul = fd_first_block(m->fd);
472 entrytime = now(CLOCK_MONOTONIC);
475 entry->path = strdup(p);
/* Bin files into 2s buckets of elapsed time since collection start;
 * the bin is the primary sort key in qsort_compare(). */
481 entry->bin = (entrytime - starttime) / 2000000;
483 k = hashmap_put(files, p, entry);
485 log_warning("hashmap_put() failed: %s", strerror(-k));
491 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Each fanotify event carries an open fd we must close. */
495 close_nointr_nofail(m->fd);
/* Collection finished: close fanotify before the write-out phase so our
 * own reads below do not generate further events. */
500 if (fanotify_fd >= 0) {
501 close_nointr_nofail(fanotify_fd);
505 log_debug("Writing Pack File...");
507 on_ssd = fs_on_ssd(root) > 0;
508 log_debug("On SSD: %s", yes_no(on_ssd));
510 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_CMP(sfs.f_type, BTRFS_SUPER_MAGIC);
511 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary .readahead.new and rename over the final name
 * below, so readers never see a half-written pack file. */
513 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
518 pack = fopen(pack_fn_new, "we");
520 log_error("Failed to open pack file: %m");
/* Header: host/version banner plus 'S' (SSD, unordered) or 'R'
 * (rotating, block-ordered). */
525 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
526 putc(on_ssd ? 'S' : 'R', pack);
528 if (on_ssd || on_btrfs) {
530 /* On SSD or on btrfs, just write things out in the
531 * order the files were accessed. */
533 HASHMAP_FOREACH_KEY(q, p, files, i)
534 pack_file(pack, p, on_btrfs);
536 struct item *ordered, *j;
539 /* On rotating media, order things by the block
542 log_debug("Ordering...");
/* Copy the hashmap entries into a flat array so they can be qsorted. */
544 n = hashmap_size(files);
545 if (!(ordered = new(struct item, n))) {
551 HASHMAP_FOREACH_KEY(q, p, files, i) {
552 memcpy(j, q, sizeof(struct item));
556 assert(ordered + n == j);
558 qsort(ordered, n, sizeof(struct item), qsort_compare);
560 for (k = 0; k < n; k++)
561 pack_file(pack, ordered[k].path, on_btrfs);
566 log_debug("Finalizing...");
571 log_error("Failed to write pack file.");
576 if (rename(pack_fn_new, pack_fn) < 0) {
577 log_error("Failed to rename readahead file: %m");
/* Cleanup path (labels missing from this excerpt): close fds, free the
 * hashmap contents, restore kernel readahead. */
588 if (fanotify_fd >= 0)
589 close_nointr_nofail(fanotify_fd);
592 close_nointr_nofail(signal_fd);
595 close_nointr_nofail(inotify_fd);
604 while ((p = hashmap_steal_first_key(files)))
609 if (previous_block_readahead_set) {
612 /* Restore the original kernel readahead setting if we
613 * changed it, and nobody has overwritten it since
615 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
616 block_set_readahead(root, previous_block_readahead);
622 int main_collect(const char *root) {
627 /* Skip this step on read-only media. Note that we check the
628 * underlying block device here, not the read-only flag of the
629 * file system on top, since that one is most likely mounted
630 * read-only anyway at boot, even if the underlying block
631 * device is theoretically writable. */
632 if (fs_on_read_only(root) > 0) {
633 log_info("Disabling readahead collector due to read-only media.");
638 log_info("Disabling readahead collector due to low memory.");
642 shared = shared_get();
646 shared->collect = getpid();
647 __sync_synchronize();
649 if (collect(root) < 0)