/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/limits.h>
#include <sys/select.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/poll.h>
#include <linux/fanotify.h>
#include <sys/signalfd.h>
#include <linux/fiemap.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>

#ifdef HAVE_LINUX_BTRFS_H
#include <linux/btrfs.h>
#endif

#ifdef HAVE_FANOTIFY_INIT
#include <sys/fanotify.h>
#endif

#include <systemd/sd-daemon.h>

#include "missing.h"
#include "util.h"
#include "set.h"
#include "ioprio.h"
#include "readahead-common.h"
/* fixme:
 *
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */
static ReadaheadShared *shared = NULL;
static usec_t starttime;
/* Avoid collisions with the NULL pointer */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
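
/* Best-effort: ask btrfs to defragment the file so that its extents
 * end up (mostly) contiguous before we record their location. The
 * caller ignores failures. */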
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
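
/* Appends one record for the file "fn" to the pack file: the file name
 * followed by '\n', the inode number, then a series of (begin, end)
 * page index pairs describing which pages were found in the page cache,
 * terminated by a (0, 0) marker. An empty range list means "read the
 * entire file". */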
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        struct stat st;
        void *start = MAP_FAILED;
        uint8_t *vec;
        uint32_t b, c;
        uint64_t inode;
        size_t l, pages;
        bool mapped;
        int r = 0, fd = -1, k;

        assert(pack);
        assert(fn);

        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0) {
                if (errno == EPERM || errno == EACCES)
                        return 0;

                log_warning("open(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        k = file_verify(fd, fn, arg_file_size_max, &st);
        if (k <= 0) {
                r = k;
                goto finish;
        }

        if (on_btrfs)
                btrfs_defrag(fd);

        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        pages = l / page_size();
        vec = alloca(pages);
        memset(vec, 0, pages);

        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        fputs(fn, pack);
        fputc('\n', pack);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        mapped = false;
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);

                if (!mapped && new_mapped)
                        b = c;
                else if (mapped && !new_mapped) {
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);
                }

                mapped = new_mapped;
        }

        /* We don't write any range data if we should read the entire file */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);
        }

        /* End marker */
        b = 0;
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

finish:
        if (start != MAP_FAILED)
                munmap(start, l);

        if (fd >= 0)
                close_nointr_nofail(fd);

        return r;
}
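
/* Returns the physical position of the first extent of the file, used
 * for sorting files by their on-disk location, or 0 if it cannot be
 * determined. */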
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
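
/* One collected file, as sorted by qsort_compare() below: its path, the
 * physical block of its first extent, and the 2s time bucket in which
 * it was first accessed. */
struct item {
        const char *path;
        unsigned long block;
        unsigned long bin;
};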
static int qsort_compare(const void *a, const void *b) {
        const struct item *i, *j;

        i = a;
        j = b;

        /* sort by bin first */
        if (i->bin < j->bin)
                return -1;
        if (i->bin > j->bin)
                return 1;

        /* then sort by sector */
        if (i->block < j->block)
                return -1;
        if (i->block > j->block)
                return 1;

        return strcmp(i->path, j->path);
}
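
/* Watches all file opens below "root" via fanotify until the timeout
 * elapses, a signal arrives, or we are told to stop early through
 * /run/systemd/readahead/{cancel,done}, then writes the collected file
 * list out as a pack file. */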
static int collect(const char *root) {
        enum {
                FD_FANOTIFY,  /* Get the actual fs events */
                FD_SIGNAL,
                FD_INOTIFY,   /* We get notifications to quit early via this fd */
                _FD_MAX
        };
        struct pollfd pollfd[_FD_MAX] = {};
        int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
        pid_t my_pid;
        Hashmap *files = NULL;
        Iterator i;
        char *p, *q;
        sigset_t mask;
        FILE *pack = NULL;
        char *pack_fn_new = NULL, *pack_fn = NULL;
        bool on_ssd, on_btrfs;
        struct statfs sfs;
        usec_t not_after;
        uint64_t previous_block_readahead;
        bool previous_block_readahead_set = false;

        assert(root);
        if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
                r = log_oom();
                goto finish;
        }

        starttime = now(CLOCK_MONOTONIC);
        /* If there's no pack file yet we lower the kernel readahead
         * so that mincore() is accurate. If there is a pack file
         * already we assume it is accurate enough so that kernel
         * readahead is never triggered. */
        previous_block_readahead_set =
                access(pack_fn, F_OK) < 0 &&
                block_get_readahead(root, &previous_block_readahead) >= 0 &&
                block_set_readahead(root, 8*1024) >= 0;
        if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
                log_warning("Failed to set IDLE IO priority class: %m");

        assert_se(sigemptyset(&mask) == 0);
        sigset_add_many(&mask, SIGINT, SIGTERM, -1);
        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
        if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        files = hashmap_new(string_hash_func, string_compare_func);
        if (!files) {
                log_error("Failed to allocate set.");
                r = -ENOMEM;
                goto finish;
        }
        fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
        if (fanotify_fd < 0) {
                log_error("Failed to create fanotify object: %m");
                r = -errno;
                goto finish;
        }

        if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
                log_error("Failed to mark %s: %m", root);
                r = -errno;
                goto finish;
        }

        inotify_fd = open_inotify();
        if (inotify_fd < 0) {
                r = inotify_fd;
                goto finish;
        }
        not_after = now(CLOCK_MONOTONIC) + arg_timeout;

        my_pid = getpid();
        pollfd[FD_FANOTIFY].fd = fanotify_fd;
        pollfd[FD_FANOTIFY].events = POLLIN;
        pollfd[FD_SIGNAL].fd = signal_fd;
        pollfd[FD_SIGNAL].events = POLLIN;
        pollfd[FD_INOTIFY].fd = inotify_fd;
        pollfd[FD_INOTIFY].events = POLLIN;

        sd_notify(0,
                  "READY=1\n"
                  "STATUS=Collecting readahead data");

        log_debug("Collecting...");
330 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
331 log_debug("Collection canceled");
336 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
337 log_debug("Got termination request");
        for (;;) {
                union {
                        struct fanotify_event_metadata metadata;
                        char buffer[4096];
                } data;
                ssize_t n;
                struct fanotify_event_metadata *m;
                usec_t t;
                int h;

                if (hashmap_size(files) > arg_files_max) {
                        log_debug("Reached maximum number of read ahead files, ending collection.");
                        break;
                }

                t = now(CLOCK_MONOTONIC);
                if (t >= not_after) {
                        log_debug("Reached maximum collection time, ending collection.");
                        break;
                }
                if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {

                        if (errno == EINTR)
                                continue;

                        log_error("poll(): %m");
                        r = -errno;
                        goto finish;
                }

                if (h == 0) {
                        log_debug("Reached maximum collection time, ending collection.");
                        break;
                }
                if (pollfd[FD_SIGNAL].revents) {
                        log_debug("Got signal.");
                        break;
                }
                if (pollfd[FD_INOTIFY].revents) {
                        uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
                        struct inotify_event *e;

                        if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
                                if (errno == EINTR || errno == EAGAIN)
                                        continue;

                                log_error("Failed to read inotify event: %m");
                                r = -errno;
                                goto finish;
                        }

                        e = (struct inotify_event*) inotify_buffer;
                        while (n > 0) {
                                size_t step;

                                if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
                                        log_debug("Collection canceled");
                                        r = -ECANCELED;
                                        goto finish;
                                }

                                if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
                                        log_debug("Got termination request");
                                        goto done;
                                }

                                step = sizeof(struct inotify_event) + e->len;
                                assert(step <= (size_t) n);

                                e = (struct inotify_event*) ((uint8_t*) e + step);
                                n -= step;
                        }
                }
                n = read(fanotify_fd, &data, sizeof(data));
                if (n < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        /* fanotify sometimes returns EACCES on read()
                         * where it shouldn't. For now let's just
                         * ignore it here (which is safe), but
                         * eventually this should be dropped when
                         * the kernel is fixed.
                         *
                         * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
                        if (errno == EACCES)
                                continue;

                        log_error("Failed to read event: %m");
                        r = -errno;
                        goto finish;
                }
                for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                        char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
                        int k;

                        if (m->fd < 0)
                                goto next;

                        /* Ignore accesses by our own process */
                        if (m->pid == my_pid)
                                goto next;

                        /* Ignore accesses by the replay process */
                        __sync_synchronize();
                        if (m->pid == shared->replay)
                                goto next;

                        snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
                        k = readlink_malloc(fn, &p);
                        if (k >= 0) {
                                if (startswith(p, "/tmp") ||
                                    endswith(p, " (deleted)") ||
                                    hashmap_get(files, p))
                                        /* Not interesting, or
                                         * already read */
                                        free(p);
                                else {
                                        unsigned long ul;
                                        usec_t entrytime;
                                        struct item *entry;

                                        entry = new0(struct item, 1);
                                        if (!entry) {
                                                r = log_oom();
                                                goto finish;
                                        }

                                        ul = fd_first_block(m->fd);

                                        entrytime = now(CLOCK_MONOTONIC);

                                        entry->block = ul;
                                        entry->path = strdup(p);
                                        if (!entry->path) {
                                                free(entry);
                                                r = log_oom();
                                                goto finish;
                                        }

                                        /* Group files into 2s buckets, by time of first access */
                                        entry->bin = (entrytime - starttime) / 2000000;

                                        k = hashmap_put(files, p, entry);
                                        if (k < 0) {
                                                log_warning("hashmap_put() failed: %s", strerror(-k));
                                                free(p);
                                        }
                                }
                        } else
                                log_warning("readlink(%s) failed: %s", fn, strerror(-k));

                next:
                        if (m->fd >= 0)
                                close_nointr_nofail(m->fd);
                }
        }
done:
        if (fanotify_fd >= 0) {
                close_nointr_nofail(fanotify_fd);
                fanotify_fd = -1;
        }

        log_debug("Writing Pack File...");
        on_ssd = fs_on_ssd(root) > 0;
        log_debug("On SSD: %s", yes_no(on_ssd));

        on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
        log_debug("On btrfs: %s", yes_no(on_btrfs));
        if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
                r = log_oom();
                goto finish;
        }

        pack = fopen(pack_fn_new, "we");
        if (!pack) {
                log_error("Failed to open pack file: %m");
                r = -errno;
                goto finish;
        }

        /* Pack file header: host identifier plus format version, then one
         * character encoding the media type: 'S' for SSD, 'R' for rotating */
        fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
        putc(on_ssd ? 'S' : 'R', pack);
        if (on_ssd || on_btrfs) {

                /* On SSD or on btrfs, just write things out in the
                 * order the files were accessed. */

                HASHMAP_FOREACH_KEY(q, p, files, i)
                        pack_file(pack, p, on_btrfs);

        } else {
                unsigned n;

                /* On rotating media, order things by the block
                 * numbers */

                log_debug("Ordering...");

                n = hashmap_size(files);
                if (n) {
                        _cleanup_free_ struct item *ordered;
                        struct item *j;
                        unsigned k;

                        ordered = new(struct item, n);
                        if (!ordered) {
                                r = log_oom();
                                goto finish;
                        }

                        j = ordered;
                        HASHMAP_FOREACH_KEY(q, p, files, i) {
                                memcpy(j, q, sizeof(struct item));
                                j++;
                        }

                        assert(ordered + n == j);

                        qsort(ordered, n, sizeof(struct item), qsort_compare);

                        for (k = 0; k < n; k++)
                                pack_file(pack, ordered[k].path, on_btrfs);
                } else
                        log_warning("No pack files");
        }
574 log_debug("Finalizing...");
579 log_error("Failed to write pack file.");
584 if (rename(pack_fn_new, pack_fn) < 0) {
585 log_error("Failed to rename readahead file: %m");
finish:
        if (fanotify_fd >= 0)
                close_nointr_nofail(fanotify_fd);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        if (inotify_fd >= 0)
                close_nointr_nofail(inotify_fd);

        if (pack) {
                fclose(pack);
                unlink(pack_fn_new);
        }
        free(pack_fn_new);
        free(pack_fn);

        while ((p = hashmap_steal_first_key(files)))
                free(p);

        hashmap_free(files);
        if (previous_block_readahead_set) {
                uint64_t bytes;

                /* Restore the original kernel readahead setting if we
                 * changed it, and nobody has overwritten it in the
                 * meantime. */
                if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
                        block_set_readahead(root, previous_block_readahead);
        }

        return r;
}
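
/* Entry point for the collector: registers this process in the shared
 * area so that the replay side can recognize (and ignore) it, then runs
 * the collection loop. */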
int main_collect(const char *root) {

        if (!root)
                root = "/";

        /* Skip this step on read-only media. Note that we check the
         * underlying block device here, not the read-only flag of the
         * file system on top, since that one is most likely mounted
         * read-only anyway at boot, even if the underlying block
         * device is theoretically writable. */
        if (fs_on_read_only(root) > 0) {
                log_info("Disabling readahead collector due to read-only media.");
                return EXIT_SUCCESS;
        }

        if (!enough_ram()) {
                log_info("Disabling readahead collector due to low memory.");
                return EXIT_SUCCESS;
        }

        shared = shared_get();
        if (!shared)
                return EXIT_FAILURE;

        shared->collect = getpid();
        __sync_synchronize();

        if (collect(root) < 0)
                return EXIT_FAILURE;

        return EXIT_SUCCESS;
}