/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
/* Shared state used to coordinate with other readahead processes
 * (written in main_collect() below; the replay pid is read from it in
 * collect()).  NOTE(review): exact lifetime/ownership of this mapping
 * is not visible in this excerpt — confirm against shared_get(). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken when collection starts; used in
 * collect() to bin files by how early they were first accessed. */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Offset sector numbers by one before casting to void*, so that a
 * sector value of 0 does not turn into a NULL pointer. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask the btrfs kernel driver to defragment the file referred to by
 * fd.  Returns the ioctl() result: 0 on success, -1 with errno set on
 * failure.  NOTE(review): lines are missing from this excerpt — `data`
 * is presumably zero-initialized before the ioctl; confirm in the full
 * source. */
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data;
83 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one per-file record for "fn" to the pack file stream: the
 * file's inode number followed by the ranges of pages currently
 * resident in the page cache (as reported by mincore()), terminated by
 * a sentinel.  NOTE(review): many interior lines are missing from this
 * excerpt; the comments below describe only the visible statements.
 * The on_btrfs parameter presumably triggers btrfs_defrag() on a path
 * not visible here — confirm in the full source. */
86 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
88 void *start = MAP_FAILED;
94 int r = 0, fd = -1, k;
/* O_NOFOLLOW: never chase symlinks; O_NOATIME: don't perturb access
 * times while we merely sample the cache state. */
99 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Lack of permission on individual files is expected and non-fatal. */
105 if (errno == EPERM || errno == EACCES)
108 log_warning("open(%s) failed: %m", fn);
113 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole (page-aligned) file read-only so mincore() can tell us
 * which of its pages are resident. */
122 l = PAGE_ALIGN(st.st_size);
123 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
124 if (start == MAP_FAILED) {
125 log_warning("mmap(%s) failed: %m", fn);
130 pages = l / page_size();
/* mincore() writes one byte per page into vec; bit 0 = resident. */
132 memset(vec, 0, pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
/* Scan the residency vector and emit each maximal run of resident
 * pages as a [b, c) range of page indices. */
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
150 if (!mapped && new_mapped)
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
156 log_debug("%s: page %u to %u", fn, b, c);
162 /* We don't write any range data if we should read the entire file */
/* Flush a run still open when the vector ends. */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
167 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: a degenerate range (start == end).
 * NOTE(review): b is presumably reset before these writes — confirm. */
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap (when mapped) and close the fd on all paths. */
176 if (start != MAP_FAILED)
180 close_nointr_nofail(fd);
/* Return the physical on-disk byte position of the first extent of the
 * file referred to by fd, obtained via the FIEMAP ioctl.  Used as a
 * sort key so files can be read back in disk order on rotating media.
 * NOTE(review): lines are missing from this excerpt — "data" is
 * presumably a local struct/union embedding a struct fiemap with space
 * for one trailing fiemap_extent, and the missing lines presumably
 * return 0 on the error paths; confirm in the full source. */
185 static unsigned long fd_first_block(int fd) {
187 struct fiemap fiemap;
188 struct fiemap_extent extent;
/* Request mapping of the whole file, but only the first extent. */
192 data.fiemap.fm_length = ~0ULL;
193 data.fiemap.fm_extent_count = 1;
195 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
/* No extents mapped (e.g. empty or fully sparse file). */
198 if (data.fiemap.fm_mapped_extents <= 0)
/* The kernel could not determine the physical location. */
201 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
204 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* Comparison callback for qsort(): orders struct item entries first by
 * access-time bin, then by on-disk block, and finally by path name so
 * the resulting order is fully deterministic.
 * NOTE(review): the lines casting a/b into i/j and the bin comparison
 * itself are missing from this excerpt. */
213 static int qsort_compare(const void *a, const void *b) {
214 const struct item *i, *j;
219 /* sort by bin first */
225 /* then sort by sector */
226 if (i->block < j->block)
228 if (i->block > j->block)
/* Same bin and same block: fall back to lexicographic path order. */
231 return strcmp(i->path, j->path);
/* Main collection pass: watch the mount below "root" with fanotify,
 * record every file opened during boot together with when it was first
 * accessed and where its first block lives on disk, then write the
 * result as the ".readahead" pack file — ordered by disk location on
 * rotating media, by access order on SSD/btrfs.  Returns 0 on success,
 * negative on failure.
 * NOTE(review): a large number of lines are missing from this excerpt;
 * the comments below only describe what the visible statements show. */
234 static int collect(const char *root) {
/* Indices into the pollfd array below (one fd per event source). */
236 FD_FANOTIFY, /* Get the actual fs events */
238 FD_INOTIFY, /* We get notifications to quit early via this fd */
241 struct pollfd pollfd[_FD_MAX];
242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
244 Hashmap *files = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
250 bool on_ssd, on_btrfs;
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
/* Reference time for binning file accesses (see entry->bin below). */
263 starttime = now(CLOCK_MONOTONIC);
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection doesn't compete with boot. */
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and take them via a signalfd instead, so they
 * can be handled synchronously inside the poll() loop. */
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
/* Maps file path -> struct item for every file we saw opened. */
287 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
288 log_error("Failed to allocate set.");
/* Watch the entire mount for FAN_OPEN events. */
293 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
294 log_error("Failed to create fanotify object: %m");
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
305 if ((inotify_fd = open_inotify()) < 0) {
/* Hard deadline for the whole collection phase. */
310 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
324 "STATUS=Collecting readahead data");
326 log_debug("Collecting...");
/* Other components ask us to stop early by creating flag files in
 * /run/systemd/readahead/ (also watched via inotify below). */
328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
329 log_debug("Collection canceled");
334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
335 log_debug("Got termination request");
341 struct fanotify_event_metadata metadata;
345 struct fanotify_event_metadata *m;
/* Stop once enough files have been gathered... */
349 if (hashmap_size(files) > arg_files_max) {
350 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ...or once the deadline has passed. */
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
/* Wait for events, at most until the deadline. */
360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
365 log_error("poll(): %m");
371 log_debug("Reached maximum collection time, ending collection.");
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
/* Drain the inotify fd: creation of "cancel"/"done" in the runtime
 * directory terminates the collection loop. */
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
388 log_error("Failed to read inotify event: %m");
393 e = (struct inotify_event*) inotify_buffer;
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
/* Advance to the next (variable-length) inotify event in the buffer. */
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
411 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Read a batch of fanotify events. */
416 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
418 if (errno == EINTR || errno == EAGAIN)
421 /* fanotify sometimes returns EACCES on read()
422 * where it shouldn't. For now let's just
423 * ignore it here (which is safe), but
424 * eventually this should be
425 * dropped when the kernel is fixed.
427 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
431 log_error("Failed to read event: %m");
436 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Skip opens caused by ourselves and by the replay process (its pid
 * is published through the shared mapping). */
443 if (m->pid == my_pid)
446 __sync_synchronize();
447 if (m->pid == shared->replay)
/* Resolve the fd delivered with the event to a path. */
450 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
453 if ((k = readlink_malloc(fn, &p)) >= 0) {
/* Skip temp files, deleted files and files already recorded. */
454 if (startswith(p, "/tmp") ||
455 endswith(p, " (deleted)") ||
456 hashmap_get(files, p))
457 /* Not interesting, or
465 entry = new0(struct item, 1);
471 ul = fd_first_block(m->fd);
473 entrytime = now(CLOCK_MONOTONIC);
476 entry->path = strdup(p);
/* Bin accesses in 2s intervals since collection start; the bin is the
 * primary sort key for rotating media (see qsort_compare()). */
482 entry->bin = (entrytime - starttime) / 2000000;
484 if ((k = hashmap_put(files, p, entry)) < 0) {
485 log_warning("set_put() failed: %s", strerror(-k));
491 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the fd fanotify handed us, on every path. */
495 close_nointr_nofail(m->fd);
500 if (fanotify_fd >= 0) {
501 close_nointr_nofail(fanotify_fd);
505 log_debug("Writing Pack File...");
507 on_ssd = fs_on_ssd(root) > 0;
508 log_debug("On SSD: %s", yes_no(on_ssd));
510 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
511 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary file and rename() it into place below, so
 * readers never observe a partially written pack file. */
513 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
518 pack = fopen(pack_fn_new, "we");
520 log_error("Failed to open pack file: %m");
525 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
/* Single-character media flag in the header: 'S'SD vs 'R'otating. */
526 putc(on_ssd ? 'S' : 'R', pack);
528 if (on_ssd || on_btrfs) {
530 /* On SSD or on btrfs, just write things out in the
531 * order the files were accessed. */
533 HASHMAP_FOREACH_KEY(q, p, files, i)
534 pack_file(pack, p, on_btrfs);
536 struct item *ordered, *j;
539 /* On rotating media, order things by the block
542 log_debug("Ordering...");
/* Flatten the hashmap into an array and sort it with qsort_compare. */
544 n = hashmap_size(files);
545 if (!(ordered = new(struct item, n))) {
551 HASHMAP_FOREACH_KEY(q, p, files, i) {
552 memcpy(j, q, sizeof(struct item));
556 assert(ordered + n == j);
558 qsort(ordered, n, sizeof(struct item), qsort_compare);
560 for (k = 0; k < n; k++)
561 pack_file(pack, ordered[k].path, on_btrfs);
566 log_debug("Finalizing...");
571 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
576 if (rename(pack_fn_new, pack_fn) < 0) {
577 log_error("Failed to rename readahead file: %m");
/* Cleanup: close all fds and release the collected entries. */
588 if (fanotify_fd >= 0)
589 close_nointr_nofail(fanotify_fd);
592 close_nointr_nofail(signal_fd);
595 close_nointr_nofail(inotify_fd);
604 while ((p = hashmap_steal_first_key(files)))
609 if (previous_block_readahead_set) {
612 /* Restore the original kernel readahead setting if we
613 * changed it, and nobody has overwritten it since
/* Only restore if the current value is still our 8 KiB override. */
615 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
616 block_set_readahead(root, previous_block_readahead);
622 int main_collect(const char *root) {
627 /* Skip this step on read-only media. Note that we check the
628 * underlying block device here, not he read-only flag of the
629 * file system on top, since that one is most likely mounted
630 * read-only anyway at boot, even if the underlying block
631 * device is theoretically writable. */
632 if (fs_on_read_only(root) > 0) {
633 log_info("Disabling readahead collector due to read-only media.");
638 log_info("Disabling readahead collector due to low memory.");
642 shared = shared_get();
646 shared->collect = getpid();
647 __sync_synchronize();
649 if (collect(root) < 0)