1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
/* Shared-memory state used to coordinate with the readahead replay
 * process; initialized in main_collect() via shared_get(). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; used in
 * collect() to assign each accessed file to a time bin. */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Disk sectors are stored as hashmap values cast to pointers; the +1/-1
 * shift keeps sector 0 from being confused with a NULL value. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask the kernel to defragment the open file referred to by 'fd' on
 * btrfs. Passes the fd itself in the ioctl argument struct and returns
 * the raw ioctl() result: 0 on success, -1 with errno set on failure. */
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data = { .fd = fd };
80 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one file's in-page-cache map to the pack file.
 *
 * Opens 'fn' read-only (O_NOATIME so collection doesn't perturb access
 * times, O_NOFOLLOW to skip symlinks), mmap()s it, queries mincore()
 * for which pages are resident in the page cache, and writes to 'pack':
 * the inode number (for later staleness detection), then a list of
 * (begin, end) page-index pairs for each resident range, terminated by
 * a pair with begin == end.
 *
 * NOTE(review): this chunk elides several original lines (the skip/
 * error gotos after open() and file_verify(), the vec/b/c declarations,
 * and presumably a btrfs_defrag() call gated on 'on_btrfs' — confirm
 * against the full source). Returns r: 0 on success or benign skip,
 * negative on hard failure. */
83 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
85 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission errors are expected for files we may not read; treat them
 * as a skip rather than a warning-worthy failure. */
102 if (errno == EPERM || errno == EACCES)
105 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the fd against size limits and file type. */
110 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file page-aligned so mincore() can be queried on it. */
119 l = PAGE_ALIGN(st.st_size);
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
122 log_warning("mmap(%s) failed: %m", fn);
/* One vector byte per page; bit 0 of each byte = page resident. */
127 pages = l / page_size();
129 memset(vec, 0, pages);
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the residency vector and emit closed runs of mapped pages as
 * (begin b, end c) pairs. */
144 for (c = 0; c < pages; c++) {
145 bool new_mapped = !!(vec[c] & 1);
147 if (!mapped && new_mapped)
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
153 log_debug("%s: page %u to %u", fn, b, c);
159 /* We don't write any range data if we should read the entire file */
/* Flush a run still open at EOF (b > 0 means it didn't start at page 0,
 * i.e. it is a partial-file range worth recording). */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
164 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: a degenerate range with begin == end. */
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
173 if (start != MAP_FAILED)
177 close_nointr_nofail(fd);
/* Return the physical disk location of the first extent of 'fd', for
 * sorting files into on-disk order on rotating media. Uses the
 * FS_IOC_FIEMAP ioctl with fm_extent_count = 1 so the kernel fills in
 * at most the first extent. Returns 0 (the elided early-return paths,
 * presumably) when the ioctl fails, no extents are mapped, or the
 * extent's physical location is unknown — confirm against full source.
 * NOTE(review): the declaration of 'data' (a struct embedding fiemap +
 * one fiemap_extent) is partially elided in this chunk. */
182 static unsigned long fd_first_block(int fd) {
184 struct fiemap fiemap;
185 struct fiemap_extent extent;
/* Map from offset 0 over the whole file, asking for one extent only. */
187 .fiemap.fm_length = ~0ULL,
188 .fiemap.fm_extent_count = 1,
191 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
194 if (data.fiemap.fm_mapped_extents <= 0)
/* An extent whose physical location the fs cannot report is useless
 * for sorting. */
197 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
200 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders the readahead list so that
 * files accessed earlier (lower time bin) come first, then by physical
 * disk block to minimize seeks, then by path as a stable tie-breaker.
 * NOTE(review): the assignments of i/j from a/b and the bin comparison
 * itself are elided in this chunk — confirm against full source. */
209 static int qsort_compare(const void *a, const void *b) {
210 const struct item *i, *j;
215 /* sort by bin first */
221 /* then sort by sector */
222 if (i->block < j->block)
224 if (i->block > j->block)
/* Finally disambiguate by path so the sort is deterministic. */
227 return strcmp(i->path, j->path);
/* Main collection pass: watch all file opens below 'root' via fanotify
 * for a bounded time window, record which pages of each opened file are
 * in the page cache, and write the result as a pack file
 * ("<root>/.readahead") that the replay phase can stream back in early
 * at the next boot. Termination is triggered by SIGINT/SIGTERM (via
 * signalfd), by creation of /run/systemd/readahead/{cancel,done}
 * (watched via inotify), by reaching arg_files_max files, or by the
 * arg_timeout deadline. Returns 0 on success, negative on failure.
 *
 * NOTE(review): this chunk elides many original lines (goto finish
 * error paths, several declarations, loop headers); comments below are
 * written against what is visible. */
230 static int collect(const char *root) {
/* Indices into pollfd[] for the three event sources. */
232 FD_FANOTIFY, /* Get the actual fs events */
234 FD_INOTIFY, /* We get notifications to quit early via this fd */
237 struct pollfd pollfd[_FD_MAX] = {};
238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
240 Hashmap *files = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
246 bool on_ssd, on_btrfs;
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
/* Reference point for the per-file time bins computed below. */
259 starttime = now(CLOCK_MONOTONIC);
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection doesn't slow down boot. */
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them through signalfd instead, so
 * they can be multiplexed with the other fds in poll(). */
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
/* path -> struct item map of everything seen so far. */
283 files = hashmap_new(string_hash_func, string_compare_func);
285 log_error("Failed to allocate set.");
/* O_NOATIME on the event fds keeps us from dirtying atimes of the very
 * files we observe. */
290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
291 if (fanotify_fd < 0) {
292 log_error("Failed to create fanotify object: %m");
/* Watch every open() on the whole mount containing 'root'. */
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
/* Absolute deadline for the collection window. */
309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
322 "STATUS=Collecting readahead data");
324 log_debug("Collecting...");
/* A cancel/done flag file may already exist before we started the
 * inotify watch — check once up front. */
326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
327 log_debug("Collection canceled");
332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
333 log_debug("Got termination request");
/* Buffer for a batch of fanotify events (declaration of the enclosing
 * union/struct 'data' is elided in this chunk). */
339 struct fanotify_event_metadata metadata;
343 struct fanotify_event_metadata *m;
347 if (hashmap_size(files) > arg_files_max) {
348 log_debug("Reached maximum number of read ahead files, ending collection.");
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
/* Wait for the next event, but never past the deadline. */
358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
363 log_error("poll(): %m");
369 log_debug("Reached maximum collection time, ending collection.");
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
386 log_error("Failed to read inotify event: %m");
/* Walk all inotify events in the buffer looking for the cancel/done
 * flag files appearing in /run/systemd/readahead. */
391 e = (struct inotify_event*) inotify_buffer;
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
409 e = (struct inotify_event*) ((uint8_t*) e + step);
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
416 if (errno == EINTR || errno == EAGAIN)
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
429 log_error("Failed to read event: %m");
434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens caused by ourselves, and by the replay process
 * (whose pid is published in the shared memory segment). */
441 if (m->pid == my_pid)
444 __sync_synchronize();
445 if (m->pid == shared->replay)
/* Resolve the event fd back to a path via /proc/self/fd. */
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
452 if (startswith(p, "/tmp") ||
453 endswith(p, " (deleted)") ||
454 hashmap_get(files, p))
455 /* Not interesting, or
/* New file: record its path, first physical block and the 2s
 * time bin since collection start. */
463 entry = new0(struct item, 1);
469 ul = fd_first_block(m->fd);
471 entrytime = now(CLOCK_MONOTONIC);
474 entry->path = strdup(p);
480 entry->bin = (entrytime - starttime) / 2000000;
482 if ((k = hashmap_put(files, p, entry)) < 0) {
483 log_warning("set_put() failed: %s", strerror(-k));
489 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the fd fanotify handed us, whatever happened. */
493 close_nointr_nofail(m->fd);
/* Collection over: stop receiving events before writing the pack. */
498 if (fanotify_fd >= 0) {
499 close_nointr_nofail(fanotify_fd);
503 log_debug("Writing Pack File...");
505 on_ssd = fs_on_ssd(root) > 0;
506 log_debug("On SSD: %s", yes_no(on_ssd));
508 on_btrfs = statfs(root, &sfs) >= 0 && (unsigned) sfs.f_type == BTRFS_SUPER_MAGIC;
509 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temp name first, rename() into place when complete, so a
 * crash never leaves a truncated pack file behind. */
511 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
516 pack = fopen(pack_fn_new, "we");
518 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus one byte for the media type. */
523 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
524 putc(on_ssd ? 'S' : 'R', pack);
526 if (on_ssd || on_btrfs) {
528 /* On SSD or on btrfs, just write things out in the
529 * order the files were accessed. */
531 HASHMAP_FOREACH_KEY(q, p, files, i)
532 pack_file(pack, p, on_btrfs);
534 struct item *ordered, *j;
537 /* On rotating media, order things by the block
540 log_debug("Ordering...");
/* Copy the hashmap entries into a flat array so qsort() can order
 * them by (bin, block, path). */
542 n = hashmap_size(files);
543 if (!(ordered = new(struct item, n))) {
549 HASHMAP_FOREACH_KEY(q, p, files, i) {
550 memcpy(j, q, sizeof(struct item));
554 assert(ordered + n == j);
556 qsort(ordered, n, sizeof(struct item), qsort_compare);
558 for (k = 0; k < n; k++)
559 pack_file(pack, ordered[k].path, on_btrfs);
564 log_debug("Finalizing...");
569 log_error("Failed to write pack file.");
/* Atomically publish the finished pack file. */
574 if (rename(pack_fn_new, pack_fn) < 0) {
575 log_error("Failed to rename readahead file: %m");
/* Cleanup: close whatever fds are still open and free the map. */
586 if (fanotify_fd >= 0)
587 close_nointr_nofail(fanotify_fd);
590 close_nointr_nofail(signal_fd);
593 close_nointr_nofail(inotify_fd);
602 while ((p = hashmap_steal_first_key(files)))
607 if (previous_block_readahead_set) {
610 /* Restore the original kernel readahead setting if we
611 * changed it, and nobody has overwritten it since
/* Only restore if the current value is still our own 8 KiB setting;
 * otherwise someone else changed it and we leave it alone. */
613 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
614 block_set_readahead(root, previous_block_readahead);
/* Entry point for the "collect" verb: run pre-flight checks (writable
 * media, enough memory), register this process as the collector in the
 * shared memory segment, then run collect(). Return value lines are
 * elided in this chunk — presumably EXIT_SUCCESS/EXIT_FAILURE; confirm
 * against full source. */
620 int main_collect(const char *root) {
625 /* Skip this step on read-only media. Note that we check the
626 * underlying block device here, not the read-only flag of the
627 * file system on top, since that one is most likely mounted
628 * read-only anyway at boot, even if the underlying block
629 * device is theoretically writable. */
630 if (fs_on_read_only(root) > 0) {
631 log_info("Disabling readahead collector due to read-only media.");
636 log_info("Disabling readahead collector due to low memory.");
/* Publish our pid in the shared segment so collect() can later filter
 * out events caused by the replay process, and vice versa. */
640 shared = shared_get();
644 shared->collect = getpid();
/* Full barrier so the pid store is visible to the other process before
 * we start collecting. */
645 __sync_synchronize();
647 if (collect(root) < 0)