1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
/* Shared-memory segment used to coordinate with the readahead replay
 * process (see shared->replay use below); obtained via shared_get() in
 * main_collect(). */
70 static ReadaheadShared *shared = NULL;
/* Monotonic timestamp taken when collection starts; used to bin file
 * accesses by time (see entry->bin computation in collect()). */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Sectors are stored in pointer-keyed containers; shifting by +1 keeps
 * sector 0 from mapping to NULL. PTR_TO_SECTOR undoes the shift. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Asks btrfs to defragment the file referred to by fd, so that its
 * extents are contiguous for later sequential readahead.
 * Returns the ioctl() result: 0 on success, -1 with errno set on error.
 * NOTE(review): the initialization of 'data' is not visible in this
 * excerpt — presumably zeroed before the ioctl; confirm in full source. */
77 static int btrfs_defrag(int fd) {
78         struct btrfs_ioctl_vol_args data;
83         return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends one file's readahead record to the pack file: the file name
 * (written in an elided portion), its inode number, and the list of
 * page ranges currently resident in the page cache (per mincore()).
 * Ranges are encoded as pairs of page indices; a terminating pair of
 * equal values (0,0 via 'b') marks the end of the record.
 * Returns 0 on success or if the file is skipped, negative on hard error.
 * NOTE(review): the on_btrfs parameter is unused in the visible lines —
 * presumably it gates a btrfs_defrag() call in an elided section. */
86 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
88         void *start = MAP_FAILED;
94         int r = 0, fd = -1, k;
        /* O_NOATIME so collection doesn't perturb access times;
         * O_NOFOLLOW so symlinks are not chased. */
99         fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        /* Permission failures are expected (O_NOATIME needs ownership or
         * CAP_FOWNER) and are not worth a warning. */
105                 if (errno == EPERM || errno == EACCES)
108                 log_warning("open(%s) failed: %m", fn);
        /* Sanity checks (size limit, regular file, ...) delegated to
         * file_verify(); also fills in 'st'. */
113         k = file_verify(fd, fn, arg_file_size_max, &st);
        /* Map the whole file (page-aligned length) so mincore() can
         * report which of its pages are resident. */
122         l = PAGE_ALIGN(st.st_size);
123         start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
124         if (start == MAP_FAILED) {
125                 log_warning("mmap(%s) failed: %m", fn);
130         pages = l / page_size();
132         memset(vec, 0, pages);
133         if (mincore(start, l, vec) < 0) {
134                 log_warning("mincore(%s) failed: %m", fn);
142         /* Store the inode, so that we notice when the file is deleted */
143         inode = (uint64_t) st.st_ino;
144         fwrite(&inode, sizeof(inode), 1, pack);
        /* Walk the mincore() vector and emit [b, c) ranges of resident
         * pages: 'b' records where a run starts, a transition back to
         * non-resident flushes the pair. */
147         for (c = 0; c < pages; c++) {
148                 bool new_mapped = !!(vec[c] & 1);
150                 if (!mapped && new_mapped)
152                 else if (mapped && !new_mapped) {
153                         fwrite(&b, sizeof(b), 1, pack);
154                         fwrite(&c, sizeof(c), 1, pack);
156                         log_debug("%s: page %u to %u", fn, b, c);
162         /* We don't write any range data if we should read the entire file */
        /* Flush a run still open at the end of the vector — unless it
         * started at page 0, i.e. the whole file is resident. */
163         if (mapped && b > 0) {
164                 fwrite(&b, sizeof(b), 1, pack);
165                 fwrite(&c, sizeof(c), 1, pack);
167                 log_debug("%s: page %u to %u", fn, b, c);
        /* End-of-record marker: a degenerate pair (same value twice).
         * NOTE(review): 'b' is presumably reset to 0 in an elided line. */
172         fwrite(&b, sizeof(b), 1, pack);
173         fwrite(&b, sizeof(b), 1, pack);
        /* Cleanup path: unmap (elided) and close on all exits. */
176         if (start != MAP_FAILED)
180                 close_nointr_nofail(fd);
/* Returns the physical block (byte offset of the first extent, per
 * FS_IOC_FIEMAP) of the file behind fd, for sorting files into on-disk
 * order on rotating media. Returns 0 when the extent cannot be
 * determined (ioctl failure, no extents, or unknown extent location),
 * per the elided early-return paths. */
185 static unsigned long fd_first_block(int fd) {
        /* 'data' (elided declaration) embeds a fiemap header plus room
         * for exactly one inline fiemap_extent. */
187                 struct fiemap fiemap;
188                 struct fiemap_extent extent;
        /* Query the whole file but ask for at most one extent — we only
         * care where the file starts. */
192         data.fiemap.fm_length = ~0ULL;
193         data.fiemap.fm_extent_count = 1;
195         if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
198         if (data.fiemap.fm_mapped_extents <= 0)
201         if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
204         return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders by access-time bin first
 * (comparison elided here), then by first physical block, and finally
 * by path as a deterministic tie-breaker. Used to lay out the pack
 * file in rough disk order on rotating media. */
213 static int qsort_compare(const void *a, const void *b) {
214         const struct item *i, *j;
219         /* sort by bin first */
225         /* then sort by sector */
226         if (i->block < j->block)
228         if (i->block > j->block)
        /* Equal bin and block: fall back to path comparison so the sort
         * is total and stable across runs. */
231         return strcmp(i->path, j->path);
/* Core collection pass: watches all file opens below 'root' via
 * fanotify for a bounded time window, records each accessed file
 * (path, first physical block, time bin), then writes them out as a
 * pack file ($root/.readahead) — in access order on SSD/btrfs, in
 * disk-block order on rotating media. Terminates early on SIGINT/
 * SIGTERM, on a "cancel"/"done" marker in /run/systemd/readahead
 * (detected via inotify), on reaching arg_files_max files, or on
 * timeout. Returns 0 on success, negative on error (elided paths). */
234 static int collect(const char *root) {
        /* Slots in the pollfd array below. */
236                 FD_FANOTIFY, /* Get the actual fs events */
238                 FD_INOTIFY, /* We get notifications to quit early via this fd */
241         struct pollfd pollfd[_FD_MAX];
242         int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
244         Hashmap *files = NULL;
249         char *pack_fn_new = NULL, *pack_fn = NULL;
250         bool on_ssd, on_btrfs;
253         uint64_t previous_block_readahead;
254         bool previous_block_readahead_set = false;
258         if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
263         starttime = now(CLOCK_MONOTONIC);
265         /* If there's no pack file yet we lower the kernel readahead
266          * so that mincore() is accurate. If there is a pack file
267          * already we assume it is accurate enough so that kernel
268          * readahead is never triggered. */
269         previous_block_readahead_set =
270                 access(pack_fn, F_OK) < 0 &&
271                 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272                 block_set_readahead(root, 8*1024) >= 0;
        /* Run at idle I/O priority so collection doesn't compete with
         * the boot it is trying to observe. */
274         if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275                 log_warning("Failed to set IDLE IO priority class: %m");
        /* Block SIGINT/SIGTERM and receive them via signalfd instead,
         * so they can be multiplexed with the other fds in poll(). */
277         assert_se(sigemptyset(&mask) == 0);
278         sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
281         if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282                 log_error("signalfd(): %m");
        /* Path -> struct item map of everything seen so far. */
287         files = hashmap_new(string_hash_func, string_compare_func);
289                 log_error("Failed to allocate set.");
        /* Watch every open() on the whole mount containing 'root'. */
294         fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
295         if (fanotify_fd < 0) {
296                 log_error("Failed to create fanotify object: %m");
301         if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
302                 log_error("Failed to mark %s: %m", root);
        /* inotify on /run/systemd/readahead, for early-exit requests. */
307         inotify_fd = open_inotify();
308         if (inotify_fd < 0) {
313         not_after = now(CLOCK_MONOTONIC) + arg_timeout;
318         pollfd[FD_FANOTIFY].fd = fanotify_fd;
319         pollfd[FD_FANOTIFY].events = POLLIN;
320         pollfd[FD_SIGNAL].fd = signal_fd;
321         pollfd[FD_SIGNAL].events = POLLIN;
322         pollfd[FD_INOTIFY].fd = inotify_fd;
323         pollfd[FD_INOTIFY].events = POLLIN;
        /* sd_notify() status update (call start elided above). */
327                   "STATUS=Collecting readahead data");
329         log_debug("Collecting...");
        /* Check for cancel/done markers that may already exist before
         * the inotify watch was able to report them. */
331         if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
332                 log_debug("Collection canceled");
337         if (access("/run/systemd/readahead/done", F_OK) >= 0) {
338                 log_debug("Got termination request");
        /* Main event loop (loop construct elided): drain fanotify and
         * inotify until one of the stop conditions fires. */
344                         struct fanotify_event_metadata metadata;
348                 struct fanotify_event_metadata *m;
352                 if (hashmap_size(files) > arg_files_max) {
353                         log_debug("Reached maximum number of read ahead files, ending collection.");
357                 t = now(CLOCK_MONOTONIC);
358                 if (t >= not_after) {
359                         log_debug("Reached maximum collection time, ending collection.");
        /* Poll with a timeout equal to the remaining collection window. */
363                 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
368                         log_error("poll(): %m");
374                         log_debug("Reached maximum collection time, ending collection.");
378                 if (pollfd[FD_SIGNAL].revents) {
379                         log_debug("Got signal.");
383                 if (pollfd[FD_INOTIFY].revents) {
384                         uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
385                         struct inotify_event *e;
387                         if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
388                                 if (errno == EINTR || errno == EAGAIN)
391                                 log_error("Failed to read inotify event: %m");
                        /* Walk the variable-length inotify events in the
                         * buffer, looking for cancel/done creation. */
396                         e = (struct inotify_event*) inotify_buffer;
400                                 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
401                                         log_debug("Collection canceled");
406                                 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
407                                         log_debug("Got termination request");
411                                 step = sizeof(struct inotify_event) + e->len;
412                                 assert(step <= (size_t) n);
414                                 e = (struct inotify_event*) ((uint8_t*) e + step);
419                 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
421                         if (errno == EINTR || errno == EAGAIN)
424                         /* fanotify sometimes returns EACCES on read()
425                          * where it shouldn't. For now let's just
426                          * ignore it here (which is safe), but
427                          * eventually this should be
428                          * dropped when the kernel is fixed.
430                          * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
434                         log_error("Failed to read event: %m");
                /* Iterate the batch of fanotify events just read. */
439                 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                        /* Ignore opens performed by ourselves... */
446                         if (m->pid == my_pid)
                        /* ...and by the replay process, whose pid is
                         * published in the shared segment; the barrier
                         * makes sure we read a current value. */
449                         __sync_synchronize();
450                         if (m->pid == shared->replay)
                        /* Resolve the event's fd back to a path via
                         * /proc/self/fd. */
453                         snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
456                         if ((k = readlink_malloc(fn, &p)) >= 0) {
457                                 if (startswith(p, "/tmp") ||
458                                     endswith(p, " (deleted)") ||
459                                     hashmap_get(files, p))
460                                         /* Not interesting, or
468                                         entry = new0(struct item, 1);
474                                         ul = fd_first_block(m->fd);
476                                         entrytime = now(CLOCK_MONOTONIC);
479                                         entry->path = strdup(p);
                                        /* Bin accesses into 2-second
                                         * buckets since collection start. */
485                                         entry->bin = (entrytime - starttime) / 2000000;
487                                         if ((k = hashmap_put(files, p, entry)) < 0) {
488                                                 log_warning("set_put() failed: %s", strerror(-k));
494                                 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
                        /* Every fanotify event carries an open fd that we
                         * must close, whether or not we recorded it. */
498                         close_nointr_nofail(m->fd);
        /* Collection done; stop watching before writing the pack file so
         * our own writes don't generate events. */
503         if (fanotify_fd >= 0) {
504                 close_nointr_nofail(fanotify_fd);
508         log_debug("Writing Pack File...");
510         on_ssd = fs_on_ssd(root) > 0;
511         log_debug("On SSD: %s", yes_no(on_ssd));
513         on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
514         log_debug("On btrfs: %s", yes_no(on_btrfs));
        /* Write to a temp file and rename into place below, so readers
         * never see a half-written pack file. */
516         if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
521         pack = fopen(pack_fn_new, "we");
523                 log_error("Failed to open pack file: %m");
        /* Header: host/version magic plus 'S' (SSD: access order) or
         * 'R' (rotating: block order) layout tag. */
528         fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
529         putc(on_ssd ? 'S' : 'R', pack);
531         if (on_ssd || on_btrfs) {
533                 /* On SSD or on btrfs, just write things out in the
534                  * order the files were accessed. */
536                 HASHMAP_FOREACH_KEY(q, p, files, i)
537                         pack_file(pack, p, on_btrfs);
539                 struct item *ordered, *j;
542                 /* On rotating media, order things by the block
545                 log_debug("Ordering...");
                /* Copy the hashmap entries into a flat array, sort it
                 * with qsort_compare(), and emit in that order. */
547                 n = hashmap_size(files);
548                 if (!(ordered = new(struct item, n))) {
554                 HASHMAP_FOREACH_KEY(q, p, files, i) {
555                         memcpy(j, q, sizeof(struct item));
559                 assert(ordered + n == j);
561                 qsort(ordered, n, sizeof(struct item), qsort_compare);
563                 for (k = 0; k < n; k++)
564                         pack_file(pack, ordered[k].path, on_btrfs);
569         log_debug("Finalizing...");
        /* Error check (condition elided — presumably ferror/fclose). */
574                 log_error("Failed to write pack file.");
        /* Atomically publish the new pack file. */
579         if (rename(pack_fn_new, pack_fn) < 0) {
580                 log_error("Failed to rename readahead file: %m");
        /* Common cleanup: close fds, free hashmap contents and paths. */
591         if (fanotify_fd >= 0)
592                 close_nointr_nofail(fanotify_fd);
595                 close_nointr_nofail(signal_fd);
598                 close_nointr_nofail(inotify_fd);
607         while ((p = hashmap_steal_first_key(files)))
612         if (previous_block_readahead_set) {
615                 /* Restore the original kernel readahead setting if we
616                  * changed it, and nobody has overwritten it since
                /* Only restore if the current value is still our own
                 * 8 KiB setting, i.e. nobody else changed it meanwhile. */
618                 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
619                         block_set_readahead(root, previous_block_readahead);
/* Entry point for the collection phase: bails out early when collection
 * would be pointless (read-only media, low memory — checks partly
 * elided), publishes our pid in the shared segment so the replay side
 * can identify us, then runs collect(). Return value handling after the
 * collect() call is not visible in this excerpt. */
625 int main_collect(const char *root) {
630         /* Skip this step on read-only media. Note that we check the
631          * underlying block device here, not he read-only flag of the
632          * file system on top, since that one is most likely mounted
633          * read-only anyway at boot, even if the underlying block
634          * device is theoretically writable. */
635         if (fs_on_read_only(root) > 0) {
636                 log_info("Disabling readahead collector due to read-only media.");
        /* Low-memory check itself is elided; only the message remains. */
641                 log_info("Disabling readahead collector due to low memory.");
645         shared = shared_get();
        /* Publish our pid, with a barrier so the replay process sees it
         * before it starts opening files (collect() filters on it). */
649         shared->collect = getpid();
650         __sync_synchronize();
652         if (collect(root) < 0)