/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
/* TODO:
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */
/* Shared memory segment used to coordinate with the readahead replay
 * process (its pid fields are consulted in collect() to skip self-made
 * events). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp of collection start; collect() buckets each
 * recorded file into a time bin relative to this. */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Sector values are stored directly as pointer-sized hashmap values;
 * offset by one so that sector 0 does not become a NULL value. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file behind fd, so that its data ends up
 * in one contiguous run on disk. Returns the raw ioctl() result: 0 on
 * success, -1 with errno set on failure. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args args;

        /* Zero the whole argument block, then point it at our fd. */
        memset(&args, 0, sizeof(args));
        args.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &args);
}
83 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
/* Append one file's readahead record to the pack stream: the file's inode
 * number followed by (start, end) page-index pairs for every run of pages
 * currently resident in the page cache according to mincore().
 * NOTE(review): this is an elided extract — several declarations and the
 * bodies of some branches are missing between the numbered lines below. */
85 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
/* Open without following symlinks, without a controlling tty, and without
 * updating atime; a single unreadable file must not abort the run. */
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission errors are expected for some paths — skip those quietly. */
102 if (errno == EPERM || errno == EACCES)
105 log_warning("open(%s) failed: %m", fn);
/* Verify the file is suitable (bounded by arg_file_size_max) and fill st. */
110 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the page-aligned length read-only so mincore() can inspect it. */
119 l = PAGE_ALIGN(st.st_size);
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
122 log_warning("mmap(%s) failed: %m", fn);
/* One status byte per page in vec; bit 0 set means the page is resident. */
127 pages = l / page_size();
129 memset(vec, 0, pages);
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
/* Scan the residency vector, emitting each maximal run of cached pages as
 * a (first, one-past-last) pair. NOTE(review): the branch body that records
 * the run start (presumably b = c) is elided here — confirm upstream. */
144 for (c = 0; c < pages; c++) {
145 bool new_mapped = !!(vec[c] & 1);
147 if (!mapped && new_mapped)
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
153 log_debug("%s: page %u to %u", fn, b, c);
159 /* We don't write any range data if we should read the entire file */
/* Flush a trailing run that was still open when the scan ended. */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
164 log_debug("%s: page %u to %u", fn, b, c);
/* Terminator pair for this record (two identical values). NOTE(review):
 * elided lines may reset b before this — verify against the pack reader. */
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if the mapping succeeded, then close the fd. */
173 if (start != MAP_FAILED)
177 close_nointr_nofail(fd);
/* Return the physical byte offset on disk of the first extent of the file
 * behind fd, or 0 when it cannot be determined (probe failure, no extents,
 * or the file system reports the location as unknown). Used to order files
 * by on-disk position for rotating media. */
static unsigned long fd_first_block(int fd) {
        /* The fiemap ioctl writes fm_extent_count extents into the flexible
         * array at the end of struct fiemap; reserve room for exactly one. */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } probe = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };

        /* All three "don't know" cases collapse to returning 0. */
        if (ioctl(fd, FS_IOC_FIEMAP, &probe) < 0 ||
            probe.fiemap.fm_mapped_extents <= 0 ||
            (probe.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) probe.fiemap.fm_extents[0].fe_physical;
}
/* qsort() callback for struct item: order primarily by time bin, then by
 * first physical block, and finally by path so the order is total and
 * deterministic. NOTE(review): the assignments of a/b into i/j and the
 * bin-comparison lines are elided in this extract. */
209 static int qsort_compare(const void *a, const void *b) {
210 const struct item *i, *j;
215 /* sort by bin first */
221 /* then sort by sector */
222 if (i->block < j->block)
224 if (i->block > j->block)
/* Same bin and block: fall back to lexicographic path comparison. */
227 return strcmp(i->path, j->path);
230 static int collect(const char *root) {
/* Core collection pass: watch the mount below 'root' with fanotify, record
 * every file opened during boot (path, first disk block, time bin), then
 * write the ".readahead" pack file consumed later by the replay process.
 * NOTE(review): this extract is heavily elided — declarations, braces,
 * gotos and error paths are missing between the numbered lines below. */
232 FD_FANOTIFY, /* Get the actual fs events */
234 FD_INOTIFY, /* We get notifications to quit early via this fd */
237 struct pollfd pollfd[_FD_MAX] = {};
238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
240 Hashmap *files = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
246 bool on_ssd, on_btrfs;
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
/* Reference point for the per-file time bins computed below. */
259 starttime = now(CLOCK_MONOTONIC);
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection does not compete with boot. */
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
/* Route SIGINT/SIGTERM through a signalfd so they can be poll()ed. */
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
/* Map path -> struct item for every file observed during collection. */
283 files = hashmap_new(string_hash_func, string_compare_func);
285 log_error("Failed to allocate set.");
/* Watch every open() on the whole mount that contains 'root'. */
290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
291 if (fanotify_fd < 0) {
292 log_error("Failed to create fanotify object: %m");
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
/* Separate inotify fd for the cancel/done flag files checked below. */
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
/* Hard deadline for the whole collection pass. */
309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
322 "STATUS=Collecting readahead data");
324 log_debug("Collecting...");
/* A flag file may already exist from before we started watching —
 * check both once up front to avoid a missed-wakeup race. */
326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
327 log_debug("Collection canceled");
332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
333 log_debug("Got termination request");
/* Main event loop: one fanotify read buffer per iteration. */
339 struct fanotify_event_metadata metadata;
343 struct fanotify_event_metadata *m;
347 if (hashmap_size(files) > arg_files_max) {
348 log_debug("Reached maximum number of read ahead files, ending collection.");
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
/* Block until an event arrives or the deadline expires. */
358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
363 log_error("poll(): %m");
369 log_debug("Reached maximum collection time, ending collection.");
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
/* inotify tells us when one of the flag files is created. */
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
386 log_error("Failed to read inotify event: %m");
/* Walk the variable-length event records packed in the buffer. */
391 e = (struct inotify_event*) inotify_buffer;
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
409 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain the fanotify queue. */
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
416 if (errno == EINTR || errno == EAGAIN)
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
429 log_error("Failed to read event: %m");
434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Skip events caused by ourselves or by the replay process, whose
 * pid is read from the shared memory segment. */
441 if (m->pid == my_pid)
444 __sync_synchronize();
445 if (m->pid == shared->replay)
/* Resolve the event's fd to a path via /proc/self/fd. */
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
452 if (startswith(p, "/tmp") ||
453 endswith(p, " (deleted)") ||
454 hashmap_get(files, p))
455 /* Not interesting, or
/* New file: record path, first physical block and a time bin
 * (2000000 usec = 2 s buckets since starttime). */
463 entry = new0(struct item, 1);
469 ul = fd_first_block(m->fd);
471 entrytime = now(CLOCK_MONOTONIC);
474 entry->path = strdup(p);
480 entry->bin = (entrytime - starttime) / 2000000;
482 k = hashmap_put(files, p, entry);
484 log_warning("hashmap_put() failed: %s", strerror(-k));
490 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the fd fanotify handed us, or we would leak it. */
494 close_nointr_nofail(m->fd);
/* Done collecting; stop watching before we write the pack file so our
 * own writes do not generate further events. */
499 if (fanotify_fd >= 0) {
500 close_nointr_nofail(fanotify_fd);
504 log_debug("Writing Pack File...");
506 on_ssd = fs_on_ssd(root) > 0;
507 log_debug("On SSD: %s", yes_no(on_ssd));
509 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_CMP(sfs.f_type, BTRFS_SUPER_MAGIC);
510 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to ".readahead.new" first, rename into place when complete. */
512 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
517 pack = fopen(pack_fn_new, "we");
519 log_error("Failed to open pack file: %m");
/* Header: host/version banner plus 'S' (ssd path) or 'R' (rotating). */
524 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
525 putc(on_ssd ? 'S' : 'R', pack);
527 if (on_ssd || on_btrfs) {
529 /* On SSD or on btrfs, just write things out in the
530 * order the files were accessed. */
532 HASHMAP_FOREACH_KEY(q, p, files, i)
533 pack_file(pack, p, on_btrfs);
535 struct item *ordered, *j;
538 /* On rotating media, order things by the block
541 log_debug("Ordering...");
/* Copy hashmap entries into a flat array and sort with qsort_compare()
 * (bin first, then block, then path). */
543 n = hashmap_size(files);
544 if (!(ordered = new(struct item, n))) {
550 HASHMAP_FOREACH_KEY(q, p, files, i) {
551 memcpy(j, q, sizeof(struct item));
555 assert(ordered + n == j);
557 qsort(ordered, n, sizeof(struct item), qsort_compare);
559 for (k = 0; k < n; k++)
560 pack_file(pack, ordered[k].path, on_btrfs);
565 log_debug("Finalizing...");
570 log_error("Failed to write pack file.");
/* Atomically publish the finished pack file. */
575 if (rename(pack_fn_new, pack_fn) < 0) {
576 log_error("Failed to rename readahead file: %m");
/* Cleanup path: close all fds and release the collected entries. */
587 if (fanotify_fd >= 0)
588 close_nointr_nofail(fanotify_fd);
591 close_nointr_nofail(signal_fd);
594 close_nointr_nofail(inotify_fd);
603 while ((p = hashmap_steal_first_key(files)))
608 if (previous_block_readahead_set) {
611 /* Restore the original kernel readahead setting if we
612 * changed it, and nobody has overwritten it since
614 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
615 block_set_readahead(root, previous_block_readahead);
621 int main_collect(const char *root) {
626 /* Skip this step on read-only media. Note that we check the
627 * underlying block device here, not he read-only flag of the
628 * file system on top, since that one is most likely mounted
629 * read-only anyway at boot, even if the underlying block
630 * device is theoretically writable. */
631 if (fs_on_read_only(root) > 0) {
632 log_info("Disabling readahead collector due to read-only media.");
637 log_info("Disabling readahead collector due to low memory.");
641 shared = shared_get();
645 shared->collect = getpid();
646 __sync_synchronize();
648 if (collect(root) < 0)