1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #ifdef HAVE_FANOTIFY_INIT
47 #include <sys/fanotify.h>
50 #include <systemd/sd-daemon.h>
56 #include "readahead-common.h"
61 * - detect ssd on btrfs/lvm...
62 * - read ahead directories
65 * - handle files where nothing is in mincore
66 * - does ioprio_set work with fadvise()?
/* Shared-memory handle used to coordinate with the readahead replay
 * process (published via shared_get() in main_collect; the replay
 * side's PID is read from it in collect() to skip its own accesses). */
69 static ReadaheadShared *shared = NULL;

/* Disk sector numbers are stored as hashmap *values*, i.e. as pointers.
 * Bias by +1 so that sector 0 does not become NULL, which the hashmap
 * would treat as "no entry". PTR_TO_SECTOR undoes the bias. */
71 /* Avoid collisions with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd so later readahead
 * of it is more sequential. Returns the raw ioctl() result: 0 on
 * success, -1 with errno set on failure.
 * NOTE(review): this excerpt skips original lines 77-80; `data` is
 * used without visible initialization here — presumably it is zeroed
 * in the omitted lines. TODO confirm against the full source. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Write one file's record into the pack file: (elided: the file name),
 * its inode number, and the list of page ranges currently resident in
 * the page cache per mincore(), so replay can fault in exactly those
 * ranges later. Returns r (0 on success; error paths are mostly elided
 * in this excerpt).
 * NOTE(review): original lines are missing throughout (85, 87-91,
 * 93-96, ...); declarations of st, l, vec, pages, mapped, b, c and
 * several error branches are not visible here. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
92 int r = 0, fd = -1, k;
/* O_NOATIME: don't perturb access times; O_NOFOLLOW: don't chase
 * symlinks for the recorded path. */
97 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission failures are expected for some files and are not fatal
 * (the elided branch presumably skips the file quietly). */
103 if (errno == EPERM || errno == EACCES)
106 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the file (regular, within arg_file_size_max) and fetch
 * its stat buffer. */
111 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file (rounded up to page granularity) so mincore()
 * can report which of its pages are resident. */
120 l = PAGE_ALIGN(st.st_size);
121 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
122 if (start == MAP_FAILED) {
123 log_warning("mmap(%s) failed: %m", fn);
/* One byte of `vec` per page; bit 0 set means the page is resident. */
128 pages = l / page_size();
130 memset(vec, 0, pages);
131 if (mincore(start, l, vec) < 0) {
132 log_warning("mincore(%s) failed: %m", fn);
140 /* Store the inode, so that we notice when the file is deleted */
141 inode = (uint64_t) st.st_ino;
142 fwrite(&inode, sizeof(inode), 1, pack);
/* Run-length encode the residency vector: emit a (begin, end) page
 * pair each time a run of resident pages ends. */
145 for (c = 0; c < pages; c++) {
146 bool new_mapped = !!(vec[c] & 1);
148 if (!mapped && new_mapped)
150 else if (mapped && !new_mapped) {
151 fwrite(&b, sizeof(b), 1, pack);
152 fwrite(&c, sizeof(c), 1, pack);
154 log_debug("%s: page %u to %u", fn, b, c);
160 /* We don't write any range data if we should read the entire file */
/* Flush a trailing run that was still open when the loop ended —
 * except when it starts at page 0, i.e. the whole file is resident. */
161 if (mapped && b > 0) {
162 fwrite(&b, sizeof(b), 1, pack);
163 fwrite(&c, sizeof(c), 1, pack);
165 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: a (b, b) pair (b's value at this point is set
 * in elided lines — presumably 0; TODO confirm). */
170 fwrite(&b, sizeof(b), 1, pack);
171 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if mapped (munmap call elided) and close the fd. */
174 if (start != MAP_FAILED)
178 close_nointr_nofail(fd);
/* Return the physical byte offset of the file's first extent, used as
 * a sort key so rotating media can be read in on-disk order. Returns 0
 * (via elided branches) when FIEMAP fails, reports no extents, or the
 * first extent's location is unknown.
 * NOTE(review): the declaration of `data` — presumably a union/struct
 * wrapping `fiemap` plus one inline `fiemap_extent` — is in elided
 * lines 184/187-189; TODO confirm against the full source. */
183 static unsigned long fd_first_block(int fd) {
185 struct fiemap fiemap;
186 struct fiemap_extent extent;
/* Map the whole file but ask for at most one extent — only the first
 * one's physical address matters for ordering. */
190 data.fiemap.fm_length = ~0ULL;
191 data.fiemap.fm_extent_count = 1;
193 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
196 if (data.fiemap.fm_mapped_extents <= 0)
199 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
202 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: order primarily by on-disk block
 * (ascending, so rotating media are read front-to-back), falling back
 * to path comparison for a stable, deterministic order when blocks tie.
 * NOTE(review): the assignments of i/j from a/b and the return
 * statements of the two block comparisons are in elided lines. */
210 static int qsort_compare(const void *a, const void *b) {
211 const struct item *i, *j;
216 if (i->block < j->block)
218 if (i->block > j->block)
221 return strcmp(i->path, j->path);
/* Core of the readahead collector: watch every file opened below
 * `root` (via a fanotify mount mark) until a timeout, a signal, or a
 * cancel/done request arrives, then write the observed set of files —
 * with their resident page ranges — to ROOT/.readahead, ordered by
 * disk block on rotating media. Returns r (0 on success, negative on
 * failure).
 * NOTE(review): this excerpt elides many original lines — enum/variable
 * declarations, goto cleanup targets, loop headers, and most closing
 * braces are not visible. Comments below are grounded only in the
 * visible lines. */
224 static int collect(const char *root) {
226 FD_FANOTIFY, /* Get the actual fs events */
228 FD_INOTIFY, /* We get notifications to quit early via this fd */
231 struct pollfd pollfd[_FD_MAX];
232 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
234 Hashmap *files = NULL;
239 char *pack_fn_new = NULL, *pack_fn = NULL;
240 bool on_ssd, on_btrfs;
243 uint64_t previous_block_readahead;
244 bool previous_block_readahead_set = false;
248 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
253 /* If there's no pack file yet we lower the kernel readahead
254 * so that mincore() is accurate. If there is a pack file
255 * already we assume it is accurate enough so that kernel
256 * readahead is never triggered. */
257 previous_block_readahead_set =
258 access(pack_fn, F_OK) < 0 &&
259 block_get_readahead(root, &previous_block_readahead) >= 0 &&
260 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection doesn't slow down boot. */
262 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
263 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them via a signalfd instead, so
 * they can be multiplexed with the event fds in poll(). */
265 assert_se(sigemptyset(&mask) == 0);
266 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
267 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
269 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
270 log_error("signalfd(): %m");
/* Maps file path -> SECTOR_TO_PTR(first block); also deduplicates. */
275 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
276 log_error("Failed to allocate set.");
/* fanotify reports FAN_OPEN events for the whole mount containing
 * `root`; the event carries an already-open O_NOATIME fd. */
281 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
282 log_error("Failed to create fanotify object: %m");
287 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
288 log_error("Failed to mark %s: %m", root);
/* inotify watch (set up by open_inotify(), elided helper) delivers
 * the early-exit "cancel"/"done" requests. */
293 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline for the collection phase. */
298 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
303 pollfd[FD_FANOTIFY].fd = fanotify_fd;
304 pollfd[FD_FANOTIFY].events = POLLIN;
305 pollfd[FD_SIGNAL].fd = signal_fd;
306 pollfd[FD_SIGNAL].events = POLLIN;
307 pollfd[FD_INOTIFY].fd = inotify_fd;
308 pollfd[FD_INOTIFY].events = POLLIN;
312 "STATUS=Collecting readahead data");
314 log_debug("Collecting...");
/* Handle a cancel/done flag that was created before our inotify
 * watch became active. */
316 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
317 log_debug("Collection canceled");
322 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
323 log_debug("Got termination request");
329 struct fanotify_event_metadata metadata;
333 struct fanotify_event_metadata *m;
/* Stop conditions: too many files collected, or deadline passed. */
337 if (hashmap_size(files) > arg_files_max) {
338 log_debug("Reached maximum number of read ahead files, ending collection.");
342 t = now(CLOCK_MONOTONIC);
343 if (t >= not_after) {
344 log_debug("Reached maximum collection time, ending collection.");
/* Wait for any of the three fds, at most until the deadline. */
348 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
353 log_error("poll(): %m");
359 log_debug("Reached maximum collection time, ending collection.");
361 /* (elided) h == 0 means poll() timed out. */
363 if (pollfd[FD_SIGNAL].revents) {
364 log_debug("Got signal.");
/* Drain inotify: a "cancel" or "done" file created in the runtime
 * directory ends collection early. */
368 if (pollfd[FD_INOTIFY].revents) {
369 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
370 struct inotify_event *e;
372 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
373 if (errno == EINTR || errno == EAGAIN)
376 log_error("Failed to read inotify event: %m");
381 e = (struct inotify_event*) inotify_buffer;
385 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
386 log_debug("Collection canceled");
391 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
392 log_debug("Got termination request");
/* Advance to the next (variable-length) inotify event in the buffer. */
396 step = sizeof(struct inotify_event) + e->len;
397 assert(step <= (size_t) n);
399 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain fanotify events. */
404 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
406 if (errno == EINTR || errno == EAGAIN)
409 /* fanotify sometimes returns EACCES on read()
410 * where it shouldn't. For now let's just
411 * ignore it here (which is safe), but
412 * eventually this should be
413 * dropped when the kernel is fixed.
415 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
419 log_error("Failed to read event: %m");
424 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens done by ourselves and by the replay process, so
 * readahead doesn't feed back into itself. */
431 if (m->pid == my_pid)
434 __sync_synchronize();
435 if (m->pid == shared->replay)
/* Recover the opened file's path from the event fd via /proc. */
438 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
441 if ((k = readlink_malloc(fn, &p)) >= 0) {
442 if (startswith(p, "/tmp") ||
443 endswith(p, " (deleted)") ||
444 hashmap_get(files, p))
445 /* Not interesting, or
/* Record the path with its first disk block as sort key. */
451 ul = fd_first_block(m->fd);
453 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
454 log_warning("set_put() failed: %s", strerror(-k));
460 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the event-supplied fd, whatever happened above. */
464 close_nointr_nofail(m->fd);
/* Collection done — stop watching before the write phase, so our own
 * pack-file I/O isn't recorded. */
469 if (fanotify_fd >= 0) {
470 close_nointr_nofail(fanotify_fd);
474 log_debug("Writing Pack File...");
476 on_ssd = fs_on_ssd(root) > 0;
477 log_debug("On SSD: %s", yes_no(on_ssd));
479 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
480 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temp name first, then rename() over the final pack file
 * so readers never see a partial file. */
482 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
/* "we": write + O_CLOEXEC. */
487 pack = fopen(pack_fn_new, "we");
489 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus one byte for media type. */
494 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
495 putc(on_ssd ? 'S' : 'R', pack);
497 if (on_ssd || on_btrfs) {
499 /* On SSD or on btrfs, just write things out in the
500 * order the files were accessed. */
502 HASHMAP_FOREACH_KEY(q, p, files, i)
503 pack_file(pack, p, on_btrfs);
505 struct item *ordered, *j;
508 /* On rotating media, order things by the block
511 log_debug("Ordering...");
513 n = hashmap_size(files);
514 if (!(ordered = new(struct item, n))) {
/* Flatten the hashmap into an array of (path, block) items... */
520 HASHMAP_FOREACH_KEY(q, p, files, i) {
522 j->block = PTR_TO_SECTOR(q);
526 assert(ordered + n == j);
/* ...sort by block, then emit in disk order. */
528 qsort(ordered, n, sizeof(struct item), qsort_compare);
530 for (k = 0; k < n; k++)
531 pack_file(pack, ordered[k].path, on_btrfs);
536 log_debug("Finalizing...");
541 log_error("Failed to write pack file.");
/* Atomically publish the new pack file. */
546 if (rename(pack_fn_new, pack_fn) < 0) {
547 log_error("Failed to rename readahead file: %m");
/* Cleanup path (elided goto target): close whatever is still open. */
558 if (fanotify_fd >= 0)
559 close_nointr_nofail(fanotify_fd);
562 close_nointr_nofail(signal_fd);
565 close_nointr_nofail(inotify_fd);
/* Free the collected path strings still owned by the hashmap. */
574 while ((p = hashmap_steal_first_key(files)))
579 if (previous_block_readahead_set) {
582 /* Restore the original kernel readahead setting if we
583 * changed it, and nobody has overwritten it since
/* Only restore if the current value is still our 8 KiB override. */
585 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
586 block_set_readahead(root, previous_block_readahead);
592 int main_collect(const char *root) {
597 /* Skip this step on read-only media. Note that we check the
598 * underlying block device here, not the read-only flag of the
599 * file system on top, since that one is most likely mounted
600 * read-only anyway at boot, even if the underlying block
601 * device is theoretically writable. */
602 if (fs_on_read_only(root) > 0) {
603 log_info("Disabling readahead collector due to read-only media.");
608 log_info("Disabling readahead collector due to low memory.");
612 shared = shared_get();
616 shared->collect = getpid();
617 __sync_synchronize();
619 if (collect(root) < 0)