1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
/* Collection ends once more than this many distinct files were recorded
 * (compared against hashmap_size() in collect()). */
65 static unsigned arg_files_max = 16*1024;
/* Size limit handed to file_verify() when a file is packed. */
66 static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
/* Length of the collection window; collect() adds it to the monotonic clock
 * to form its deadline. */
67 static usec_t arg_timeout = 2*USEC_PER_MINUTE;
/* Set from shared_get() in main(). main() stores this process id in
 * shared->collect, and collect() ignores events whose pid equals
 * shared->replay. */
69 static ReadaheadShared *shared = NULL;
71 /* Avoid collisions with the NULL pointer */
/* Block numbers are stored as hashmap values shifted up by one, presumably
 * because block 0 would otherwise be encoded as a NULL value; PTR_TO_SECTOR
 * undoes the shift. */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Issues the BTRFS_IOC_DEFRAG ioctl on fd and returns the result of ioctl()
 * unchanged.
 *
 * NOTE(review): some lines of the body are missing from this listing; how
 * data is initialised before the ioctl is not visible here. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends the record for one file to the pack stream.
 *
 * pack:     output stream of the pack file being written.
 * fn:       path of the file to describe.
 * on_btrfs: passed in by collect(); its use is not visible in this listing.
 *
 * The visible writes are, in order: the inode number as a uint64_t, then one
 * (b, c) pair of page indexes for every run of pages that mincore() reports
 * as resident, then a final pair that writes b twice.
 *
 * NOTE(review): several short lines (declarations, error exits, the
 * assignment that starts a run) are missing from this listing, so the exact
 * return value and cleanup order cannot be confirmed from here. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
92 int r = 0, fd = -1, k;
/* Read-only, without updating atime and without following a symlink in the
 * last path component. */
97 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* EPERM and EACCES are tested separately from the warning below; what
 * happens in that branch is not visible here. */
103 if (errno == EPERM || errno == EACCES)
106 log_warning("open(%s) failed: %m", fn);
/* st is read afterwards for st_size and st_ino. */
111 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file, with the length rounded up to a page boundary. */
120 l = PAGE_ALIGN(st.st_size);
121 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
122 if (start == MAP_FAILED) {
123 log_warning("mmap(%s) failed: %m", fn);
/* One byte of vec per page of the mapping. */
128 pages = l / page_size();
130 memset(vec, 0, pages);
131 if (mincore(start, l, vec) < 0) {
132 log_warning("mincore(%s) failed: %m", fn);
140 /* Store the inode, so that we notice when the file is deleted */
141 inode = (uint64_t) st.st_ino;
142 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the residency vector; bit 0 of each byte is the resident flag. A pair
 * is emitted whenever a resident run ends. */
145 for (c = 0; c < pages; c++) {
146 bool new_mapped = !!(vec[c] & 1);
148 if (!mapped && new_mapped)
150 else if (mapped && !new_mapped) {
151 fwrite(&b, sizeof(b), 1, pack);
152 fwrite(&c, sizeof(c), 1, pack);
154 log_debug("%s: page %u to %u", fn, b, c);
160 /* We don't write any range data if we should read the entire file */
/* A run that is still open at the end of the file is written only when it
 * did not start at page 0. */
161 if (mapped && b > 0) {
162 fwrite(&b, sizeof(b), 1, pack);
163 fwrite(&c, sizeof(c), 1, pack);
165 log_debug("%s: page %u to %u", fn, b, c);
/* NOTE(review): presumably the end-of-record marker; the value b holds at
 * this point is set on a line missing from this listing. */
170 fwrite(&b, sizeof(b), 1, pack);
171 fwrite(&b, sizeof(b), 1, pack);
174 if (start != MAP_FAILED)
178 close_nointr_nofail(fd);
/* Asks the file system, via FS_IOC_FIEMAP, for the first extent of the file
 * open on fd and returns that extent's fe_physical value cast to
 * unsigned long.
 *
 * NOTE(review): the declaration that wraps the two struct members below into
 * data, and the values returned on the three failure checks, are missing
 * from this listing. */
183 static unsigned long fd_first_block(int fd) {
185 struct fiemap fiemap;
186 struct fiemap_extent extent;
/* Cover the whole file, but ask for a single extent only. */
190 data.fiemap.fm_length = ~0ULL;
191 data.fiemap.fm_extent_count = 1;
193 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
/* No extent was mapped at all. */
196 if (data.fiemap.fm_mapped_extents <= 0)
/* The extent is flagged FIEMAP_EXTENT_UNKNOWN, so its location is not used. */
199 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
202 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: compares by block first and falls back
 * to strcmp() on path when neither block is smaller than the other.
 *
 * NOTE(review): the assignments of a and b to i and j, and the values
 * returned by the two block comparisons, are missing from this listing. */
210 static int qsort_compare(const void *a, const void *b) {
211 const struct item *i, *j;
216 if (i->block < j->block)
218 if (i->block > j->block)
/* Equal blocks: order by path. */
221 return strcmp(i->path, j->path);
/* Records which files are opened on the mount containing root and writes the
 * result to "<root>/.readahead".
 *
 * Phases visible in this listing:
 *  1. Setup: lower the block device readahead when no pack file exists yet,
 *     request the idle I/O priority class, replace the signal mask with
 *     SIGINT and SIGTERM and receive them through a signalfd, create the
 *     files hashmap, put a FAN_OPEN mount mark on root and open an inotify
 *     fd.
 *  2. Event loop: poll the fanotify, signal and inotify fds. The loop ends
 *     on the file limit (arg_files_max), the deadline (arg_timeout), a
 *     signal, or a cancel/done request. Every opened path that is not
 *     filtered out is stored in files together with the value returned by
 *     fd_first_block().
 *  3. Output: write "<root>/.readahead.new", starting with a header line and
 *     an S or R type byte, then one pack_file() record per path. On SSD or
 *     btrfs the hashmap is iterated directly; otherwise the entries are
 *     copied to an array and sorted with qsort_compare() first. The new
 *     file is then renamed over "<root>/.readahead".
 *  4. Cleanup: close the fds, drain files, and restore the saved readahead
 *     value if the device still reports 8*1024.
 *
 * NOTE(review): many short lines (goto statements, closing braces, several
 * declarations and comment terminators) are missing from this listing, so
 * the exit paths and the return value cannot be confirmed from here. */
224 static int collect(const char *root) {
226 FD_FANOTIFY, /* Get the actual fs events */
228 FD_INOTIFY, /* We get notifications to quit early via this fd */
231 struct pollfd pollfd[_FD_MAX];
232 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
234 Hashmap *files = NULL;
239 char *pack_fn_new = NULL, *pack_fn = NULL;
240 bool on_ssd, on_btrfs;
243 uint64_t previous_block_readahead;
244 bool previous_block_readahead_set = false;
248 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
249 log_error("Out of memory");
254 /* If there's no pack file yet we lower the kernel readahead
255 * so that mincore() is accurate. If there is a pack file
256 * already we assume it is accurate enough so that kernel
257 * readahead is never triggered. */
/* The flag is true only if all three steps succeeded, i.e. only if there is
 * a previous value that has to be restored during cleanup. */
258 previous_block_readahead_set =
259 access(pack_fn, F_OK) < 0 &&
260 block_get_readahead(root, &previous_block_readahead) >= 0 &&
261 block_set_readahead(root, 8*1024) >= 0;
/* Best effort: a failure here is only logged. */
263 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
264 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT and SIGTERM so that they are delivered through the signalfd
 * created below. */
266 assert_se(sigemptyset(&mask) == 0);
267 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
268 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
270 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
271 log_error("signalfd(): %m");
/* Maps path string to encoded first block (see SECTOR_TO_PTR). */
276 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
277 log_error("Failed to allocate set.");
/* The second argument gives the open flags of the per-event file descriptors
 * that fanotify hands out. */
282 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
283 log_error("Failed to create fanotify object: %m");
/* FAN_MARK_MOUNT: watch open events on the whole mount that contains root. */
288 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
289 log_error("Failed to mark %s: %m", root);
294 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline on the monotonic clock. */
299 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
304 pollfd[FD_FANOTIFY].fd = fanotify_fd;
305 pollfd[FD_FANOTIFY].events = POLLIN;
306 pollfd[FD_SIGNAL].fd = signal_fd;
307 pollfd[FD_SIGNAL].events = POLLIN;
308 pollfd[FD_INOTIFY].fd = inotify_fd;
309 pollfd[FD_INOTIFY].events = POLLIN;
313 "STATUS=Collecting readahead data");
315 log_debug("Collecting...");
/* NOTE(review): presumably these two checks catch flag files that already
 * existed before the inotify watch could report their creation. */
317 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
318 log_debug("Collection canceled");
323 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
324 log_debug("Got termination request");
330 struct fanotify_event_metadata metadata;
334 struct fanotify_event_metadata *m;
338 if (hashmap_size(files) > arg_files_max) {
339 log_debug("Reached maximum number of read ahead files, ending collection.");
343 t = now(CLOCK_MONOTONIC);
344 if (t >= not_after) {
345 log_debug("Reached maximum collection time, ending collection.");
/* Wait at most for the time left until the deadline, converted from
 * microseconds to the milliseconds poll() expects. */
349 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
354 log_error("poll(): %m");
360 log_debug("Reached maximum collection time, ending collection.");
364 if (pollfd[FD_SIGNAL].revents) {
365 log_debug("Got signal.");
369 if (pollfd[FD_INOTIFY].revents) {
/* Room for one event header plus a name of up to FILENAME_MAX bytes. */
370 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
371 struct inotify_event *e;
373 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
374 if (errno == EINTR || errno == EAGAIN)
377 log_error("Failed to read inotify event: %m");
382 e = (struct inotify_event*) inotify_buffer;
/* Creation of a file named cancel or done in the watched directory is the
 * request to stop. */
386 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
387 log_debug("Collection canceled");
392 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
393 log_debug("Got termination request");
/* Advance to the next event in the buffer: fixed header plus the variable
 * length name. */
397 step = sizeof(struct inotify_event) + e->len;
398 assert(step <= (size_t) n);
400 e = (struct inotify_event*) ((uint8_t*) e + step);
405 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
407 if (errno == EINTR || errno == EAGAIN)
410 /* fanotify sometimes returns EACCES on read()
411 * where it shouldn't. For now let's just
412 * ignore it here (which is safe), but
413 * eventually this should be
414 * dropped when the kernel is fixed.
416 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
420 log_error("Failed to read event: %m");
/* Iterate over every event contained in the bytes just read. */
425 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Events caused by this process itself are tested for here (my_pid is
 * assigned on a line missing from this listing). */
432 if (m->pid == my_pid)
/* Full memory barrier before reading the pid stored in the shared area;
 * events whose pid equals shared->replay are tested for as well. */
435 __sync_synchronize();
436 if (m->pid == shared->replay)
/* Resolve the event fd to a path through its /proc/self/fd symlink. */
439 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
442 if ((k = readlink_malloc(fn, &p)) >= 0) {
/* Filter: paths below /tmp, files already deleted, and paths that are
 * already recorded. */
443 if (startswith(p, "/tmp") ||
444 endswith(p, " (deleted)") ||
445 hashmap_get(files, p))
446 /* Not interesting, or
452 ul = fd_first_block(m->fd);
454 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
455 log_warning("set_put() failed: %s", strerror(-k));
461 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
465 close_nointr_nofail(m->fd);
470 if (fanotify_fd >= 0) {
471 close_nointr_nofail(fanotify_fd);
475 log_debug("Writing Pack File...");
477 on_ssd = fs_on_ssd(root) > 0;
478 log_debug("On SSD: %s", yes_no(on_ssd));
480 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
481 log_debug("On btrfs: %s", yes_no(on_btrfs));
483 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
484 log_error("Out of memory");
489 pack = fopen(pack_fn_new, "we");
491 log_error("Failed to open pack file: %m");
496 fputs(CANONICAL_HOST ";VERSION=2\n", pack);
497 putc(on_ssd ? 'S' : 'R', pack);
499 if (on_ssd || on_btrfs) {
501 /* On SSD or on btrfs, just write things out in the
502 * order the files were accessed. */
504 HASHMAP_FOREACH_KEY(q, p, files, i)
505 pack_file(pack, p, on_btrfs);
507 struct item *ordered, *j;
510 /* On rotating media, order things by the block
513 log_debug("Ordering...");
515 n = hashmap_size(files);
516 if (!(ordered = new(struct item, n))) {
517 log_error("Out of memory");
523 HASHMAP_FOREACH_KEY(q, p, files, i) {
525 j->block = PTR_TO_SECTOR(q);
529 assert(ordered + n == j);
531 qsort(ordered, n, sizeof(struct item), qsort_compare);
533 for (k = 0; k < n; k++)
534 pack_file(pack, ordered[k].path, on_btrfs);
539 log_debug("Finalizing...");
544 log_error("Failed to write pack file.");
549 if (rename(pack_fn_new, pack_fn) < 0) {
550 log_error("Failed to rename readahead file: %m");
561 if (fanotify_fd >= 0)
562 close_nointr_nofail(fanotify_fd);
565 close_nointr_nofail(signal_fd);
568 close_nointr_nofail(inotify_fd);
577 while ((p = hashmap_steal_first_key(files)))
582 if (previous_block_readahead_set) {
585 /* Restore the original kernel readahead setting if we
586 * changed it, and nobody has overwritten it since
588 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
589 block_set_readahead(root, previous_block_readahead);
/* Prints the command line usage summary to stdout.
 *
 * The long option names listed here are spelled exactly as in the option
 * table of parse_argv() (files-max, file-size-max, timeout), so that an
 * option copied from the help text is accepted by getopt_long().
 *
 * Returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [DIRECTORY]\n\n"
               "Collect read-ahead data on early boot.\n\n"
               " -h --help Show this help\n"
               " --files-max=INT Maximum number of files to read ahead\n"
               " --file-size-max=BYTES Maximum size of files to read ahead\n"
               " --timeout=USEC Maximum time to spend collecting data\n",
               program_invocation_short_name);

        return 0;
}
/* Parses the command line with getopt_long() and stores the results in
 * arg_files_max, arg_file_size_max and arg_timeout.
 *
 * Recognised options: -h/--help, --files-max=, --file-size-max= and
 * --timeout=. Each value option logs an error when its argument cannot be
 * parsed or is zero.
 *
 * NOTE(review): the case labels, the return statements and the tail of the
 * final argument count check are missing from this listing, so the exact
 * return values cannot be confirmed from here. */
608 static int parse_argv(int argc, char *argv[]) {
/* Identifiers of long-only options start at 0x100, above the value of any
 * single option character such as 'h'. */
611 ARG_FILES_MAX = 0x100,
616 static const struct option options[] = {
617 { "help", no_argument, NULL, 'h' },
618 { "files-max", required_argument, NULL, ARG_FILES_MAX },
619 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
620 { "timeout", required_argument, NULL, ARG_TIMEOUT },
/* "h" is the only short option; everything else is long-only. */
629 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
/* arg_files_max is unsigned, so the second test rejects exactly zero. */
638 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
639 log_error("Failed to parse maximum number of files %s.", optarg);
644 case ARG_FILE_SIZE_MAX: {
645 unsigned long long ull;
/* ull is unsigned as well; only zero is rejected here. */
647 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
648 log_error("Failed to parse maximum file size %s.", optarg);
/* NOTE(review): the conversion to off_t is not range checked. */
652 arg_file_size_max = (off_t) ull;
657 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
658 log_error("Failed to parse timeout %s.", optarg);
668 log_error("Unknown option code %c", c);
/* Check on the number of remaining positional arguments; the rest of the
 * condition is missing from this listing. */
673 if (optind != argc &&
682 int main(int argc, char *argv[]) {
686 log_set_target(LOG_TARGET_AUTO);
687 log_parse_environment();
692 r = parse_argv(argc, argv);
694 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
696 root = optind < argc ? argv[optind] : "/";
698 /* Skip this step on read-only media. Note that we check the
699 * underlying block device here, not the read-only flag of the
700 * file system on top, since that one is most likely mounted
701 * read-only anyway at boot, even if the underlying block
702 * device is theoretically writable. */
703 if (fs_on_read_only(root) > 0) {
704 log_info("Disabling readahead collector due to read-only media.");
709 log_info("Disabling readahead collector due to low memory.");
713 shared = shared_get();
717 shared->collect = getpid();
718 __sync_synchronize();
720 if (collect(root) < 0)