1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
/* Cap on the number of distinct files recorded: collect() ends collection
 * once the size of its file map exceeds this value. Set by --files-max. */
65 static unsigned arg_files_max = 16*1024;
/* Size limit that pack_file() hands to file_verify() for every candidate
 * file. Set by --file-size-max. */
66 static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
/* Collection time budget: collect() adds it to the monotonic clock to obtain
 * its deadline. Set by --timeout. */
67 static usec_t arg_timeout = 2*USEC_PER_MINUTE;
/* Obtained from shared_get() in main(). This program stores its own PID in
 * ->collect and compares fanotify event PIDs against ->replay. */
69 static ReadaheadShared *shared = NULL;
/* collect() stores each file's first block as the hashmap value, converted
 * with the two macros below; the +1/-1 offset keeps block 0 distinct from a
 * NULL value. */
71 /* Avoid collisions with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Issues the BTRFS_IOC_DEFRAG ioctl on fd and returns the raw ioctl() result.
 *
 * NOTE(review): the statements that fill in 'data' are not visible in this
 * listing -- confirm the structure is initialised before it is passed to the
 * kernel. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends the read-ahead record for one file to the pack stream.
 *
 * pack:     stream the record is written to with fwrite(); individual
 *           fwrite() results are not checked here.
 * fn:       path of the file to examine.
 * on_btrfs: not referenced in the visible lines -- presumably gates a call to
 *           btrfs_defrag(); TODO confirm.
 *
 * The file is opened read-only, checked by file_verify(), mapped, and
 * mincore() is asked which of its pages are resident. Each run of resident
 * pages is written as a pair of page indices (first page, page after the
 * run).
 *
 * The return value is held in 'r' (initially 0); the statements that set it
 * on the failure paths are not visible in this listing. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
/* O_NOATIME keeps this scan from touching access times; O_NOFOLLOW refuses a
 * symlink as the final path component. */
96 if ((fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW)) < 0) {
101 if (errno == EPERM || errno == EACCES)
104 log_warning("open(%s) failed: %m", fn);
109 if ((k = file_verify(fd, fn, arg_file_size_max, &st)) <= 0) {
/* Round the length up to whole pages so the mapping and the mincore() query
 * include the final, partially filled page. */
117 l = PAGE_ALIGN(st.st_size);
118 if ((start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
124 pages = l / page_size();
/* One status byte per page; cleared before mincore() fills it in. */
127 memset(vec, 0, pages);
128 if (mincore(start, l, vec) < 0) {
129 log_warning("mincore(%s) failed: %m", fn);
/* Walk the residency vector (bit 0 of each byte is the "resident" flag) and
 * emit a pair whenever a resident run ends. 'b' appears to hold the first page
 * of the current run (its assignment is not visible here); 'c' is the index of
 * the first non-resident page after it. */
138 for (c = 0; c < pages; c++) {
139 bool new_mapped = !!(vec[c] & 1);
141 if (!mapped && new_mapped)
143 else if (mapped && !new_mapped) {
144 fwrite(&b, sizeof(b), 1, pack);
145 fwrite(&c, sizeof(c), 1, pack);
147 log_debug("%s: page %u to %u", fn, b, c);
/* Flush a run that is still open at end of file. A run that started at page 0
 * and reaches the end (b == 0) means the whole file is resident and is
 * deliberately not written. */
153 /* We don't write any range data if we should read the entire file */
154 if (mapped && b > 0) {
155 fwrite(&b, sizeof(b), 1, pack);
156 fwrite(&c, sizeof(c), 1, pack);
158 log_debug("%s: page %u to %u", fn, b, c);
/* The same value is written twice after the ranges -- presumably the
 * end-of-record marker; the statement that sets 'b' beforehand is not visible
 * in this listing. */
163 fwrite(&b, sizeof(b), 1, pack);
164 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: the mapping is only dealt with if mmap() succeeded (the release
 * call itself is not visible here); the descriptor is closed with
 * close_nointr_nofail(). */
167 if (start != MAP_FAILED)
171 close_nointr_nofail(fd);
/* Looks up where the file behind fd starts on disk.
 *
 * Uses the FS_IOC_FIEMAP ioctl to fetch the first extent and returns its
 * fe_physical value, narrowed to unsigned long. Three conditions are treated
 * specially -- the ioctl failing, no extent being mapped, and the extent
 * carrying FIEMAP_EXTENT_UNKNOWN -- but the values returned for them are not
 * visible in this listing. */
176 static unsigned long fd_first_block(int fd) {
/* Members of the local 'data' aggregate (its enclosing declaration is not
 * visible here): the fiemap header plus storage for exactly one extent,
 * matching fm_extent_count = 1 below. */
178 struct fiemap fiemap;
179 struct fiemap_extent extent;
/* Query the mapping of the whole file (length of all ones), but ask for a
 * single extent only. */
183 data.fiemap.fm_length = ~0ULL;
184 data.fiemap.fm_extent_count = 1;
186 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
189 if (data.fiemap.fm_mapped_extents <= 0)
192 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
195 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for 'struct item' elements, used by collect() to order
 * files before packing.
 *
 * Items are compared by their 'block' field first; when the blocks are equal
 * the result of strcmp() on the 'path' fields decides. The values returned for
 * the two unequal-block cases are not visible in this listing (presumably
 * negative/positive for an ascending sort -- confirm). */
203 static int qsort_compare(const void *a, const void *b) {
204 const struct item *i, *j;
209 if (i->block < j->block)
211 if (i->block > j->block)
214 return strcmp(i->path, j->path);
/* Records which files are opened on the mount containing 'root' and writes the
 * result as the pack file "<root>/.readahead".
 *
 * Outline of the visible code:
 *  - If no pack file exists yet, the block device readahead is lowered to
 *    8*1024 and the previous value remembered; at the end it is restored, but
 *    only if the current value is still 8*1024.
 *  - I/O priority is set to the idle class (failure is only a warning).
 *  - SIGINT/SIGTERM are routed to a signalfd; a fanotify mark for FAN_OPEN is
 *    placed on the whole mount; an inotify fd comes from open_inotify().
 *  - The main loop polls the three descriptors until one of these happens: a
 *    signal arrives, a file named "cancel" or "done" is created (or already
 *    exists under /run/systemd/readahead), more than arg_files_max files have
 *    been recorded, or arg_timeout has elapsed.
 *  - For every fanotify event the path is resolved through /proc/self/fd and
 *    stored in the 'files' map, with the file's first block as the value.
 *  - The pack is written to "<root>/.readahead.new" -- a header line, an 'S'
 *    or 'R' media flag, then one pack_file() record per path -- and renamed
 *    over the final name. On SSD or btrfs the map's iteration order is used;
 *    otherwise the entries are sorted with qsort_compare() first.
 *
 * The status is kept in 'r' (initially 0); main() treats a negative return as
 * failure. Many error-path statements are not visible in this listing. */
217 static int collect(const char *root) {
219 FD_FANOTIFY, /* Get the actual fs events */
221 FD_INOTIFY, /* We get notifications to quit early via this fd */
224 struct pollfd pollfd[_FD_MAX];
225 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
/* Maps path -> first block (encoded with SECTOR_TO_PTR); also serves as the
 * "already seen" set. */
227 Hashmap *files = NULL;
232 char *pack_fn_new = NULL, *pack_fn = NULL;
233 bool on_ssd, on_btrfs;
236 uint64_t previous_block_readahead;
237 bool previous_block_readahead_set = false;
241 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
242 log_error("Out of memory");
247 /* If there's no pack file yet we lower the kernel readahead
248 * so that mincore() is accurate. If there is a pack file
249 * already we assume it is accurate enough so that kernel
250 * readahead is never triggered. */
251 previous_block_readahead_set =
252 access(pack_fn, F_OK) < 0 &&
253 block_get_readahead(root, &previous_block_readahead) >= 0 &&
254 block_set_readahead(root, 8*1024) >= 0;
/* Collection should not compete with regular boot I/O; failing to lower the
 * priority is not fatal. */
256 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
257 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM for normal delivery and receive them through a
 * signalfd instead, so they can be waited for in poll(). */
259 assert_se(sigemptyset(&mask) == 0);
260 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
261 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
263 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
264 log_error("signalfd(): %m");
269 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
270 log_error("Failed to allocate set.");
/* The second argument gives the flags for the per-event descriptors that
 * fanotify hands out: read-only, close-on-exec, no atime updates. */
275 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
276 log_error("Failed to create fanotify object: %m");
/* FAN_MARK_MOUNT: report opens anywhere on the mount that contains root. */
281 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
282 log_error("Failed to mark %s: %m", root);
287 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline on the monotonic clock. */
292 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
297 pollfd[FD_FANOTIFY].fd = fanotify_fd;
298 pollfd[FD_FANOTIFY].events = POLLIN;
299 pollfd[FD_SIGNAL].fd = signal_fd;
300 pollfd[FD_SIGNAL].events = POLLIN;
301 pollfd[FD_INOTIFY].fd = inotify_fd;
302 pollfd[FD_INOTIFY].events = POLLIN;
306 "STATUS=Collecting readahead data");
308 log_debug("Collecting...");
/* The flag files are also checked directly -- presumably to catch the case
 * where they were created before the inotify watch existed; confirm. */
310 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
311 log_debug("Collection canceled");
316 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
317 log_debug("Got termination request");
323 struct fanotify_event_metadata metadata;
327 struct fanotify_event_metadata *m;
331 if (hashmap_size(files) > arg_files_max) {
332 log_debug("Reached maximum number of read ahead files, ending collection.");
336 t = now(CLOCK_MONOTONIC);
337 if (t >= not_after) {
338 log_debug("Reached maximum collection time, ending collection.");
/* Wait for activity on any descriptor, but no longer than the time remaining
 * until the deadline (converted from microseconds to milliseconds). */
342 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
347 log_error("poll(): %m");
353 log_debug("Reached maximum collection time, ending collection.");
357 if (pollfd[FD_SIGNAL].revents) {
358 log_debug("Got signal.");
362 if (pollfd[FD_INOTIFY].revents) {
363 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
364 struct inotify_event *e;
366 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
367 if (errno == EINTR || errno == EAGAIN)
370 log_error("Failed to read inotify event: %m");
/* Look through the records that were read for the creation of a file named
 * "cancel" or "done". */
375 e = (struct inotify_event*) inotify_buffer;
379 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
380 log_debug("Collection canceled");
385 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
386 log_debug("Got termination request");
/* Records are variable length: fixed header followed by e->len name bytes. */
390 step = sizeof(struct inotify_event) + e->len;
391 assert(step <= (size_t) n);
393 e = (struct inotify_event*) ((uint8_t*) e + step);
398 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
400 if (errno == EINTR || errno == EAGAIN)
403 /* fanotify sometimes returns EACCES on read()
404 * where it shouldn't. For now let's just
405 * ignore it here (which is safe), but
406 * eventually this should be
407 * dropped when the kernel is fixed.
409 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
413 log_error("Failed to read event: %m");
/* Walk every event record contained in the buffer just read. */
418 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Events caused by this process and by the PID in shared->replay are singled
 * out (the branch bodies are not visible here; presumably they are skipped).
 * The full barrier precedes the read of shared->replay. */
425 if (m->pid == my_pid)
428 __sync_synchronize();
429 if (m->pid == shared->replay)
/* Turn the event's descriptor back into a path via its /proc/self/fd link. */
432 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
435 if ((k = readlink_malloc(fn, &p)) >= 0) {
436 if (startswith(p, "/tmp") ||
437 endswith(p, " (deleted)") ||
438 hashmap_get(files, p))
439 /* Not interesting, or
445 ul = fd_first_block(m->fd);
447 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
448 log_warning("set_put() failed: %s", strerror(-k));
454 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
458 close_nointr_nofail(m->fd);
463 if (fanotify_fd >= 0) {
464 close_nointr_nofail(fanotify_fd);
468 log_debug("Writing Pack File...");
470 on_ssd = fs_on_ssd(root) > 0;
471 log_debug("On SSD: %s", yes_no(on_ssd));
473 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
474 log_debug("On btrfs: %s", yes_no(on_btrfs));
476 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
477 log_error("Out of memory");
482 if (!(pack = fopen(pack_fn_new, "we"))) {
483 log_error("Failed to open pack file: %m");
488 fputs(CANONICAL_HOST "\n", pack);
489 putc(on_ssd ? 'S' : 'R', pack);
491 if (on_ssd || on_btrfs) {
493 /* On SSD or on btrfs, just write things out in the
494 * order the files were accessed. */
496 HASHMAP_FOREACH_KEY(q, p, files, i)
497 pack_file(pack, p, on_btrfs);
499 struct item *ordered, *j;
502 /* On rotating media, order things by the block
505 log_debug("Ordering...");
507 n = hashmap_size(files);
508 if (!(ordered = new(struct item, n))) {
509 log_error("Out of memory");
515 HASHMAP_FOREACH_KEY(q, p, files, i) {
517 j->block = PTR_TO_SECTOR(q);
521 assert(ordered + n == j);
523 qsort(ordered, n, sizeof(struct item), qsort_compare);
525 for (k = 0; k < n; k++)
526 pack_file(pack, ordered[k].path, on_btrfs);
531 log_debug("Finalizing...");
536 log_error("Failed to write pack file.");
541 if (rename(pack_fn_new, pack_fn) < 0) {
542 log_error("Failed to rename readahead file: %m");
553 if (fanotify_fd >= 0)
554 close_nointr_nofail(fanotify_fd);
557 close_nointr_nofail(signal_fd);
560 close_nointr_nofail(inotify_fd);
569 while ((p = hashmap_steal_first_key(files)))
574 if (previous_block_readahead_set) {
577 /* Restore the original kernel readahead setting if we
578 * changed it, and nobody has overwritten it since
580 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
581 block_set_readahead(root, previous_block_readahead);
/* Prints the command line synopsis to stdout.
 *
 * The long option names listed here have to match the names registered in the
 * option table of parse_argv() ("files-max", "file-size-max", "timeout"):
 * getopt_long() matches user input against those names, so advertising any
 * other spelling sends users to an option that is rejected as unknown. */
587 static int help(void) {
589 printf("%s [OPTIONS...] [DIRECTORY]\n\n"
590 "Collect read-ahead data on early boot.\n\n"
591 " -h --help Show this help\n"
592 " --files-max=INT Maximum number of files to read ahead\n"
593 " --file-size-max=BYTES Maximum size of files to read ahead\n"
594 " --timeout=USEC Maximum time to spend collecting data\n",
595 program_invocation_short_name);
/* Parses the command line with getopt_long().
 *
 * Recognised options: -h/--help, --files-max, --file-size-max and --timeout.
 * The three value options store their result in arg_files_max,
 * arg_file_size_max and arg_timeout respectively; a value that fails to parse
 * or is zero is reported with log_error().
 *
 * main() takes a negative return as failure and otherwise decides from the
 * result whether to continue; the return statements themselves are not
 * visible in this listing. */
600 static int parse_argv(int argc, char *argv[]) {
/* Codes for long-only options start at 0x100, above any single-character
 * short option value. */
603 ARG_FILES_MAX = 0x100,
608 static const struct option options[] = {
609 { "help", no_argument, NULL, 'h' },
610 { "files-max", required_argument, NULL, ARG_FILES_MAX },
611 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
612 { "timeout", required_argument, NULL, ARG_TIMEOUT },
621 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
/* safe_atou() writes straight into arg_files_max; zero is rejected just like
 * a parse error. */
630 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
631 log_error("Failed to parse maximum number of files %s.", optarg);
636 case ARG_FILE_SIZE_MAX: {
/* Parsed as unsigned long long first, then narrowed to off_t. */
637 unsigned long long ull;
639 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
640 log_error("Failed to parse maximum file size %s.", optarg);
644 arg_file_size_max = (off_t) ull;
649 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
650 log_error("Failed to parse timeout %s.", optarg);
660 log_error("Unknown option code %c", c);
/* Check on the remaining positional arguments; the rest of the condition is
 * not visible in this listing. */
665 if (optind != argc &&
674 int main(int argc, char *argv[]) {
678 log_set_target(LOG_TARGET_AUTO);
679 log_parse_environment();
684 r = parse_argv(argc, argv);
686 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
688 root = optind < argc ? argv[optind] : "/";
690 /* Skip this step on read-only media. Note that we check the
691 * underlying block device here, not the read-only flag of the
692 * file system on top, since that one is most likely mounted
693 * read-only anyway at boot, even if the underlying block
694 * device is theoretically writable. */
695 if (fs_on_read_only(root) > 0) {
696 log_info("Disabling readahead collector due to read-only media.");
701 log_info("Disabling readahead collector due to low memory.");
705 shared = shared_get();
709 shared->collect = getpid();
710 __sync_synchronize();
712 if (collect(root) < 0)