1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
/* Tunables; parse_argv() overwrites them from the command line. */
/* Collection ends once more than this many distinct files were recorded. */
65 static unsigned arg_files_max = 16*1024;
/* Size limit handed to file_verify() when a file is packed. */
66 static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
/* Added to the monotonic clock at start-up to form the collection deadline. */
67 static usec_t arg_timeout = 2*USEC_PER_MINUTE;
/* Obtained from shared_get() in main(); its collect and replay pid fields
 * are used by this file. */
69 static ReadaheadShared *shared = NULL;
71 /* Block numbers are stored as hashmap values (pointers); they are shifted by one so that block 0 does not collide with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Issues the BTRFS_IOC_DEFRAG ioctl on fd and returns the ioctl() result
 * unchanged.
 *
 * NOTE(review): how data is initialised before the ioctl is not visible in
 * this excerpt -- confirm it is zeroed. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends the record for one file to the pack file.
 *
 * pack:     output stream of the pack file being written
 * fn:       path of the file to examine
 * on_btrfs: NOTE(review): its use is not visible in this excerpt;
 *           presumably it gates a call to btrfs_defrag() -- confirm.
 *
 * The file is opened read-only, checked with file_verify(), mapped, and
 * mincore() is used to learn which of its pages are currently resident.
 * Resident runs are written to pack as pairs of page indices. Returns r,
 * which starts out as 0; the error paths that change it are not visible
 * here. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
/* Open without following symlinks and without updating the access time. */
96 if ((fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW)) < 0) {
101 if (errno == EPERM || errno == EACCES)
104 log_warning("open(%s) failed: %m", fn);
/* A result of zero or less from file_verify() takes the branch below. */
109 if ((k = file_verify(fd, fn, arg_file_size_max, &st)) <= 0) {
/* Map the whole file, rounded up to a page boundary, so that mincore() can
 * report residency page by page. */
117 l = PAGE_ALIGN(st.st_size);
118 if ((start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
124 pages = l / page_size();
127 memset(vec, 0, pages);
128 if (mincore(start, l, vec) < 0) {
129 log_warning("mincore(%s) failed: %m", fn);
/* Walk the residency vector and emit one (b, c) pair each time a resident
 * run ends. NOTE(review): the assignment of b at the start of a run is not
 * visible in this excerpt. */
138 for (c = 0; c < pages; c++) {
/* Only bit 0 of each vector byte is tested. */
139 bool new_mapped = !!(vec[c] & 1);
141 if (!mapped && new_mapped)
143 else if (mapped && !new_mapped) {
144 fwrite(&b, sizeof(b), 1, pack);
145 fwrite(&c, sizeof(c), 1, pack);
147 log_debug("%s: page %u to %u", fn, b, c);
153 /* We don't write any range data if we should read the entire file */
154 if (mapped && b > 0) {
155 fwrite(&b, sizeof(b), 1, pack);
156 fwrite(&c, sizeof(c), 1, pack);
158 log_debug("%s: page %u to %u", fn, b, c);
/* NOTE(review): b is written twice here; presumably this is the terminator
 * of the range list, with b reset just before -- confirm. */
163 fwrite(&b, sizeof(b), 1, pack);
164 fwrite(&b, sizeof(b), 1, pack);
167 if (start != MAP_FAILED)
171 close_nointr_nofail(fd);
/* Returns the fe_physical value of the first extent of fd, as reported by
 * the FS_IOC_FIEMAP ioctl, cast to unsigned long.
 *
 * NOTE(review): the values returned when the ioctl fails, when no extent is
 * mapped, or when the extent is flagged FIEMAP_EXTENT_UNKNOWN are not
 * visible in this excerpt. */
176 static unsigned long fd_first_block(int fd) {
178 struct fiemap fiemap;
179 struct fiemap_extent extent;
/* Query the whole file, but leave room for only one extent in the reply. */
183 data.fiemap.fm_length = ~0ULL;
184 data.fiemap.fm_extent_count = 1;
186 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
189 if (data.fiemap.fm_mapped_extents <= 0)
192 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
195 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: compares by block first and falls
 * back to strcmp() on path when neither block comparison matches.
 *
 * NOTE(review): the assignments of i and j from a and b, and the values
 * returned by the two block comparisons, are not visible in this excerpt. */
203 static int qsort_compare(const void *a, const void *b) {
204 const struct item *i, *j;
209 if (i->block < j->block)
211 if (i->block > j->block)
214 return strcmp(i->path, j->path);
/* Records which files get opened on the mount containing root and writes
 * the result to the pack file "<root>/.readahead".
 *
 * Collection ends when more than arg_files_max files were seen, when
 * arg_timeout has elapsed, when a signal arrives on the signalfd, or when a
 * "cancel" or "done" flag file shows up. The pack file is first written as
 * "<root>/.readahead.new" and then renamed into place.
 *
 * main() treats a negative return value as failure; how r is set on the
 * individual error paths is not visible in this excerpt. */
217 static int collect(const char *root) {
219 FD_FANOTIFY, /* Get the actual fs events */
221 FD_INOTIFY, /* We get notifications to quit early via this fd */
224 struct pollfd pollfd[_FD_MAX];
225 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
227 Hashmap *files = NULL;
232 char *pack_fn_new = NULL, *pack_fn = NULL;
233 bool on_ssd, on_btrfs;
236 uint64_t previous_block_readahead;
237 bool previous_block_readahead_set = false;
241 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
242 log_error("Out of memory");
247 /* If there's no pack file yet we lower the kernel readahead
248 * so that mincore() is accurate. If there is a pack file
249 * already we assume it is accurate enough so that kernel
250 * readahead is never triggered. */
251 previous_block_readahead_set =
252 access(pack_fn, F_OK) < 0 &&
253 block_get_readahead(root, &previous_block_readahead) >= 0 &&
254 block_set_readahead(root, 8*1024) >= 0;
/* Set our own OOM score adjustment to 1000 and request the idle I/O
 * priority class; a failure of the latter is only logged. */
256 write_one_line_file("/proc/self/oom_score_adj", "1000");
258 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
259 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT and SIGTERM and receive them through a signalfd instead, so
 * that they can be handled from the poll loop. */
261 assert_se(sigemptyset(&mask) == 0);
262 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
263 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
265 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
266 log_error("signalfd(): %m");
/* Maps each recorded path to the block number of its first extent. */
271 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
272 log_error("Failed to allocate set.");
/* Ask fanotify for FAN_OPEN events on the mount that root belongs to. */
277 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
278 log_error("Failed to create fanotify object: %m");
283 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
284 log_error("Failed to mark %s: %m", root);
289 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline on the monotonic clock after which collection ends. */
294 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
299 pollfd[FD_FANOTIFY].fd = fanotify_fd;
300 pollfd[FD_FANOTIFY].events = POLLIN;
301 pollfd[FD_SIGNAL].fd = signal_fd;
302 pollfd[FD_SIGNAL].events = POLLIN;
303 pollfd[FD_INOTIFY].fd = inotify_fd;
304 pollfd[FD_INOTIFY].events = POLLIN;
308 "STATUS=Collecting readahead data");
310 log_debug("Collecting...");
/* Check once for flag files that already exist; flag files created later
 * are picked up through the inotify events handled below. */
312 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
313 log_debug("Collection canceled");
318 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
319 log_debug("Got termination request");
325 struct fanotify_event_metadata metadata;
329 struct fanotify_event_metadata *m;
333 if (hashmap_size(files) > arg_files_max) {
334 log_debug("Reached maximum number of read ahead files, ending collection.");
338 t = now(CLOCK_MONOTONIC);
339 if (t >= not_after) {
340 log_debug("Reached maximum collection time, ending collection.");
/* Wait for any of the three fds; the time left until the deadline is
 * converted from microseconds to the milliseconds poll() expects. */
344 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
349 log_error("poll(): %m");
355 log_debug("Reached maximum collection time, ending collection.");
359 if (pollfd[FD_SIGNAL].revents) {
360 log_debug("Got signal.");
/* Walk the inotify events that were read, looking for the creation of a
 * file named "cancel" or "done". */
364 if (pollfd[FD_INOTIFY].revents) {
365 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
366 struct inotify_event *e;
368 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
369 if (errno == EINTR || errno == EAGAIN)
372 log_error("Failed to read inotify event: %m");
377 e = (struct inotify_event*) inotify_buffer;
381 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
382 log_debug("Collection canceled");
387 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
388 log_debug("Got termination request");
/* Each event is followed by e->len bytes of name; step over both. */
392 step = sizeof(struct inotify_event) + e->len;
393 assert(step <= (size_t) n);
395 e = (struct inotify_event*) ((uint8_t*) e + step);
400 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
402 if (errno == EINTR || errno == EAGAIN)
405 /* fanotify sometimes returns EACCES on read()
406 * where it shouldn't. For now let's just
407 * ignore it here (which is safe), but
408 * eventually this should be
409 * dropped when the kernel is fixed.
411 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
415 log_error("Failed to read event: %m");
/* Iterate over every fanotify event contained in the n bytes just read. */
420 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Events caused by this process, or by the pid stored in shared->replay,
 * are special-cased. NOTE(review): the branch bodies are not visible in
 * this excerpt; presumably such events are skipped. */
427 if (m->pid == my_pid)
430 __sync_synchronize();
431 if (m->pid == shared->replay)
/* The event carries an open fd for the accessed file; resolve it to a path
 * through /proc/self/fd. Paths starting with "/tmp", paths ending in
 * " (deleted)" and paths already present in files are filtered; other paths
 * are stored in files together with the first block of the file. */
434 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
437 if ((k = readlink_malloc(fn, &p)) >= 0) {
438 if (startswith(p, "/tmp") ||
439 endswith(p, " (deleted)") ||
440 hashmap_get(files, p))
441 /* Not interesting, or
447 ul = fd_first_block(m->fd);
449 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
450 log_warning("set_put() failed: %s", strerror(-k));
456 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
460 close_nointr_nofail(m->fd);
465 if (fanotify_fd >= 0) {
466 close_nointr_nofail(fanotify_fd);
470 log_debug("Writing Pack File...");
472 on_ssd = fs_on_ssd(root) > 0;
473 log_debug("On SSD: %s", yes_no(on_ssd));
475 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
476 log_debug("On btrfs: %s", yes_no(on_btrfs));
478 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
479 log_error("Out of memory");
484 if (!(pack = fopen(pack_fn_new, "we"))) {
485 log_error("Failed to open pack file: %m");
490 fputs(CANONICAL_HOST "\n", pack);
491 putc(on_ssd ? 'S' : 'R', pack);
493 if (on_ssd || on_btrfs) {
495 /* On SSD or on btrfs, just write things out in the
496 * order the files were accessed. */
498 HASHMAP_FOREACH_KEY(q, p, files, i)
499 pack_file(pack, p, on_btrfs);
501 struct item *ordered, *j;
504 /* On rotating media, order things by the block
507 log_debug("Ordering...");
509 n = hashmap_size(files);
510 if (!(ordered = new(struct item, n))) {
511 log_error("Out of memory");
517 HASHMAP_FOREACH_KEY(q, p, files, i) {
519 j->block = PTR_TO_SECTOR(q);
523 assert(ordered + n == j);
525 qsort(ordered, n, sizeof(struct item), qsort_compare);
527 for (k = 0; k < n; k++)
528 pack_file(pack, ordered[k].path, on_btrfs);
533 log_debug("Finalizing...");
538 log_error("Failed to write pack file.");
543 if (rename(pack_fn_new, pack_fn) < 0) {
544 log_error("Failed to rename readahead file: %m");
555 if (fanotify_fd >= 0)
556 close_nointr_nofail(fanotify_fd);
559 close_nointr_nofail(signal_fd);
562 close_nointr_nofail(inotify_fd);
571 while ((p = hashmap_steal_first_key(files)))
576 if (previous_block_readahead_set) {
579 /* Restore the original kernel readahead setting if we
580 * changed it, and nobody has overwritten it since
582 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
583 block_set_readahead(root, previous_block_readahead);
/* Prints the usage text to stdout.
 *
 * The long option names shown here must match the ones accepted by the
 * getopt_long() table in parse_argv(): files-max, file-size-max and
 * timeout. The text previously advertised --max-files and --max-file-size,
 * which the parser does not accept. */
589 static int help(void) {
591 printf("%s [OPTIONS...] [DIRECTORY]\n\n"
592 "Collect read-ahead data on early boot.\n\n"
593 " -h --help Show this help\n"
594 " --files-max=INT Maximum number of files to read ahead\n"
595 " --file-size-max=BYTES Maximum size of files to read ahead\n"
596 " --timeout=USEC Maximum time to spend collecting data\n",
597 program_invocation_short_name);
/* Parses the command line into arg_files_max, arg_file_size_max and
 * arg_timeout.
 *
 * Accepted options are -h/--help, --files-max, --file-size-max and
 * --timeout. Each value must parse successfully and be non-zero, otherwise
 * an error is logged.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * main() maps a negative result to EXIT_FAILURE. */
602 static int parse_argv(int argc, char *argv[]) {
/* Codes for long-only options start at 0x100, which cannot clash with a
 * single-character option code. */
605 ARG_FILES_MAX = 0x100,
610 static const struct option options[] = {
611 { "help", no_argument, NULL, 'h' },
612 { "files-max", required_argument, NULL, ARG_FILES_MAX },
613 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
614 { "timeout", required_argument, NULL, ARG_TIMEOUT },
623 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
/* arg_files_max is unsigned, so "<= 0" rejects exactly the value 0. */
632 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
633 log_error("Failed to parse maximum number of files %s.", optarg);
638 case ARG_FILE_SIZE_MAX: {
639 unsigned long long ull;
641 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
642 log_error("Failed to parse maximum file size %s.", optarg);
646 arg_file_size_max = (off_t) ull;
651 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
652 log_error("Failed to parse timeout %s.", optarg);
662 log_error("Unknown option code %c", c);
667 if (optind != argc &&
676 int main(int argc, char *argv[]) {
680 log_set_target(LOG_TARGET_AUTO);
681 log_parse_environment();
686 r = parse_argv(argc, argv);
688 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
690 root = optind < argc ? argv[optind] : "/";
692 /* Skip this step on read-only media. Note that we check the
693 * underlying block device here, not the read-only flag of the
694 * file system on top, since that one is most likely mounted
695 * read-only anyway at boot, even if the underlying block
696 * device is theoretically writable. */
697 if (fs_on_read_only(root) > 0) {
698 log_info("Disabling readahead collector due to read-only media.");
703 log_info("Disabling readahead collector due to low memory.");
707 shared = shared_get();
711 shared->collect = getpid();
712 __sync_synchronize();
714 if (collect(root) < 0)