1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
/* Upper bound on the number of distinct files recorded: collect() stops
 * collecting once hashmap_size(files) exceeds this value. Overridable via
 * the files-max command line option. */
65 static unsigned arg_files_max = 16*1024;
/* Size limit handed to file_verify() by pack_file(). Overridable via the
 * file-size-max command line option. */
66 static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
/* Maximum collection time; collect() adds it to the monotonic clock to
 * compute its deadline. Overridable via the timeout command line option. */
67 static usec_t arg_timeout = 2*USEC_PER_MINUTE;
/* Obtained through shared_get() in main(). main() stores its own pid in
 * the collect field and collect() skips fanotify events whose pid equals
 * the replay field. */
69 static ReadaheadShared *shared = NULL;
/* Block numbers are stored as hashmap values (see hashmap_put() in
 * collect()). They are shifted by one on the way in and back on the way
 * out so that block 0 is not encoded as a NULL pointer. */
71 /* Avoid collisions with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Issues the BTRFS_IOC_DEFRAG ioctl on fd and returns the result of
 * ioctl() unchanged.
 *
 * NOTE(review): the statements that initialise data before the ioctl, and
 * the closing brace, are not visible in this view. Confirm against the
 * full file that data is zeroed and filled in before it is passed on. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends the readahead record for one file to the pack stream.
 *
 * pack:     output stream of the pack file being written
 * fn:       path of the file to describe
 * on_btrfs: whether the file system is btrfs. NOTE(review): the place
 *           where this flag is consumed is not visible in this view.
 *
 * The file is opened read-only without following symlinks, checked with
 * file_verify() against arg_file_size_max, mapped in full and probed with
 * mincore(). Each run of pages reported by mincore() is written as a pair
 * of page indexes (b, c), followed by a pair of identical values.
 *
 * Write errors from fwrite() are not checked here.
 *
 * NOTE(review): several declarations, the bodies of the error branches and
 * the return statement are missing from this view, so the exact return
 * convention cannot be stated from here. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
/* O_NOFOLLOW: refuse to open fn if it is a symlink. O_NOATIME: do not
 * update the access time while inspecting the file. */
96 if ((fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW)) < 0) {
/* Permission errors get their own branch; its body is not visible here. */
101 if (errno == EPERM || errno == EACCES)
104 log_warning("open(%s) failed: %m", fn);
/* A result of zero or less from file_verify() ends processing of this
 * file; the branch body is not visible here. */
109 if ((k = file_verify(fd, fn, arg_file_size_max, &st)) <= 0) {
/* Map the whole file, with the length rounded up by PAGE_ALIGN(). */
117 l = PAGE_ALIGN(st.st_size);
118 if ((start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
/* One byte of vec per page of the mapping. */
124 pages = l / page_size();
127 memset(vec, 0, pages);
128 if (mincore(start, l, vec) < 0) {
129 log_warning("mincore(%s) failed: %m", fn);
/* Walk the mincore() vector, testing the low bit of every entry, and emit
 * one (b, c) pair each time a run of set entries ends. */
138 for (c = 0; c < pages; c++) {
139 bool new_mapped = !!(vec[c] & 1);
/* Start of a run; the statement that records it is not visible here. */
141 if (!mapped && new_mapped)
/* End of a run: write it out. */
143 else if (mapped && !new_mapped) {
144 fwrite(&b, sizeof(b), 1, pack);
145 fwrite(&c, sizeof(c), 1, pack);
147 log_debug("%s: page %u to %u", fn, b, c);
153 /* We don't write any range data if we should read the entire file */
/* A run still open at the end of the file is written only when it did not
 * start at page 0. */
154 if (mapped && b > 0) {
155 fwrite(&b, sizeof(b), 1, pack);
156 fwrite(&c, sizeof(c), 1, pack);
158 log_debug("%s: page %u to %u", fn, b, c);
/* The same value is written twice. NOTE(review): presumably this marks the
 * end of the range list; the assignment to b that precedes it is not
 * visible here, so confirm against the full file. */
163 fwrite(&b, sizeof(b), 1, pack);
164 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: the mapping is only released if mmap() succeeded (the release
 * call itself is not visible here). */
167 if (start != MAP_FAILED)
171 close_nointr_nofail(fd);
/* Queries the FS_IOC_FIEMAP ioctl for the first extent of the file behind
 * fd and returns the fe_physical value of that extent, converted to
 * unsigned long.
 *
 * The request covers the whole file (fm_length is all ones) but asks for
 * a single extent only (fm_extent_count is 1).
 *
 * NOTE(review): the declaration that wraps the two members into data, and
 * the values returned when the ioctl fails, when no extent is mapped or
 * when the extent is flagged FIEMAP_EXTENT_UNKNOWN, are not visible in
 * this view. */
176 static unsigned long fd_first_block(int fd) {
178 struct fiemap fiemap;
179 struct fiemap_extent extent;
183 data.fiemap.fm_length = ~0ULL;
184 data.fiemap.fm_extent_count = 1;
186 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
189 if (data.fiemap.fm_mapped_extents <= 0)
192 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
195 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* Comparison callback handed to qsort() in collect() for arrays of
 * struct item. Items are compared by their block field first; when
 * neither block is smaller than the other, the result is strcmp() of the
 * two path fields.
 *
 * NOTE(review): the assignments of i and j from a and b, and the values
 * returned by the two block comparisons, are not visible in this view. */
203 static int qsort_compare(const void *a, const void *b) {
204 const struct item *i, *j;
209 if (i->block < j->block)
211 if (i->block > j->block)
214 return strcmp(i->path, j->path);
/* Records which files are opened below root and writes a pack file
 * describing them.
 *
 * Visible flow:
 *  1. Writes 1000 to /proc/self/oom_score_adj and requests the idle IO
 *     priority class for this process.
 *  2. Blocks SIGINT and SIGTERM and receives them through a signalfd.
 *  3. Watches FAN_OPEN events on the mount that contains root through
 *     fanotify, and watches for control files through inotify.
 *  4. Loops on poll() until one of: a signal arrives, a file named cancel
 *     or done is created (or already exists under /run/systemd/readahead),
 *     more than arg_files_max files were recorded, or arg_timeout elapsed.
 *  5. For every fanotify event that is not skipped, resolves the event fd
 *     to a path through /proc/self/fd and stores the path in a hashmap,
 *     together with the value of fd_first_block() for that fd.
 *  6. Writes root/.readahead.new: a CANONICAL_HOST line, then the letter S
 *     or R depending on fs_on_ssd(), then one pack_file() record per path.
 *     On SSD or btrfs the hashmap is iterated directly; otherwise the
 *     entries are copied into an array and sorted with qsort_compare().
 *  7. Renames root/.readahead.new to root/.readahead.
 *  8. Closes the descriptors and drains the hashmap keys.
 *
 * NOTE(review): many lines of this function are missing from this view,
 * including the bodies of most error branches, the return statement and
 * the ends of two comments in the second half. The return convention is
 * therefore not stated here; the caller in main() only tests for a
 * negative result. */
217 static int collect(const char *root) {
219 FD_FANOTIFY, /* Get the actual fs events */
221 FD_INOTIFY, /* We get notifications to quit early via this fd */
224 struct pollfd pollfd[_FD_MAX];
225 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
/* Maps each recorded path to its first block, encoded by SECTOR_TO_PTR(). */
227 Hashmap *files = NULL;
232 char *pack_fn_new = NULL, *pack_fn = NULL;
233 bool on_ssd, on_btrfs;
/* The result of this write is not checked. */
239 write_one_line_file("/proc/self/oom_score_adj", "1000");
/* Failing to get the idle IO class is only worth a warning. */
241 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
242 log_warning("Failed to set IDLE IO priority class: %m");
/* Route SIGINT and SIGTERM through a file descriptor so that they can be
 * handled by the poll() loop below. */
244 assert_se(sigemptyset(&mask) == 0);
245 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
246 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
248 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
249 log_error("signalfd(): %m");
254 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
255 log_error("Failed to allocate set.");
/* The second argument sets the open flags of the descriptors that
 * fanotify hands out with each event. */
260 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
261 log_error("Failed to create fanotify object: %m");
/* FAN_MARK_MOUNT: watch the whole mount that root lives on. */
266 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
267 log_error("Failed to mark %s: %m", root);
272 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline on the monotonic clock. */
277 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
282 pollfd[FD_FANOTIFY].fd = fanotify_fd;
283 pollfd[FD_FANOTIFY].events = POLLIN;
284 pollfd[FD_SIGNAL].fd = signal_fd;
285 pollfd[FD_SIGNAL].events = POLLIN;
286 pollfd[FD_INOTIFY].fd = inotify_fd;
287 pollfd[FD_INOTIFY].events = POLLIN;
291 "STATUS=Collecting readahead data");
293 log_debug("Collecting...");
/* The control files may already exist before the first inotify event can
 * be seen, so test for them once up front. */
295 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
296 log_debug("Collection canceled");
301 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
302 log_debug("Got termination request");
308 struct fanotify_event_metadata metadata;
312 struct fanotify_event_metadata *m;
/* Stop conditions checked on every iteration: file count and deadline. */
316 if (hashmap_size(files) > arg_files_max) {
317 log_debug("Reached maximum number of read ahead files, ending collection.");
321 t = now(CLOCK_MONOTONIC);
322 if (t >= not_after) {
323 log_debug("Reached maximum collection time, ending collection.");
/* The poll() timeout is the time left until the deadline, in milliseconds. */
327 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
332 log_error("poll(): %m");
338 log_debug("Reached maximum collection time, ending collection.");
342 if (pollfd[FD_SIGNAL].revents) {
343 log_debug("Got signal.");
/* Control channel: look for creation of a file named cancel or done. */
347 if (pollfd[FD_INOTIFY].revents) {
348 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
349 struct inotify_event *e;
351 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
352 if (errno == EINTR || errno == EAGAIN)
355 log_error("Failed to read inotify event: %m");
360 e = (struct inotify_event*) inotify_buffer;
364 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
365 log_debug("Collection canceled");
370 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
371 log_debug("Got termination request");
/* Advance to the next event in the buffer: fixed header plus name length. */
375 step = sizeof(struct inotify_event) + e->len;
376 assert(step <= (size_t) n);
378 e = (struct inotify_event*) ((uint8_t*) e + step);
383 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
388 /* fanotify sometimes returns EACCES on read()
389 * where it shouldn't. For now let's just
390 * ignore it here (which is safe), but
391 * eventually this should be
392 * dropped when the kernel is fixed.
394 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
398 log_error("Failed to read event: %m");
/* Iterate over all events returned by this read(). */
403 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Events caused by my_pid, or by the pid stored in shared->replay, get
 * their own branches. NOTE(review): presumably they are skipped; the
 * branch bodies and the assignment of my_pid are not visible here. */
410 if (m->pid == my_pid)
413 __sync_synchronize();
414 if (m->pid == shared->replay)
/* Turn the event descriptor into a path through its /proc/self/fd link. */
417 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
/* Paths below /tmp, deleted files and paths already present in the hashmap
 * are filtered out by the condition below. */
420 if ((k = readlink_malloc(fn, &p)) >= 0) {
421 if (startswith(p, "/tmp") ||
422 endswith(p, " (deleted)") ||
423 hashmap_get(files, p))
424 /* Not interesting, or
430 ul = fd_first_block(m->fd);
432 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
433 log_warning("set_put() failed: %s", strerror(-k));
439 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
443 close_nointr_nofail(m->fd);
448 if (fanotify_fd >= 0) {
449 close_nointr_nofail(fanotify_fd);
453 log_debug("Writing Pack File...");
455 on_ssd = fs_on_ssd(root) > 0;
456 log_debug("On SSD: %s", yes_no(on_ssd));
458 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
459 log_debug("On btrfs: %s", yes_no(on_btrfs));
461 asprintf(&pack_fn, "%s/.readahead", root);
462 asprintf(&pack_fn_new, "%s/.readahead.new", root);
464 if (!pack_fn || !pack_fn_new) {
465 log_error("Out of memory");
470 if (!(pack = fopen(pack_fn_new, "we"))) {
471 log_error("Failed to open pack file: %m");
476 fputs(CANONICAL_HOST "\n", pack);
477 putc(on_ssd ? 'S' : 'R', pack);
479 if (on_ssd || on_btrfs) {
481 /* On SSD or on btrfs, just write things out in the
482 * order the files were accessed. */
484 HASHMAP_FOREACH_KEY(q, p, files, i)
485 pack_file(pack, p, on_btrfs);
487 struct item *ordered, *j;
490 /* On rotating media, order things by the block
493 log_debug("Ordering...");
495 n = hashmap_size(files);
496 if (!(ordered = new(struct item, n))) {
497 log_error("Out of memory");
503 HASHMAP_FOREACH_KEY(q, p, files, i) {
505 j->block = PTR_TO_SECTOR(q);
509 assert(ordered + n == j);
511 qsort(ordered, n, sizeof(struct item), qsort_compare);
513 for (k = 0; k < n; k++)
514 pack_file(pack, ordered[k].path, on_btrfs);
519 log_debug("Finalizing...");
524 log_error("Failed to write pack file.");
529 if (rename(pack_fn_new, pack_fn) < 0) {
530 log_error("Failed to rename readahead file: %m");
541 if (fanotify_fd >= 0)
542 close_nointr_nofail(fanotify_fd);
545 close_nointr_nofail(signal_fd);
548 close_nointr_nofail(inotify_fd);
558 while ((p = hashmap_steal_first_key(files)))
/* Prints the command line usage summary to stdout, prefixed with
 * program_invocation_short_name.
 *
 * The long option spellings listed here must stay identical to the names
 * registered in the getopt_long() table of parse_argv(): files-max,
 * file-size-max and timeout. The text previously advertised max-files and
 * max-file-size, which that table does not accept.
 *
 * NOTE(review): the return statement and closing brace of this function
 * are not visible in this view. */
566 static int help(void) {
568 printf("%s [OPTIONS...] [DIRECTORY]\n\n"
569 "Collect read-ahead data on early boot.\n\n"
570 " -h --help Show this help\n"
571 " --files-max=INT Maximum number of files to read ahead\n"
572 " --file-size-max=BYTES Maximum size of files to read ahead\n"
573 " --timeout=USEC Maximum time to spend collecting data\n",
574 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in
 * the file-scope variables arg_files_max, arg_file_size_max and
 * arg_timeout.
 *
 * Recognised options: -h or --help, --files-max, --file-size-max and
 * --timeout. Each value must parse and must be greater than zero;
 * otherwise an error is logged.
 *
 * The caller in main() treats a negative result as failure, zero as a
 * request to exit successfully, and a positive result as permission to
 * continue.
 *
 * NOTE(review): the case labels, the return statements and the rest of
 * the final argument-count check are not visible in this view. */
579 static int parse_argv(int argc, char *argv[]) {
/* Codes for long-only options start above the range of single characters. */
582 ARG_FILES_MAX = 0x100,
587 static const struct option options[] = {
588 { "help", no_argument, NULL, 'h' },
589 { "files-max", required_argument, NULL, ARG_FILES_MAX },
590 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
591 { "timeout", required_argument, NULL, ARG_TIMEOUT },
600 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
/* The value is parsed straight into arg_files_max, so the global is
 * overwritten before the zero check runs. */
609 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
610 log_error("Failed to parse maximum number of files %s.", optarg);
615 case ARG_FILE_SIZE_MAX: {
616 unsigned long long ull;
/* Parsed into a temporary first, then converted to off_t. */
618 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
619 log_error("Failed to parse maximum file size %s.", optarg);
623 arg_file_size_max = (off_t) ull;
628 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
629 log_error("Failed to parse timeout %s.", optarg);
639 log_error("Unknown option code %c", c);
/* Check on leftover positional arguments; the remainder of the condition
 * is not visible here. */
644 if (optind != argc &&
653 int main(int argc, char *argv[]) {
657 log_set_target(LOG_TARGET_AUTO);
658 log_parse_environment();
663 if ((r = parse_argv(argc, argv)) <= 0)
664 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
666 root = optind < argc ? argv[optind] : "/";
668 if (fs_on_read_only(root) > 0) {
669 log_info("Disabling readahead collector due to read-only media.");
674 log_info("Disabling readahead collector due to low memory.");
678 if (detect_virtualization(NULL) > 0) {
679 log_info("Disabling readahead collector due to execution in virtualized environment.");
683 if (!(shared = shared_get()))
686 shared->collect = getpid();
687 __sync_synchronize();
689 if (collect(root) < 0)