1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
/* Shared-memory segment used to coordinate with the readahead replay
 * process (see shared->replay use below); obtained via shared_get() in
 * main_collect(). */
70 static ReadaheadShared *shared = NULL;
/* Monotonic timestamp taken when collection starts; used to bin file
 * accesses by time (see entry->bin computation in collect()). */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Sectors are stored in pointer-keyed containers; shifting by +1 keeps
 * sector 0 from mapping to NULL. PTR_TO_SECTOR undoes the shift. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Asks btrfs to defragment the file referred to by fd, so that its
 * extents are contiguous for later sequential readahead.
 * Returns the ioctl() result: 0 on success, -1 with errno set on error.
 * NOTE(review): the initialization of 'data' is not visible in this
 * excerpt — presumably zeroed before the ioctl; confirm in full source. */
77 static int btrfs_defrag(int fd) {
78         struct btrfs_ioctl_vol_args data;
83         return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Appends one file's readahead record to the pack file: the file name
 * (written in an elided portion), its inode number, and the list of
 * page ranges currently resident in the page cache (per mincore()).
 * Ranges are encoded as pairs of page indices; a terminating pair of
 * equal values (0,0 via 'b') marks the end of the record.
 * Returns 0 on success or if the file is skipped, negative on hard error.
 * NOTE(review): the on_btrfs parameter is unused in the visible lines —
 * presumably it gates a btrfs_defrag() call in an elided section. */
86 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
88         void *start = MAP_FAILED;
94         int r = 0, fd = -1, k;
        /* O_NOATIME so collection doesn't perturb access times;
         * O_NOFOLLOW so symlinks are not chased. */
99         fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        /* Permission failures are expected (O_NOATIME needs ownership or
         * CAP_FOWNER) and are not worth a warning. */
105                 if (errno == EPERM || errno == EACCES)
108                 log_warning("open(%s) failed: %m", fn);
        /* Sanity checks (size limit, regular file, ...) delegated to
         * file_verify(); also fills in 'st'. */
113         k = file_verify(fd, fn, arg_file_size_max, &st);
        /* Map the whole file (page-aligned length) so mincore() can
         * report which of its pages are resident. */
122         l = PAGE_ALIGN(st.st_size);
123         start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
124         if (start == MAP_FAILED) {
125                 log_warning("mmap(%s) failed: %m", fn);
130         pages = l / page_size();
132         memset(vec, 0, pages);
133         if (mincore(start, l, vec) < 0) {
134                 log_warning("mincore(%s) failed: %m", fn);
142         /* Store the inode, so that we notice when the file is deleted */
143         inode = (uint64_t) st.st_ino;
144         fwrite(&inode, sizeof(inode), 1, pack);
        /* Walk the mincore() vector and emit [b, c) ranges of resident
         * pages: 'b' records where a run starts, a transition back to
         * non-resident flushes the pair. */
147         for (c = 0; c < pages; c++) {
148                 bool new_mapped = !!(vec[c] & 1);
150                 if (!mapped && new_mapped)
152                 else if (mapped && !new_mapped) {
153                         fwrite(&b, sizeof(b), 1, pack);
154                         fwrite(&c, sizeof(c), 1, pack);
156                         log_debug("%s: page %u to %u", fn, b, c);
162         /* We don't write any range data if we should read the entire file */
        /* Flush a run still open at the end of the vector — unless it
         * started at page 0, i.e. the whole file is resident. */
163         if (mapped && b > 0) {
164                 fwrite(&b, sizeof(b), 1, pack);
165                 fwrite(&c, sizeof(c), 1, pack);
167                 log_debug("%s: page %u to %u", fn, b, c);
        /* End-of-record marker: a degenerate pair (same value twice).
         * NOTE(review): 'b' is presumably reset to 0 in an elided line. */
172         fwrite(&b, sizeof(b), 1, pack);
173         fwrite(&b, sizeof(b), 1, pack);
        /* Cleanup path: unmap (elided) and close on all exits. */
176         if (start != MAP_FAILED)
180                 close_nointr_nofail(fd);
/* Returns the physical block (byte offset of the first extent, per
 * FS_IOC_FIEMAP) of the file behind fd, for sorting files into on-disk
 * order on rotating media. Returns 0 when the extent cannot be
 * determined (ioctl failure, no extents, or unknown extent location),
 * per the elided early-return paths. */
185 static unsigned long fd_first_block(int fd) {
        /* 'data' (elided declaration) embeds a fiemap header plus room
         * for exactly one inline fiemap_extent. */
187                 struct fiemap fiemap;
188                 struct fiemap_extent extent;
        /* Query the whole file but ask for at most one extent — we only
         * care where the file starts. */
192         data.fiemap.fm_length = ~0ULL;
193         data.fiemap.fm_extent_count = 1;
195         if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
198         if (data.fiemap.fm_mapped_extents <= 0)
201         if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
204         return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders by access-time bin first
 * (comparison elided here), then by first physical block, and finally
 * by path as a deterministic tie-breaker. Used to lay out the pack
 * file in rough disk order on rotating media. */
213 static int qsort_compare(const void *a, const void *b) {
214         const struct item *i, *j;
219         /* sort by bin first */
225         /* then sort by sector */
226         if (i->block < j->block)
228         if (i->block > j->block)
        /* Equal bin and block: fall back to path comparison so the sort
         * is total and stable across runs. */
231         return strcmp(i->path, j->path);
/* Core collection pass: watches all file opens below 'root' via
 * fanotify for a bounded time window, records each accessed file
 * (path, first physical block, time bin), then writes them out as a
 * pack file ($root/.readahead) — in access order on SSD/btrfs, in
 * disk-block order on rotating media. Terminates early on SIGINT/
 * SIGTERM, on a "cancel"/"done" marker in /run/systemd/readahead
 * (detected via inotify), on reaching arg_files_max files, or on
 * timeout. Returns 0 on success, negative on error (elided paths). */
234 static int collect(const char *root) {
        /* Slots in the pollfd array below. */
236                 FD_FANOTIFY, /* Get the actual fs events */
238                 FD_INOTIFY, /* We get notifications to quit early via this fd */
241         struct pollfd pollfd[_FD_MAX];
242         int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
244         Hashmap *files = NULL;
249         char *pack_fn_new = NULL, *pack_fn = NULL;
250         bool on_ssd, on_btrfs;
253         uint64_t previous_block_readahead;
254         bool previous_block_readahead_set = false;
258         if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
263         starttime = now(CLOCK_MONOTONIC);
265         /* If there's no pack file yet we lower the kernel readahead
266          * so that mincore() is accurate. If there is a pack file
267          * already we assume it is accurate enough so that kernel
268          * readahead is never triggered. */
269         previous_block_readahead_set =
270                 access(pack_fn, F_OK) < 0 &&
271                 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272                 block_set_readahead(root, 8*1024) >= 0;
        /* Run at idle I/O priority so collection doesn't compete with
         * the boot it is trying to observe. */
274         if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275                 log_warning("Failed to set IDLE IO priority class: %m");
        /* Block SIGINT/SIGTERM and receive them via signalfd instead,
         * so they can be multiplexed with the other fds in poll(). */
277         assert_se(sigemptyset(&mask) == 0);
278         sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
281         if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282                 log_error("signalfd(): %m");
        /* Path -> struct item map of everything seen so far. */
287         files = hashmap_new(string_hash_func, string_compare_func);
289                 log_error("Failed to allocate set.");
        /* Watch every open() on the whole mount containing 'root'. */
294         fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
295         if (fanotify_fd < 0) {
296                 log_error("Failed to create fanotify object: %m");
301         if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
302                 log_error("Failed to mark %s: %m", root);
        /* inotify on /run/systemd/readahead, for early-exit requests. */
307         inotify_fd = open_inotify();
308         if (inotify_fd < 0) {
313         not_after = now(CLOCK_MONOTONIC) + arg_timeout;
318         pollfd[FD_FANOTIFY].fd = fanotify_fd;
319         pollfd[FD_FANOTIFY].events = POLLIN;
320         pollfd[FD_SIGNAL].fd = signal_fd;
321         pollfd[FD_SIGNAL].events = POLLIN;
322         pollfd[FD_INOTIFY].fd = inotify_fd;
323         pollfd[FD_INOTIFY].events = POLLIN;
        /* sd_notify() status update (call start elided above). */
327                   "STATUS=Collecting readahead data");
329         log_debug("Collecting...");
        /* Check for cancel/done markers that may already exist before
         * the inotify watch was able to report them. */
331         if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
332                 log_debug("Collection canceled");
337         if (access("/run/systemd/readahead/done", F_OK) >= 0) {
338                 log_debug("Got termination request");
        /* Main event loop (loop construct elided): drain fanotify and
         * inotify until one of the stop conditions fires. */
344                         struct fanotify_event_metadata metadata;
348                 struct fanotify_event_metadata *m;
352                 if (hashmap_size(files) > arg_files_max) {
353                         log_debug("Reached maximum number of read ahead files, ending collection.");
357                 t = now(CLOCK_MONOTONIC);
358                 if (t >= not_after) {
359                         log_debug("Reached maximum collection time, ending collection.");
        /* Poll with a timeout equal to the remaining collection window. */
363                 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
368                         log_error("poll(): %m");
374                         log_debug("Reached maximum collection time, ending collection.");
378                 if (pollfd[FD_SIGNAL].revents) {
379                         log_debug("Got signal.");
383                 if (pollfd[FD_INOTIFY].revents) {
384                         uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
385                         struct inotify_event *e;
387                         if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
388                                 if (errno == EINTR || errno == EAGAIN)
391                                 log_error("Failed to read inotify event: %m");
                        /* Walk the variable-length inotify events in the
                         * buffer, looking for cancel/done creation. */
396                         e = (struct inotify_event*) inotify_buffer;
400                                 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
401                                         log_debug("Collection canceled");
406                                 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
407                                         log_debug("Got termination request");
411                                 step = sizeof(struct inotify_event) + e->len;
412                                 assert(step <= (size_t) n);
414                                 e = (struct inotify_event*) ((uint8_t*) e + step);
419                 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
421                         if (errno == EINTR || errno == EAGAIN)
424                         /* fanotify sometimes returns EACCES on read()
425                          * where it shouldn't. For now let's just
426                          * ignore it here (which is safe), but
427                          * eventually this should be
428                          * dropped when the kernel is fixed.
430                          * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
434                         log_error("Failed to read event: %m");
                /* Iterate the batch of fanotify events just read. */
439                 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                        /* Ignore opens performed by ourselves... */
446                         if (m->pid == my_pid)
                        /* ...and by the replay process, whose pid is
                         * published in the shared segment; the barrier
                         * makes sure we read a current value. */
449                         __sync_synchronize();
450                         if (m->pid == shared->replay)
                        /* Resolve the event's fd back to a path via
                         * /proc/self/fd. */
453                         snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
456                         if ((k = readlink_malloc(fn, &p)) >= 0) {
457                                 if (startswith(p, "/tmp") ||
458                                     endswith(p, " (deleted)") ||
459                                     hashmap_get(files, p))
460                                         /* Not interesting, or
468                                         entry = new0(struct item, 1);
474                                         ul = fd_first_block(m->fd);
476                                         entrytime = now(CLOCK_MONOTONIC);
479                                         entry->path = strdup(p);
                                        /* Bin accesses into 2-second
                                         * buckets since collection start. */
485                                         entry->bin = (entrytime - starttime) / 2000000;
487                                         if ((k = hashmap_put(files, p, entry)) < 0) {
488                                                 log_warning("set_put() failed: %s", strerror(-k));
494                                 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
                        /* Every fanotify event carries an open fd that we
                         * must close, whether or not we recorded it. */
498                         close_nointr_nofail(m->fd);
        /* Collection done; stop watching before writing the pack file so
         * our own writes don't generate events. */
503         if (fanotify_fd >= 0) {
504                 close_nointr_nofail(fanotify_fd);
508         log_debug("Writing Pack File...");
510         on_ssd = fs_on_ssd(root) > 0;
511         log_debug("On SSD: %s", yes_no(on_ssd));
513         on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
514         log_debug("On btrfs: %s", yes_no(on_btrfs));
        /* Write to a temp file and rename into place below, so readers
         * never see a half-written pack file. */
516         if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
521         pack = fopen(pack_fn_new, "we");
523                 log_error("Failed to open pack file: %m");
        /* Header: host/version magic plus 'S' (SSD: access order) or
         * 'R' (rotating: block order) layout tag. */
528         fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
529         putc(on_ssd ? 'S' : 'R', pack);
531         if (on_ssd || on_btrfs) {
533                 /* On SSD or on btrfs, just write things out in the
534                  * order the files were accessed. */
536                 HASHMAP_FOREACH_KEY(q, p, files, i)
537                         pack_file(pack, p, on_btrfs);
539                 struct item *ordered, *j;
542                 /* On rotating media, order things by the block
545                 log_debug("Ordering...");
                /* Copy the hashmap entries into a flat array, sort it
                 * with qsort_compare(), and emit in that order. */
547                 n = hashmap_size(files);
548                 if (!(ordered = new(struct item, n))) {
554                 HASHMAP_FOREACH_KEY(q, p, files, i) {
555                         memcpy(j, q, sizeof(struct item));
559                 assert(ordered + n == j);
561                 qsort(ordered, n, sizeof(struct item), qsort_compare);
563                 for (k = 0; k < n; k++)
564                         pack_file(pack, ordered[k].path, on_btrfs);
569         log_debug("Finalizing...");
        /* Error check (condition elided — presumably ferror/fclose). */
574                 log_error("Failed to write pack file.");
        /* Atomically publish the new pack file. */
579         if (rename(pack_fn_new, pack_fn) < 0) {
580                 log_error("Failed to rename readahead file: %m");
        /* Common cleanup: close fds, free hashmap contents and paths. */
591         if (fanotify_fd >= 0)
592                 close_nointr_nofail(fanotify_fd);
595                 close_nointr_nofail(signal_fd);
598                 close_nointr_nofail(inotify_fd);
607         while ((p = hashmap_steal_first_key(files)))
612         if (previous_block_readahead_set) {
615                 /* Restore the original kernel readahead setting if we
616                  * changed it, and nobody has overwritten it since
                /* Only restore if the current value is still our own
                 * 8 KiB setting, i.e. nobody else changed it meanwhile. */
618                 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
619                         block_set_readahead(root, previous_block_readahead);
/* Entry point for the collection phase: bails out early when collection
 * would be pointless (read-only media, low memory — checks partly
 * elided), publishes our pid in the shared segment so the replay side
 * can identify us, then runs collect(). Return value handling after the
 * collect() call is not visible in this excerpt. */
625 int main_collect(const char *root) {
630         /* Skip this step on read-only media. Note that we check the
631          * underlying block device here, not he read-only flag of the
632          * file system on top, since that one is most likely mounted
633          * read-only anyway at boot, even if the underlying block
634          * device is theoretically writable. */
635         if (fs_on_read_only(root) > 0) {
636                 log_info("Disabling readahead collector due to read-only media.");
        /* Low-memory check itself is elided; only the message remains. */
641                 log_info("Disabling readahead collector due to low memory.");
645         shared = shared_get();
        /* Publish our pid, with a barrier so the replay process sees it
         * before it starts opening files (collect() filters on it). */
649         shared->collect = getpid();
650         __sync_synchronize();
652         if (collect(root) < 0)