1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include <systemd/sd-daemon.h>
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
/* Shared memory segment used to exchange pids with the readahead replay
 * process: main_collect() publishes our pid in shared->collect, and the
 * event loop in collect() consults shared->replay to ignore opens done
 * by the replayer. */
74 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; used to
 * assign each recorded file to a coarse access-time bin. */
75 static usec_t starttime;
77 /* Avoid collisions with the NULL pointer */
/* Map a sector number to/from a pointer value (presumably for storage
 * as a hashmap value — usage not visible in this view); the +1 offset
 * keeps sector 0 from becoming a NULL pointer. */
78 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
79 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Request an online defragmentation of the file referred to by fd,
 * via the btrfs BTRFS_IOC_DEFRAG ioctl.
 *
 * Returns: 0 on success, -1 with errno set on failure (ioctl()
 * semantics; e.g. EBADF for an invalid fd, ENOTTY on non-btrfs). */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/* Append one file's readahead record to the open pack file: the file's
 * inode number followed by the list of page ranges currently resident
 * in the page cache (determined with mincore()), terminated by an
 * empty range.
 *
 * NOTE(review): this span has lines elided (jumps in the embedded line
 * numbers); declarations, several braces and error/cleanup branches
 * are not visible here, so comments below only describe visible code. */
87 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
89 void *start = MAP_FAILED;
95 int r = 0, fd = -1, k;
/* O_NOFOLLOW: don't follow symlinks; O_NOATIME: don't perturb the
 * access time we are trying to observe. */
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* EPERM/EACCES are expected for files we may not read — presumably
 * skipped without a warning (branch body elided). */
106 if (errno == EPERM || errno == EACCES)
109 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the file (size limit etc.) before mapping it. */
114 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file read-only so mincore() can report which of its
 * pages are currently in core. */
123 l = PAGE_ALIGN(st.st_size);
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
126 log_warning("mmap(%s) failed: %m", fn);
/* One vector byte per page; bit 0 set == page resident in core. */
131 pages = l / page_size();
132 vec = alloca0(pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the residency vector, emitting each maximal run [b, c) of
 * resident pages as a start/end pair. */
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
150 if (!mapped && new_mapped)
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
156 log_debug("%s: page %u to %u", fn, b, c);
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
167 log_debug("%s: page %u to %u", fn, b, c);
/* Terminate the range list with an empty range (same value written
 * twice) — presumably b is reset before this point; intervening lines
 * are elided, confirm against the full source. */
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if mapped (munmap call elided) and close the fd. */
176 if (start != MAP_FAILED)
180 close_nointr_nofail(fd);
/* Return the physical (on-device) byte offset of the file's first
 * extent, as reported by the FS_IOC_FIEMAP ioctl. Used to sort files
 * by disk location on rotating media. The return statements of the
 * three error branches below are elided from this view — presumably
 * they return 0.
 *
 * NOTE(review): the two declarations below look like the members of a
 * single on-stack buffer ("data": a struct fiemap immediately followed
 * by room for one struct fiemap_extent), initialized with the
 * designated initializers at lines 190-191; the wrapper struct
 * definition itself is elided. Confirm against the full source. */
185 static unsigned long fd_first_block(int fd) {
187 struct fiemap fiemap;
188 struct fiemap_extent extent;
/* Map the entire file (length ~0) but request at most one extent. */
190 .fiemap.fm_length = ~0ULL,
191 .fiemap.fm_extent_count = 1,
194 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
/* No extents mapped: nothing to sort by. */
197 if (data.fiemap.fm_mapped_extents <= 0)
/* An extent whose physical location the filesystem cannot report is
 * useless for disk-order sorting. */
200 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
203 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: order primarily by access-time
 * bin (the bin comparison lines are elided from this view), then by
 * physical block (disk location), and finally by path so the order is
 * total and deterministic.
 * NOTE(review): the assignments of i/j from a/b are elided here. */
212 static int qsort_compare(const void *a, const void *b) {
213 const struct item *i, *j;
218 /* sort by bin first */
224 /* then sort by sector */
225 if (i->block < j->block)
227 if (i->block > j->block)
/* Same bin and same block: fall back to lexicographic path order. */
230 return strcmp(i->path, j->path);
/* Watch the file system below "root" with fanotify for the collection
 * phase of boot (bounded by arg_timeout), record which files get
 * opened (and where their data lives on disk), then write the result
 * as a pack file "<root>/.readahead" for later replay. Collection can
 * be ended early via SIGINT/SIGTERM, or by creating "cancel"/"done"
 * in /run/systemd/readahead (observed via inotify). Returns 0 on
 * success, negative on error (error paths largely elided).
 *
 * NOTE(review): this span has many lines elided (jumps in the embedded
 * line numbers): enum and variable declarations, loop constructs,
 * braces and goto labels are not all visible. Comments describe only
 * what the visible lines establish. */
233 static int collect(const char *root) {
/* Indices into pollfd[] below; presumably an enum ending in _FD_MAX,
 * with FD_SIGNAL declared between these two (its line is elided). */
235 FD_FANOTIFY, /* Get the actual fs events */
237 FD_INOTIFY, /* We get notifications to quit early via this fd */
240 struct pollfd pollfd[_FD_MAX] = {};
241 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
/* Map of path -> struct item for every interesting file we saw opened. */
243 Hashmap *files = NULL;
248 char *pack_fn_new = NULL, *pack_fn = NULL;
249 bool on_ssd, on_btrfs;
252 uint64_t previous_block_readahead;
253 bool previous_block_readahead_set = false;
257 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
262 starttime = now(CLOCK_MONOTONIC);
264 /* If there's no pack file yet we lower the kernel readahead
265 * so that mincore() is accurate. If there is a pack file
266 * already we assume it is accurate enough so that kernel
267 * readahead is never triggered. */
268 previous_block_readahead_set =
269 access(pack_fn, F_OK) < 0 &&
270 block_get_readahead(root, &previous_block_readahead) >= 0 &&
271 block_set_readahead(root, 8*1024) >= 0;
/* Run with idle I/O priority so collection doesn't compete with boot. */
273 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
274 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and take them via a signalfd instead, so they
 * can be multiplexed in the poll() loop below. */
276 assert_se(sigemptyset(&mask) == 0);
277 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
278 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
281 log_error("signalfd(): %m");
286 files = hashmap_new(string_hash_func, string_compare_func);
288 log_error("Failed to allocate set.");
/* Watch every open() on the whole mount containing root. */
293 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
294 if (fanotify_fd < 0) {
295 log_error("Failed to create fanotify object: %m");
300 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
301 log_error("Failed to mark %s: %m", root);
/* inotify watch for the early cancel/done request files (see below). */
306 inotify_fd = open_inotify();
307 if (inotify_fd < 0) {
/* Hard deadline for the collection phase. */
312 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
316 pollfd[FD_FANOTIFY].fd = fanotify_fd;
317 pollfd[FD_FANOTIFY].events = POLLIN;
318 pollfd[FD_SIGNAL].fd = signal_fd;
319 pollfd[FD_SIGNAL].events = POLLIN;
320 pollfd[FD_INOTIFY].fd = inotify_fd;
321 pollfd[FD_INOTIFY].events = POLLIN;
/* Presumably the tail of an sd_notify() status call (leading
 * arguments elided from this view). */
325 "STATUS=Collecting readahead data");
327 log_debug("Collecting...");
/* Catch requests that were filed before the inotify watch existed. */
329 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
330 log_debug("Collection canceled");
335 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
336 log_debug("Got termination request");
/* Main event loop (the loop construct itself is elided). The buffer
 * below presumably holds one fanotify read's worth of events. */
342 struct fanotify_event_metadata metadata;
346 struct fanotify_event_metadata *m;
/* Stop once we tracked enough files... */
350 if (hashmap_size(files) > arg_files_max) {
351 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ...or once the deadline passed. */
355 t = now(CLOCK_MONOTONIC);
356 if (t >= not_after) {
357 log_debug("Reached maximum collection time, ending collection.");
/* Wait for events, but never beyond the deadline. */
361 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
366 log_error("poll(): %m");
372 log_debug("Reached maximum collection time, ending collection.");
376 if (pollfd[FD_SIGNAL].revents) {
377 log_debug("Got signal.");
/* Handle "cancel"/"done" files created in /run/systemd/readahead. */
381 if (pollfd[FD_INOTIFY].revents) {
382 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
383 struct inotify_event *e;
385 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
386 if (errno == EINTR || errno == EAGAIN)
389 log_error("Failed to read inotify event: %m");
/* Walk all events packed into the buffer (loop condition elided). */
394 e = (struct inotify_event*) inotify_buffer;
398 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
399 log_debug("Collection canceled");
404 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
405 log_debug("Got termination request");
409 step = sizeof(struct inotify_event) + e->len;
410 assert(step <= (size_t) n);
412 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain the fanotify event queue. */
417 n = read(fanotify_fd, &data, sizeof(data));
420 if (errno == EINTR || errno == EAGAIN)
423 /* fanotify sometimes returns EACCES on read()
424 * where it shouldn't. For now let's just
425 * ignore it here (which is safe), but
426 * eventually this should be
427 * dropped when the kernel is fixed.
429 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
433 log_error("Failed to read event: %m");
438 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
439 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
/* Ignore opens performed by ourselves... */
445 if (m->pid == my_pid)
/* ...and by the replay process, whose pid is published in the shared
 * memory segment; the barrier orders the read against its writer. */
448 __sync_synchronize();
449 if (m->pid == shared->replay)
/* Resolve the fd fanotify handed us to a path via /proc/self/fd. */
452 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
453 k = readlink_malloc(fn, &p);
/* Skip temporaries, already-deleted files and duplicates. */
455 if (startswith(p, "/tmp") ||
456 endswith(p, " (deleted)") ||
457 hashmap_get(files, p))
458 /* Not interesting, or
466 entry = new0(struct item, 1);
/* Record the physical location of the file's first block and a coarse
 * time bin (2-second granularity) for later sort ordering. */
472 ul = fd_first_block(m->fd);
474 entrytime = now(CLOCK_MONOTONIC);
477 entry->path = strdup(p);
483 entry->bin = (entrytime - starttime) / 2000000;
485 k = hashmap_put(files, p, entry);
487 log_warning("hashmap_put() failed: %s", strerror(-k));
493 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the event fd, or we would leak one per event. */
497 close_nointr_nofail(m->fd);
/* Collection finished — stop listening before writing the pack file,
 * so our own writes don't show up as events. */
502 if (fanotify_fd >= 0) {
503 close_nointr_nofail(fanotify_fd);
507 log_debug("Writing Pack File...");
509 on_ssd = fs_on_ssd(root) > 0;
510 log_debug("On SSD: %s", yes_no(on_ssd));
512 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
513 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary ".readahead.new" first and rename into place
 * only when complete, so readers never see a half-written pack file. */
515 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
520 pack = fopen(pack_fn_new, "we");
522 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus a one-byte medium tag,
 * 'S'olid-state or 'R'otating. */
527 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
528 putc(on_ssd ? 'S' : 'R', pack);
530 if (on_ssd || on_btrfs) {
532 /* On SSD or on btrfs, just write things out in the
533 * order the files were accessed. */
535 HASHMAP_FOREACH_KEY(q, p, files, i)
536 pack_file(pack, p, on_btrfs);
540 /* On rotating media, order things by the block
543 log_debug("Ordering...");
545 n = hashmap_size(files);
547 _cleanup_free_ struct item *ordered;
551 ordered = new(struct item, n);
/* Flatten the hashmap into a contiguous array for qsort(). */
558 HASHMAP_FOREACH_KEY(q, p, files, i) {
559 memcpy(j, q, sizeof(struct item));
563 assert(ordered + n == j);
565 qsort(ordered, n, sizeof(struct item), qsort_compare);
567 for (k = 0; k < n; k++)
568 pack_file(pack, ordered[k].path, on_btrfs);
570 log_warning("No pack files");
573 log_debug("Finalizing...");
578 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
583 if (rename(pack_fn_new, pack_fn) < 0) {
584 log_error("Failed to rename readahead file: %m");
/* Cleanup (goto labels elided): close remaining fds and free the
 * hashmap contents. */
595 if (fanotify_fd >= 0)
596 close_nointr_nofail(fanotify_fd);
599 close_nointr_nofail(signal_fd);
602 close_nointr_nofail(inotify_fd);
611 while ((p = hashmap_steal_first_key(files)))
616 if (previous_block_readahead_set) {
619 /* Restore the original kernel readahead setting if we
620 * changed it, and nobody has overwritten it since
622 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
623 block_set_readahead(root, previous_block_readahead);
/* Entry point for the "collect" verb: bail out in situations where
 * collection makes no sense (read-only media; a low-memory check is
 * partially visible at line 645, its condition elided), publish our
 * pid in the shared memory segment so the replay side can ignore our
 * opens, then run collect().
 * NOTE(review): the tail of this function lies beyond this view. */
629 int main_collect(const char *root) {
634 /* Skip this step on read-only media. Note that we check the
635 * underlying block device here, not the read-only flag of the
636 * file system on top, since that one is most likely mounted
637 * read-only anyway at boot, even if the underlying block
638 * device is theoretically writable. */
639 if (fs_on_read_only(root) > 0) {
640 log_info("Disabling readahead collector due to read-only media.");
645 log_info("Disabling readahead collector due to low memory.");
649 shared = shared_get();
/* Publish our pid before collection starts; the full memory barrier
 * makes the store visible to the replay process, which filters out
 * fanotify events caused by this pid. */
653 shared->collect = getpid();
654 __sync_synchronize();
656 if (collect(root) < 0)