src/readahead/readahead-collect.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2010 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <inttypes.h>
  24 #include <fcntl.h>
  25 #include <linux/limits.h>
  26 #include <stdbool.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/select.h>
  31 #include <sys/time.h>
  32 #include <sys/types.h>
  33 #include <sys/stat.h>
  34 #include <unistd.h>
  35 #include <linux/fanotify.h>
  36 #include <sys/signalfd.h>
  37 #include <sys/poll.h>
  38 #include <sys/mman.h>
  39 #include <linux/fs.h>
  40 #include <linux/fiemap.h>
  41 #include <sys/ioctl.h>
  42 #include <sys/vfs.h>
  43 #include <getopt.h>
  44 #include <sys/inotify.h>
  45 #include <math.h>
  46
  47 #ifdef HAVE_LINUX_BTRFS_H
  48 #include <linux/btrfs.h>
  49 #endif
  50
  51 #ifdef HAVE_FANOTIFY_INIT
  52 #include <sys/fanotify.h>
  53 #endif
  54
  55 #include <systemd/sd-daemon.h>
  56
  57 #include "missing.h"
  58 #include "util.h"
  59 #include "set.h"
  60 #include "ioprio.h"
  61 #include "readahead-common.h"
  62 #include "virt.h"
  63
  64 /* fixme:
  65  *
  66  * - detect ssd on btrfs/lvm...
  67  * - read ahead directories
  68  * - gzip?
  69  * - remount rw?
  70  * - handle files where nothing is in mincore
  71  * - does ioprio_set work with fadvise()?
  72  */
  73
  74 static ReadaheadShared *shared = NULL;
  75 static usec_t starttime;
  76
  77 /* Avoid collisions with the NULL pointer */
  78 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
  79 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
  80
  81 static int btrfs_defrag(int fd) {
  82         struct btrfs_ioctl_vol_args data = { .fd = fd };
  83
  84         return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
  85 }
  86
  87 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
  88         struct stat st;
  89         void *start = MAP_FAILED;
  90         uint8_t *vec;
  91         uint32_t b, c;
  92         uint64_t inode;
  93         size_t l, pages;
  94         bool mapped;
  95         int r = 0, fd = -1, k;
  96
  97         assert(pack);
  98         assert(fn);
  99
 100         fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
 101         if (fd < 0) {
 102
 103                 if (errno == ENOENT)
 104                         return 0;
 105
 106                 if (errno == EPERM || errno == EACCES)
 107                         return 0;
 108
 109                 log_warning("open(%s) failed: %m", fn);
 110                 r = -errno;
 111                 goto finish;
 112         }
 113
 114         k = file_verify(fd, fn, arg_file_size_max, &st);
 115         if (k <= 0) {
 116                 r = k;
 117                 goto finish;
 118         }
 119
 120         if (on_btrfs)
 121                 btrfs_defrag(fd);
 122
 123         l = PAGE_ALIGN(st.st_size);
 124         start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
 125         if (start == MAP_FAILED) {
 126                 log_warning("mmap(%s) failed: %m", fn);
 127                 r = -errno;
 128                 goto finish;
 129         }
 130
 131         pages = l / page_size();
 132         vec = alloca(pages);
 133         memset(vec, 0, pages);
 134         if (mincore(start, l, vec) < 0) {
 135                 log_warning("mincore(%s) failed: %m", fn);
 136                 r = -errno;
 137                 goto finish;
 138         }
 139
 140         fputs(fn, pack);
 141         fputc('\n', pack);
 142
 143         /* Store the inode, so that we notice when the file is deleted */
 144         inode = (uint64_t) st.st_ino;
 145         fwrite(&inode, sizeof(inode), 1, pack);
 146
 147         mapped = false;
 148         for (c = 0; c < pages; c++) {
 149                 bool new_mapped = !!(vec[c] & 1);
 150
 151                 if (!mapped && new_mapped)
 152                         b = c;
 153                 else if (mapped && !new_mapped) {
 154                         fwrite(&b, sizeof(b), 1, pack);
 155                         fwrite(&c, sizeof(c), 1, pack);
 156
 157                         log_debug("%s: page %u to %u", fn, b, c);
 158                 }
 159
 160                 mapped = new_mapped;
 161         }
 162
 163         /* We don't write any range data if we should read the entire file */
 164         if (mapped && b > 0) {
 165                 fwrite(&b, sizeof(b), 1, pack);
 166                 fwrite(&c, sizeof(c), 1, pack);
 167
 168                 log_debug("%s: page %u to %u", fn, b, c);
 169         }
 170
 171         /* End marker */
 172         b = 0;
 173         fwrite(&b, sizeof(b), 1, pack);
 174         fwrite(&b, sizeof(b), 1, pack);
 175
 176 finish:
 177         if (start != MAP_FAILED)
 178                 munmap(start, l);
 179
 180         if (fd >= 0)
 181                 close_nointr_nofail(fd);
 182
 183         return r;
 184 }
 185
 186 static unsigned long fd_first_block(int fd) {
 187         struct {
 188                 struct fiemap fiemap;
 189                 struct fiemap_extent extent;
 190         } data = {
 191                 .fiemap.fm_length = ~0ULL,
 192                 .fiemap.fm_extent_count = 1,
 193         };
 194
 195         if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
 196                 return 0;
 197
 198         if (data.fiemap.fm_mapped_extents <= 0)
 199                 return 0;
 200
 201         if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
 202                 return 0;
 203
 204         return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
 205 }
 206
 207 struct item {
 208         const char *path;
 209         unsigned long block;
 210         unsigned long bin;
 211 };
 212
 213 static int qsort_compare(const void *a, const void *b) {
 214         const struct item *i, *j;
 215
 216         i = a;
 217         j = b;
 218
 219         /* sort by bin first */
 220         if (i->bin < j->bin)
 221                 return -1;
 222         if (i->bin > j->bin)
 223                 return 1;
 224
 225         /* then sort by sector */
 226         if (i->block < j->block)
 227                 return -1;
 228         if (i->block > j->block)
 229                 return 1;
 230
 231         return strcmp(i->path, j->path);
 232 }
 233
 234 static int collect(const char *root) {
 235         enum {
 236                 FD_FANOTIFY,  /* Get the actual fs events */
 237                 FD_SIGNAL,
 238                 FD_INOTIFY,   /* We get notifications to quit early via this fd */
 239                 _FD_MAX
 240         };
 241         struct pollfd pollfd[_FD_MAX] = {};
 242         int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
 243         pid_t my_pid;
 244         Hashmap *files = NULL;
 245         Iterator i;
 246         char *p, *q;
 247         sigset_t mask;
 248         FILE *pack = NULL;
 249         char *pack_fn_new = NULL, *pack_fn = NULL;
 250         bool on_ssd, on_btrfs;
 251         struct statfs sfs;
 252         usec_t not_after;
 253         uint64_t previous_block_readahead;
 254         bool previous_block_readahead_set = false;
 255
 256         assert(root);
 257
 258         if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
 259                 r = log_oom();
 260                 goto finish;
 261         }
 262
 263         starttime = now(CLOCK_MONOTONIC);
 264
 265         /* If there's no pack file yet we lower the kernel readahead
 266          * so that mincore() is accurate. If there is a pack file
 267          * already we assume it is accurate enough so that kernel
 268          * readahead is never triggered. */
 269         previous_block_readahead_set =
 270                 access(pack_fn, F_OK) < 0 &&
 271                 block_get_readahead(root, &previous_block_readahead) >= 0 &&
 272                 block_set_readahead(root, 8*1024) >= 0;
 273
 274         if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
 275                 log_warning("Failed to set IDLE IO priority class: %m");
 276
 277         assert_se(sigemptyset(&mask) == 0);
 278         sigset_add_many(&mask, SIGINT, SIGTERM, -1);
 279         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
 280
 281         if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
 282                 log_error("signalfd(): %m");
 283                 r = -errno;
 284                 goto finish;
 285         }
 286
 287         files = hashmap_new(string_hash_func, string_compare_func);
 288         if (!files) {
 289                 log_error("Failed to allocate set.");
 290                 r = -ENOMEM;
 291                 goto finish;
 292         }
 293
 294         fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
 295         if (fanotify_fd < 0)  {
 296                 log_error("Failed to create fanotify object: %m");
 297                 r = -errno;
 298                 goto finish;
 299         }
 300
 301         if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
 302                 log_error("Failed to mark %s: %m", root);
 303                 r = -errno;
 304                 goto finish;
 305         }
 306
 307         inotify_fd = open_inotify();
 308         if (inotify_fd < 0) {
 309                 r = inotify_fd;
 310                 goto finish;
 311         }
 312
 313         not_after = now(CLOCK_MONOTONIC) + arg_timeout;
 314
 315         my_pid = getpid();
 316
 317         pollfd[FD_FANOTIFY].fd = fanotify_fd;
 318         pollfd[FD_FANOTIFY].events = POLLIN;
 319         pollfd[FD_SIGNAL].fd = signal_fd;
 320         pollfd[FD_SIGNAL].events = POLLIN;
 321         pollfd[FD_INOTIFY].fd = inotify_fd;
 322         pollfd[FD_INOTIFY].events = POLLIN;
 323
 324         sd_notify(0,
 325                   "READY=1\n"
 326                   "STATUS=Collecting readahead data");
 327
 328         log_debug("Collecting...");
 329
 330         if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
 331                 log_debug("Collection canceled");
 332                 r = -ECANCELED;
 333                 goto finish;
 334         }
 335
 336         if (access("/run/systemd/readahead/done", F_OK) >= 0) {
 337                 log_debug("Got termination request");
 338                 goto done;
 339         }
 340
 341         for (;;) {
 342                 union {
 343                         struct fanotify_event_metadata metadata;
 344                         char buffer[4096];
 345                 } data;
 346                 ssize_t n;
 347                 struct fanotify_event_metadata *m;
 348                 usec_t t;
 349                 int h;
 350
 351                 if (hashmap_size(files) > arg_files_max) {
 352                         log_debug("Reached maximum number of read ahead files, ending collection.");
 353                         break;
 354                 }
 355
 356                 t = now(CLOCK_MONOTONIC);
 357                 if (t >= not_after) {
 358                         log_debug("Reached maximum collection time, ending collection.");
 359                         break;
 360                 }
 361
 362                 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
 363
 364                         if (errno == EINTR)
 365                                 continue;
 366
 367                         log_error("poll(): %m");
 368                         r = -errno;
 369                         goto finish;
 370                 }
 371
 372                 if (h == 0) {
 373                         log_debug("Reached maximum collection time, ending collection.");
 374                         break;
 375                 }
 376
 377                 if (pollfd[FD_SIGNAL].revents) {
 378                         log_debug("Got signal.");
 379                         break;
 380                 }
 381
 382                 if (pollfd[FD_INOTIFY].revents) {
 383                         uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
 384                         struct inotify_event *e;
 385
 386                         if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
 387                                 if (errno == EINTR || errno == EAGAIN)
 388                                         continue;
 389
 390                                 log_error("Failed to read inotify event: %m");
 391                                 r = -errno;
 392                                 goto finish;
 393                         }
 394
 395                         e = (struct inotify_event*) inotify_buffer;
 396                         while (n > 0) {
 397                                 size_t step;
 398
 399                                 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
 400                                         log_debug("Collection canceled");
 401                                         r = -ECANCELED;
 402                                         goto finish;
 403                                 }
 404
 405                                 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
 406                                         log_debug("Got termination request");
 407                                         goto done;
 408                                 }
 409
 410                                 step = sizeof(struct inotify_event) + e->len;
 411                                 assert(step <= (size_t) n);
 412
 413                                 e = (struct inotify_event*) ((uint8_t*) e + step);
 414                                 n -= step;
 415                         }
 416                 }
 417
 418                 n = read(fanotify_fd, &data, sizeof(data));
 419                 if (n < 0) {
 420
 421                         if (errno == EINTR || errno == EAGAIN)
 422                                 continue;
 423
 424                         /* fanotify sometimes returns EACCES on read()
 425                          * where it shouldn't. For now let's just
 426                          * ignore it here (which is safe), but
 427                          * eventually this should be
 428                          * dropped when the kernel is fixed.
 429                          *
 430                          * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
 431                         if (errno == EACCES)
 432                                 continue;
 433
 434                         log_error("Failed to read event: %m");
 435                         r = -errno;
 436                         goto finish;
 437                 }
 438
 439                 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
 440                         char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
 441                         int k;
 442
 443                         if (m->fd < 0)
 444                                 goto next_iteration;
 445
 446                         if (m->pid == my_pid)
 447                                 goto next_iteration;
 448
 449                         __sync_synchronize();
 450                         if (m->pid == shared->replay)
 451                                 goto next_iteration;
 452
 453                         snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
 454                         k = readlink_malloc(fn, &p);
 455                         if (k >= 0) {
 456                                 if (startswith(p, "/tmp") ||
 457                                     endswith(p, " (deleted)") ||
 458                                     hashmap_get(files, p))
 459                                         /* Not interesting, or
 460                                          * already read */
 461                                         free(p);
 462                                 else {
 463                                         unsigned long ul;
 464                                         usec_t entrytime;
 465                                         struct item *entry;
 466
 467                                         entry = new0(struct item, 1);
 468                                         if (!entry) {
 469                                                 r = log_oom();
 470                                                 goto finish;
 471                                         }
 472
 473                                         ul = fd_first_block(m->fd);
 474
 475                                         entrytime = now(CLOCK_MONOTONIC);
 476
 477                                         entry->block = ul;
 478                                         entry->path = strdup(p);
 479                                         if (!entry->path) {
 480                                                 free(entry);
 481                                                 r = log_oom();
 482                                                 goto finish;
 483                                         }
 484                                         entry->bin = (entrytime - starttime) / 2000000;
 485
 486                                         k = hashmap_put(files, p, entry);
 487                                         if (k < 0) {
 488                                                 log_warning("hashmap_put() failed: %s", strerror(-k));
 489                                                 free(p);
 490                                         }
 491                                 }
 492
 493                         } else
 494                                 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
 495
 496                 next_iteration:
 497                         if (m->fd >= 0)
 498                                 close_nointr_nofail(m->fd);
 499                 }
 500         }
 501
 502 done:
 503         if (fanotify_fd >= 0) {
 504                 close_nointr_nofail(fanotify_fd);
 505                 fanotify_fd = -1;
 506         }
 507
 508         log_debug("Writing Pack File...");
 509
 510         on_ssd = fs_on_ssd(root) > 0;
 511         log_debug("On SSD: %s", yes_no(on_ssd));
 512
 513         on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
 514         log_debug("On btrfs: %s", yes_no(on_btrfs));
 515
 516         if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
 517                 r = log_oom();
 518                 goto finish;
 519         }
 520
 521         pack = fopen(pack_fn_new, "we");
 522         if (!pack) {
 523                 log_error("Failed to open pack file: %m");
 524                 r = -errno;
 525                 goto finish;
 526         }
 527
 528         fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
 529         putc(on_ssd ? 'S' : 'R', pack);
 530
 531         if (on_ssd || on_btrfs) {
 532
 533                 /* On SSD or on btrfs, just write things out in the
 534                  * order the files were accessed. */
 535
 536                 HASHMAP_FOREACH_KEY(q, p, files, i)
 537                         pack_file(pack, p, on_btrfs);
 538         } else {
 539                 unsigned n;
 540
 541                 /* On rotating media, order things by the block
 542                  * numbers */
 543
 544                 log_debug("Ordering...");
 545
 546                 n = hashmap_size(files);
 547                 if (n) {
 548                         _cleanup_free_ struct item *ordered;
 549                         struct item *j;
 550                         unsigned k;
 551
 552                         ordered = new(struct item, n);
 553                         if (!ordered) {
 554                                 r = log_oom();
 555                                 goto finish;
 556                         }
 557
 558                         j = ordered;
 559                         HASHMAP_FOREACH_KEY(q, p, files, i) {
 560                                 memcpy(j, q, sizeof(struct item));
 561                                 j++;
 562                         }
 563
 564                         assert(ordered + n == j);
 565
 566                         qsort(ordered, n, sizeof(struct item), qsort_compare);
 567
 568                         for (k = 0; k < n; k++)
 569                                 pack_file(pack, ordered[k].path, on_btrfs);
 570                 } else
 571                         log_warning("No pack files");
 572         }
 573
 574         log_debug("Finalizing...");
 575
 576         fflush(pack);
 577
 578         if (ferror(pack)) {
 579                 log_error("Failed to write pack file.");
 580                 r = -EIO;
 581                 goto finish;
 582         }
 583
 584         if (rename(pack_fn_new, pack_fn) < 0) {
 585                 log_error("Failed to rename readahead file: %m");
 586                 r = -errno;
 587                 goto finish;
 588         }
 589
 590         fclose(pack);
 591         pack = NULL;
 592
 593         log_debug("Done.");
 594
 595 finish:
 596         if (fanotify_fd >= 0)
 597                 close_nointr_nofail(fanotify_fd);
 598
 599         if (signal_fd >= 0)
 600                 close_nointr_nofail(signal_fd);
 601
 602         if (inotify_fd >= 0)
 603                 close_nointr_nofail(inotify_fd);
 604
 605         if (pack) {
 606                 fclose(pack);
 607                 unlink(pack_fn_new);
 608         }
 609         free(pack_fn_new);
 610         free(pack_fn);
 611
 612         while ((p = hashmap_steal_first_key(files)))
 613                 free(p);
 614
 615         hashmap_free(files);
 616
 617         if (previous_block_readahead_set) {
 618                 uint64_t bytes;
 619
 620                 /* Restore the original kernel readahead setting if we
 621                  * changed it, and nobody has overwritten it since
 622                  * yet. */
 623                 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
 624                         block_set_readahead(root, previous_block_readahead);
 625         }
 626
 627         return r;
 628 }
 629
 630 int main_collect(const char *root) {
 631
 632         if (!root)
 633                 root = "/";
 634
 635         /* Skip this step on read-only media. Note that we check the
 636          * underlying block device here, not he read-only flag of the
 637          * file system on top, since that one is most likely mounted
 638          * read-only anyway at boot, even if the underlying block
 639          * device is theoretically writable. */
 640         if (fs_on_read_only(root) > 0) {
 641                 log_info("Disabling readahead collector due to read-only media.");
 642                 return EXIT_SUCCESS;
 643         }
 644
 645         if (!enough_ram()) {
 646                 log_info("Disabling readahead collector due to low memory.");
 647                 return EXIT_SUCCESS;
 648         }
 649
 650         shared = shared_get();
 651         if (!shared)
 652                 return EXIT_FAILURE;
 653
 654         shared->collect = getpid();
 655         __sync_synchronize();
 656
 657         if (collect(root) < 0)
 658                 return EXIT_FAILURE;
 659
 660         return EXIT_SUCCESS;
 661 }