/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
/* TODO:
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */
/* Shared memory segment used to coordinate with the readahead replay
 * process (its pid fields are consulted in collect() to skip self-made
 * events). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp of collection start; collect() buckets each
 * recorded file into a time bin relative to this. */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Sector values are stored directly as pointer-sized hashmap values;
 * offset by one so that sector 0 does not become a NULL value. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file behind fd, so that its data ends up
 * in one contiguous run on disk. Returns the raw ioctl() result: 0 on
 * success, -1 with errno set on failure. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args args;

        /* Zero the whole argument block, then point it at our fd. */
        memset(&args, 0, sizeof(args));
        args.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &args);
}
83 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
/* Append one file's readahead record to the pack stream: the file's inode
 * number followed by (start, end) page-index pairs for every run of pages
 * currently resident in the page cache according to mincore().
 * NOTE(review): this is an elided extract — several declarations and the
 * bodies of some branches are missing between the numbered lines below. */
85 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
/* Open without following symlinks, without a controlling tty, and without
 * updating atime; a single unreadable file must not abort the run. */
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission errors are expected for some paths — skip those quietly. */
102 if (errno == EPERM || errno == EACCES)
105 log_warning("open(%s) failed: %m", fn);
/* Verify the file is suitable (bounded by arg_file_size_max) and fill st. */
110 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the page-aligned length read-only so mincore() can inspect it. */
119 l = PAGE_ALIGN(st.st_size);
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
122 log_warning("mmap(%s) failed: %m", fn);
/* One status byte per page in vec; bit 0 set means the page is resident. */
127 pages = l / page_size();
129 memset(vec, 0, pages);
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
/* Scan the residency vector, emitting each maximal run of cached pages as
 * a (first, one-past-last) pair. NOTE(review): the branch body that records
 * the run start (presumably b = c) is elided here — confirm upstream. */
144 for (c = 0; c < pages; c++) {
145 bool new_mapped = !!(vec[c] & 1);
147 if (!mapped && new_mapped)
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
153 log_debug("%s: page %u to %u", fn, b, c);
159 /* We don't write any range data if we should read the entire file */
/* Flush a trailing run that was still open when the scan ended. */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
164 log_debug("%s: page %u to %u", fn, b, c);
/* Terminator pair for this record (two identical values). NOTE(review):
 * elided lines may reset b before this — verify against the pack reader. */
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if the mapping succeeded, then close the fd. */
173 if (start != MAP_FAILED)
177 close_nointr_nofail(fd);
/* Return the physical byte offset on disk of the first extent of the file
 * behind fd, or 0 when it cannot be determined (probe failure, no extents,
 * or the file system reports the location as unknown). Used to order files
 * by on-disk position for rotating media. */
static unsigned long fd_first_block(int fd) {
        /* The fiemap ioctl writes fm_extent_count extents into the flexible
         * array at the end of struct fiemap; reserve room for exactly one. */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } probe = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };

        /* All three "don't know" cases collapse to returning 0. */
        if (ioctl(fd, FS_IOC_FIEMAP, &probe) < 0 ||
            probe.fiemap.fm_mapped_extents <= 0 ||
            (probe.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) probe.fiemap.fm_extents[0].fe_physical;
}
/* qsort() callback for struct item: order primarily by time bin, then by
 * first physical block, and finally by path so the order is total and
 * deterministic. NOTE(review): the assignments of a/b into i/j and the
 * bin-comparison lines are elided in this extract. */
209 static int qsort_compare(const void *a, const void *b) {
210 const struct item *i, *j;
215 /* sort by bin first */
221 /* then sort by sector */
222 if (i->block < j->block)
224 if (i->block > j->block)
/* Same bin and block: fall back to lexicographic path comparison. */
227 return strcmp(i->path, j->path);
230 static int collect(const char *root) {
/* Core collection pass: watch the mount below 'root' with fanotify, record
 * every file opened during boot (path, first disk block, time bin), then
 * write the ".readahead" pack file consumed later by the replay process.
 * NOTE(review): this extract is heavily elided — declarations, braces,
 * gotos and error paths are missing between the numbered lines below. */
232 FD_FANOTIFY, /* Get the actual fs events */
234 FD_INOTIFY, /* We get notifications to quit early via this fd */
237 struct pollfd pollfd[_FD_MAX] = {};
238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
240 Hashmap *files = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
246 bool on_ssd, on_btrfs;
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
/* Reference point for the per-file time bins computed below. */
259 starttime = now(CLOCK_MONOTONIC);
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection does not compete with boot. */
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
/* Route SIGINT/SIGTERM through a signalfd so they can be poll()ed. */
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
/* Map path -> struct item for every file observed during collection. */
283 files = hashmap_new(string_hash_func, string_compare_func);
285 log_error("Failed to allocate set.");
/* Watch every open() on the whole mount that contains 'root'. */
290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
291 if (fanotify_fd < 0) {
292 log_error("Failed to create fanotify object: %m");
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
/* Separate inotify fd for the cancel/done flag files checked below. */
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
/* Hard deadline for the whole collection pass. */
309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
322 "STATUS=Collecting readahead data");
324 log_debug("Collecting...");
/* A flag file may already exist from before we started watching —
 * check both once up front to avoid a missed-wakeup race. */
326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
327 log_debug("Collection canceled");
332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
333 log_debug("Got termination request");
/* Main event loop: one fanotify read buffer per iteration. */
339 struct fanotify_event_metadata metadata;
343 struct fanotify_event_metadata *m;
347 if (hashmap_size(files) > arg_files_max) {
348 log_debug("Reached maximum number of read ahead files, ending collection.");
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
/* Block until an event arrives or the deadline expires. */
358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
363 log_error("poll(): %m");
369 log_debug("Reached maximum collection time, ending collection.");
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
/* inotify tells us when one of the flag files is created. */
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
386 log_error("Failed to read inotify event: %m");
/* Walk the variable-length event records packed in the buffer. */
391 e = (struct inotify_event*) inotify_buffer;
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
409 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain the fanotify queue. */
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
416 if (errno == EINTR || errno == EAGAIN)
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
429 log_error("Failed to read event: %m");
434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Skip events caused by ourselves or by the replay process, whose
 * pid is read from the shared memory segment. */
441 if (m->pid == my_pid)
444 __sync_synchronize();
445 if (m->pid == shared->replay)
/* Resolve the event's fd to a path via /proc/self/fd. */
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
452 if (startswith(p, "/tmp") ||
453 endswith(p, " (deleted)") ||
454 hashmap_get(files, p))
455 /* Not interesting, or
/* New file: record path, first physical block and a time bin
 * (2000000 usec = 2 s buckets since starttime). */
463 entry = new0(struct item, 1);
469 ul = fd_first_block(m->fd);
471 entrytime = now(CLOCK_MONOTONIC);
474 entry->path = strdup(p);
480 entry->bin = (entrytime - starttime) / 2000000;
482 k = hashmap_put(files, p, entry);
484 log_warning("hashmap_put() failed: %s", strerror(-k));
490 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the fd fanotify handed us, or we would leak it. */
494 close_nointr_nofail(m->fd);
/* Done collecting; stop watching before we write the pack file so our
 * own writes do not generate further events. */
499 if (fanotify_fd >= 0) {
500 close_nointr_nofail(fanotify_fd);
504 log_debug("Writing Pack File...");
506 on_ssd = fs_on_ssd(root) > 0;
507 log_debug("On SSD: %s", yes_no(on_ssd));
509 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_CMP(sfs.f_type, BTRFS_SUPER_MAGIC);
510 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to ".readahead.new" first, rename into place when complete. */
512 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
517 pack = fopen(pack_fn_new, "we");
519 log_error("Failed to open pack file: %m");
/* Header: host/version banner plus 'S' (ssd path) or 'R' (rotating). */
524 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
525 putc(on_ssd ? 'S' : 'R', pack);
527 if (on_ssd || on_btrfs) {
529 /* On SSD or on btrfs, just write things out in the
530 * order the files were accessed. */
532 HASHMAP_FOREACH_KEY(q, p, files, i)
533 pack_file(pack, p, on_btrfs);
535 struct item *ordered, *j;
538 /* On rotating media, order things by the block
541 log_debug("Ordering...");
/* Copy hashmap entries into a flat array and sort with qsort_compare()
 * (bin first, then block, then path). */
543 n = hashmap_size(files);
544 if (!(ordered = new(struct item, n))) {
550 HASHMAP_FOREACH_KEY(q, p, files, i) {
551 memcpy(j, q, sizeof(struct item));
555 assert(ordered + n == j);
557 qsort(ordered, n, sizeof(struct item), qsort_compare);
559 for (k = 0; k < n; k++)
560 pack_file(pack, ordered[k].path, on_btrfs);
565 log_debug("Finalizing...");
570 log_error("Failed to write pack file.");
/* Atomically publish the finished pack file. */
575 if (rename(pack_fn_new, pack_fn) < 0) {
576 log_error("Failed to rename readahead file: %m");
/* Cleanup path: close all fds and release the collected entries. */
587 if (fanotify_fd >= 0)
588 close_nointr_nofail(fanotify_fd);
591 close_nointr_nofail(signal_fd);
594 close_nointr_nofail(inotify_fd);
603 while ((p = hashmap_steal_first_key(files)))
608 if (previous_block_readahead_set) {
611 /* Restore the original kernel readahead setting if we
612 * changed it, and nobody has overwritten it since
614 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
615 block_set_readahead(root, previous_block_readahead);
621 int main_collect(const char *root) {
626 /* Skip this step on read-only media. Note that we check the
627 * underlying block device here, not he read-only flag of the
628 * file system on top, since that one is most likely mounted
629 * read-only anyway at boot, even if the underlying block
630 * device is theoretically writable. */
631 if (fs_on_read_only(root) > 0) {
632 log_info("Disabling readahead collector due to read-only media.");
637 log_info("Disabling readahead collector due to low memory.");
641 shared = shared_get();
645 shared->collect = getpid();
646 __sync_synchronize();
648 if (collect(root) < 0)