1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include <systemd/sd-daemon.h>
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
74 static ReadaheadShared *shared = NULL;
75 static usec_t starttime;
77 /* Avoid collisions with the NULL pointer */
78 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
79 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd, so that a
 * later sequential read of it is cheap.  Returns the raw ioctl()
 * result: 0 on success, -1 with errno set on failure. */
81 static int btrfs_defrag(int fd) {
82 struct btrfs_ioctl_vol_args data = { .fd = fd };
84 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one record for the file "fn" to the pack stream: the file's
 * inode number followed by the list of page ranges currently resident
 * in the page cache (as reported by mincore()), terminated by an end
 * marker.  Returns 0 on success or a negative errno-style code in r.
 * NOTE(review): this view of the function is missing several lines
 * (declarations, error returns, munmap/close cleanup), so the comments
 * below cover only the visible logic. */
87 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
89 void *start = MAP_FAILED;
95 int r = 0, fd = -1, k;
/* Open without updating atime and refuse to follow symlinks */
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Files we are not allowed to read are silently skipped, not errors */
106 if (errno == EPERM || errno == EACCES)
109 log_warning("open(%s) failed: %m", fn);
114 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the entire file so mincore() can report its cache state */
123 l = PAGE_ALIGN(st.st_size);
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
126 log_warning("mmap(%s) failed: %m", fn);
/* One byte per page; bit 0 set means the page is resident in memory */
131 pages = l / page_size();
132 vec = alloca0(pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
/* Coalesce runs of consecutive resident pages into [b, c) ranges */
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
150 if (!mapped && new_mapped)
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
156 log_debug("%s: page %u to %u", fn, b, c);
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
167 log_debug("%s: page %u to %u", fn, b, c);
/* End marker terminating this file's range list.  NOTE(review): the
 * assignment giving b its marker value is not visible in this view. */
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: only unmap if the mapping actually succeeded */
176 if (start != MAP_FAILED)
/* Return the physical location (byte offset on the block device,
 * truncated to unsigned long) of the first extent of fd, so files can
 * be sorted by on-disk position for rotating media.  Returns 0 when
 * the location is unknown or on any error. */
184 static unsigned long fd_first_block(int fd) {
186 struct fiemap fiemap;
187 struct fiemap_extent extent;
/* Query the whole file, but ask the kernel for only the first extent */
189 .fiemap.fm_length = ~0ULL,
190 .fiemap.fm_extent_count = 1,
193 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
196 if (data.fiemap.fm_mapped_extents <= 0)
/* A location the kernel cannot vouch for is treated as "no location" */
199 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
202 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() callback ordering collected items for rotating media:
 * first by access-time bin, then by physical block location, and
 * finally by path so the ordering is total and deterministic. */
211 static int qsort_compare(const void *a, const void *b) {
212 const struct item *i, *j;
217 /* sort by bin first */
223 /* then sort by sector */
224 if (i->block < j->block)
226 if (i->block > j->block)
/* Fall back to the path as the final tie-breaker */
229 return strcmp(i->path, j->path);
/* Core of the readahead collector: watch the file system below "root"
 * via fanotify for files being opened during boot, record which pages
 * of each are in the page cache, and write the result out atomically
 * as "<root>/.readahead".  Collection ends on timeout, signal, file
 * count limit, or an external "cancel"/"done" control file.
 * Returns 0 on success or a negative errno-style code in r.
 * NOTE(review): many lines are missing from this view (declarations,
 * error paths, goto labels); comments cover only the visible logic. */
232 static int collect(const char *root) {
234 FD_FANOTIFY, /* Get the actual fs events */
236 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 struct pollfd pollfd[_FD_MAX] = {};
240 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
242 Hashmap *files = NULL;
247 char *pack_fn_new = NULL, *pack_fn = NULL;
248 bool on_ssd, on_btrfs;
251 uint64_t previous_block_readahead;
252 bool previous_block_readahead_set = false;
256 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
261 starttime = now(CLOCK_MONOTONIC);
263 /* If there's no pack file yet we lower the kernel readahead
264 * so that mincore() is accurate. If there is a pack file
265 * already we assume it is accurate enough so that kernel
266 * readahead is never triggered. */
267 previous_block_readahead_set =
268 access(pack_fn, F_OK) < 0 &&
269 block_get_readahead(root, &previous_block_readahead) >= 0 &&
270 block_set_readahead(root, 8*1024) >= 0;
/* Run with idle I/O priority so collection doesn't slow down boot */
272 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
273 log_warning("Failed to set IDLE IO priority class: %m");
/* Route SIGINT/SIGTERM through a signalfd so poll() can watch them */
275 assert_se(sigemptyset(&mask) == 0);
276 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
277 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
279 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
280 log_error("signalfd(): %m");
/* Maps file path -> struct item (access time bin, block location) */
285 files = hashmap_new(string_hash_func, string_compare_func);
287 log_error("Failed to allocate set.");
/* Get notified of every open() on the whole mount below root */
292 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
293 if (fanotify_fd < 0) {
294 log_error("Failed to create fanotify object: %m");
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
/* Used to notice "cancel"/"done" control files appearing at runtime */
305 inotify_fd = open_inotify();
306 if (inotify_fd < 0) {
/* Collection ends at the latest once arg_timeout has elapsed */
311 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
324 "STATUS=Collecting readahead data");
326 log_debug("Collecting...");
/* The control files may already exist from before inotify was set up */
328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
329 log_debug("Collection canceled");
334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
335 log_debug("Got termination request");
341 struct fanotify_event_metadata metadata;
345 struct fanotify_event_metadata *m;
/* Stop once we have collected enough files ... */
349 if (hashmap_size(files) > arg_files_max) {
350 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ... or once the collection deadline has passed */
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
365 log_error("poll(): %m");
371 log_debug("Reached maximum collection time, ending collection.");
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
388 log_error("Failed to read inotify event: %m");
/* Walk every event packed into the buffer; each may be a control file */
393 e = (struct inotify_event*) inotify_buffer;
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
411 e = (struct inotify_event*) ((uint8_t*) e + step);
416 n = read(fanotify_fd, &data, sizeof(data));
419 if (errno == EINTR || errno == EAGAIN)
422 /* fanotify sometimes returns EACCES on read()
423 * where it shouldn't. For now let's just
424 * ignore it here (which is safe), but
425 * eventually this should be
426 * dropped when the kernel is fixed.
428 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
432 log_error("Failed to read event: %m");
437 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
438 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
/* Ignore opens caused by ourselves and by the replay process */
444 if (m->pid == my_pid)
447 __sync_synchronize();
448 if (m->pid == shared->replay)
/* Resolve the event's fd back to a path via /proc/self/fd */
451 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
452 k = readlink_malloc(fn, &p);
454 if (startswith(p, "/tmp") ||
455 endswith(p, " (deleted)") ||
456 hashmap_get(files, p))
457 /* Not interesting, or
465 entry = new0(struct item, 1);
/* Remember where the file lives on disk, for sorting later */
471 ul = fd_first_block(m->fd);
473 entrytime = now(CLOCK_MONOTONIC);
476 entry->path = strdup(p);
/* Bucket files by time of first access, in 2s bins */
482 entry->bin = (entrytime - starttime) / 2000000;
484 k = hashmap_put(files, p, entry);
486 log_warning("hashmap_put() failed: %s", strerror(-k));
492 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Done watching; now turn the collected set into a pack file */
500 fanotify_fd = safe_close(fanotify_fd);
502 log_debug("Writing Pack File...");
504 on_ssd = fs_on_ssd(root) > 0;
505 log_debug("On SSD: %s", yes_no(on_ssd));
507 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
508 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary name first; renamed into place at the end */
510 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
515 pack = fopen(pack_fn_new, "we");
517 log_error("Failed to open pack file: %m");
/* Header: host/version string plus an 'S' (SSD) or 'R' (rotating) tag */
522 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
523 putc(on_ssd ? 'S' : 'R', pack);
525 if (on_ssd || on_btrfs) {
527 /* On SSD or on btrfs, just write things out in the
528 * order the files were accessed. */
530 HASHMAP_FOREACH_KEY(q, p, files, i)
531 pack_file(pack, p, on_btrfs);
535 /* On rotating media, order things by the block
538 log_debug("Ordering...");
540 n = hashmap_size(files);
542 _cleanup_free_ struct item *ordered;
546 ordered = new(struct item, n);
/* Flatten the hashmap into an array so it can be qsort()ed */
553 HASHMAP_FOREACH_KEY(q, p, files, i) {
554 memcpy(j, q, sizeof(struct item));
558 assert(ordered + n == j);
560 qsort(ordered, n, sizeof(struct item), qsort_compare);
562 for (k = 0; k < n; k++)
563 pack_file(pack, ordered[k].path, on_btrfs);
565 log_warning("No pack files");
568 log_debug("Finalizing...");
/* NOTE(review): presumably an fflush()/ferror() check precedes this
 * error message — the check itself is not visible in this view */
573 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file */
578 if (rename(pack_fn_new, pack_fn) < 0) {
579 log_error("Failed to rename readahead file: %m");
/* Cleanup path: close all fds and free the collected entries */
590 safe_close(fanotify_fd);
591 safe_close(signal_fd);
592 safe_close(inotify_fd);
601 while ((p = hashmap_steal_first_key(files)))
606 if (previous_block_readahead_set) {
609 /* Restore the original kernel readahead setting if we
610 * changed it, and nobody has overwritten it since
612 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
613 block_set_readahead(root, previous_block_readahead);
/* Entry point of the collection phase: bail out early on read-only
 * media or low memory, publish our PID in the shared area so the
 * replay process can ignore the opens we cause, then run collect().
 * NOTE(review): the function continues beyond this view. */
619 int main_collect(const char *root) {
624 /* Skip this step on read-only media. Note that we check the
625 * underlying block device here, not the read-only flag of the
626 * file system on top, since that one is most likely mounted
627 * read-only anyway at boot, even if the underlying block
628 * device is theoretically writable. */
629 if (fs_on_read_only(root) > 0) {
630 log_info("Disabling readahead collector due to read-only media.");
635 log_info("Disabling readahead collector due to low memory.");
/* Publish our PID for the replay process */
639 shared = shared_get();
643 shared->collect = getpid();
/* Make sure the PID store is visible before events start flowing */
644 __sync_synchronize();
646 if (collect(root) < 0)