1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <linux/btrfs.h>
42 #include <sys/ioctl.h>
45 #include <sys/inotify.h>
48 #ifdef HAVE_FANOTIFY_INIT
49 #include <sys/fanotify.h>
52 #include <systemd/sd-daemon.h>
58 #include "readahead-common.h"
63 * - detect ssd on btrfs/lvm...
64 * - read ahead directories
67 * - handle files where nothing is in mincore
68 * - does ioprio_set work with fadvise()?
/* Shared memory segment used to coordinate the readahead processes:
 * collect() skips fanotify events whose pid matches shared->replay,
 * and main_collect() publishes our own pid in shared->collect. */
71 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; collect()
 * derives each file's access-time "bin" from it. */
72 static usec_t starttime;
74 /* Avoid collisions with the NULL pointer */
/* Sector numbers are presumably stashed as pointer values elsewhere in
 * this file (not visible in this excerpt); the +1/-1 pair keeps sector 0
 * distinct from NULL -- TODO confirm against the full source. */
75 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
76 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd, so that its
 * extents become contiguous on disk and later readahead is sequential.
 * Returns the raw ioctl() result: 0 on success, -1 with errno set on
 * failure (e.g. when fd does not refer to a file on btrfs).
 *
 * NOTE(review): this excerpt had lost the function's closing brace and
 * carried pasted-in line-number prefixes; both repaired here. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/*
 * pack_file() -- append one file's page-cache profile to the pack file:
 * its inode number plus the list of page ranges currently resident in
 * memory (per mincore()), so that a later replay run can fault exactly
 * those ranges back in. Returns 0 on success (via r; most of the error
 * paths are missing from this excerpt).
 *
 * NOTE(review): the embedded original numbering jumps (84, 86, 92, 97,
 * ...), i.e. many lines are missing here -- among them the declarations
 * of st, l, vec, pages, b, c and mapped, several branch bodies and
 * closing braces. Recover the complete function from version control
 * before editing; the comments below describe only what is visible.
 */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
92 int r = 0, fd = -1, k;
/* O_NOATIME keeps profiling from perturbing access times; O_NOFOLLOW
 * refuses to be redirected through symlinks. */
97 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* EPERM/EACCES are expected for files we may not read -- the quiet-skip
 * branch body is missing from this excerpt. */
103 if (errno == EPERM || errno == EACCES)
106 log_warning("open(%s) failed: %m", fn);
111 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file (page-aligned length) read-only so mincore() can
 * report which of its pages are resident. */
120 l = PAGE_ALIGN(st.st_size);
121 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
122 if (start == MAP_FAILED) {
123 log_warning("mmap(%s) failed: %m", fn);
128 pages = l / page_size();
130 memset(vec, 0, pages);
131 if (mincore(start, l, vec) < 0) {
132 log_warning("mincore(%s) failed: %m", fn);
140 /* Store the inode, so that we notice when the file is deleted */
141 inode = (uint64_t) st.st_ino;
142 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the mincore() vector and emit one (begin, end) page pair per run
 * of resident pages; bit 0 of each vec entry means "resident". */
145 for (c = 0; c < pages; c++) {
146 bool new_mapped = !!(vec[c] & 1);
148 if (!mapped && new_mapped)
150 else if (mapped && !new_mapped) {
151 fwrite(&b, sizeof(b), 1, pack);
152 fwrite(&c, sizeof(c), 1, pack);
154 log_debug("%s: page %u to %u", fn, b, c);
160 /* We don't write any range data if we should read the entire file */
/* Flush the final run if the file ended while a run was still open. */
161 if (mapped && b > 0) {
162 fwrite(&b, sizeof(b), 1, pack);
163 fwrite(&c, sizeof(c), 1, pack);
165 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: the same value written twice (presumably b has
 * been reset to 0 by code missing from this excerpt -- verify upstream). */
170 fwrite(&b, sizeof(b), 1, pack);
171 fwrite(&b, sizeof(b), 1, pack);
174 if (start != MAP_FAILED)
178 close_nointr_nofail(fd);
/* Return the physical position (in bytes) of the first extent of the
 * file referred to by fd, used to order files by on-disk layout for
 * rotating media. Returns 0 when the position cannot be determined
 * (FIEMAP unsupported, no extents mapped, or extent location unknown);
 * such files simply sort first.
 *
 * NOTE(review): the anonymous struct wrapping one struct fiemap plus a
 * single inline struct fiemap_extent had been partially lost in this
 * excerpt; reconstructed from the visible initializers
 * (.fiemap.fm_length, .fiemap.fm_extent_count) and field accesses. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;       /* header; fm_extents[] overlays... */
                struct fiemap_extent extent; /* ...this single inline extent */
        } data = {
                .fiemap.fm_length = ~0ULL,   /* map the entire file */
                .fiemap.fm_extent_count = 1, /* we only need the first extent */
        };

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
210 static int qsort_compare(const void *a, const void *b) {
211 const struct item *i, *j;
216 /* sort by bin first */
222 /* then sort by sector */
223 if (i->block < j->block)
225 if (i->block > j->block)
228 return strcmp(i->path, j->path);
/*
 * collect() -- watch the mount point at root with fanotify for the
 * duration of collection and write a pack file ($root/.readahead)
 * recording every file opened (and which of its pages got cached), so
 * that a later replay run can prefetch them. Returns 0 on success,
 * negative on failure (via r; most error paths and gotos are missing
 * from this excerpt).
 *
 * NOTE(review): the embedded original numbering jumps throughout, i.e.
 * this excerpt is missing many lines -- the enum wrapper around
 * FD_FANOTIFY/FD_SIGNAL/FD_INOTIFY, declarations (mask, t, n, h, data,
 * fn, p, q, sfs, pack, my_pid, not_after, step, ...), loop headers,
 * most closing braces and all goto labels. Recover the complete
 * function from version control before editing; comments below describe
 * only what is visible.
 */
231 static int collect(const char *root) {
233 FD_FANOTIFY, /* Get the actual fs events */
235 FD_INOTIFY, /* We get notifications to quit early via this fd */
238 struct pollfd pollfd[_FD_MAX] = {};
239 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
241 Hashmap *files = NULL;
246 char *pack_fn_new = NULL, *pack_fn = NULL;
247 bool on_ssd, on_btrfs;
250 uint64_t previous_block_readahead;
251 bool previous_block_readahead_set = false;
255 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
260 starttime = now(CLOCK_MONOTONIC);
262 /* If there's no pack file yet we lower the kernel readahead
263 * so that mincore() is accurate. If there is a pack file
264 * already we assume it is accurate enough so that kernel
265 * readahead is never triggered. */
266 previous_block_readahead_set =
267 access(pack_fn, F_OK) < 0 &&
268 block_get_readahead(root, &previous_block_readahead) >= 0 &&
269 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so profiling does not compete with boot I/O. */
271 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
272 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them through a signalfd instead, so
 * they can be handled synchronously inside the poll() loop below. */
274 assert_se(sigemptyset(&mask) == 0);
275 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
276 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
278 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
279 log_error("signalfd(): %m");
284 files = hashmap_new(string_hash_func, string_compare_func);
286 log_error("Failed to allocate set.");
/* fanotify with FAN_OPEN marked on the whole mount reports every file
 * opened below root. */
291 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
292 if (fanotify_fd < 0) {
293 log_error("Failed to create fanotify object: %m");
298 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
299 log_error("Failed to mark %s: %m", root);
/* inotify watch through which "cancel"/"done" files under
 * /run/systemd/readahead end collection early (see below). */
304 inotify_fd = open_inotify();
305 if (inotify_fd < 0) {
/* Hard deadline for the whole collection run. */
310 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
314 pollfd[FD_FANOTIFY].fd = fanotify_fd;
315 pollfd[FD_FANOTIFY].events = POLLIN;
316 pollfd[FD_SIGNAL].fd = signal_fd;
317 pollfd[FD_SIGNAL].events = POLLIN;
318 pollfd[FD_INOTIFY].fd = inotify_fd;
319 pollfd[FD_INOTIFY].events = POLLIN;
323 "STATUS=Collecting readahead data");
325 log_debug("Collecting...");
/* Re-check the flag files on every iteration in case they were created
 * before the inotify watch became active. */
327 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
328 log_debug("Collection canceled");
333 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
334 log_debug("Got termination request");
340 struct fanotify_event_metadata metadata;
344 struct fanotify_event_metadata *m;
348 if (hashmap_size(files) > arg_files_max) {
349 log_debug("Reached maximum number of read ahead files, ending collection.");
353 t = now(CLOCK_MONOTONIC);
354 if (t >= not_after) {
355 log_debug("Reached maximum collection time, ending collection.");
/* Wait for any of the three fds, but never past the deadline. */
359 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
364 log_error("poll(): %m");
370 log_debug("Reached maximum collection time, ending collection.");
374 if (pollfd[FD_SIGNAL].revents) {
375 log_debug("Got signal.");
379 if (pollfd[FD_INOTIFY].revents) {
380 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
381 struct inotify_event *e;
383 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
384 if (errno == EINTR || errno == EAGAIN)
387 log_error("Failed to read inotify event: %m");
/* Walk the batch of variable-length inotify events. */
392 e = (struct inotify_event*) inotify_buffer;
396 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
397 log_debug("Collection canceled");
402 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
403 log_debug("Got termination request");
407 step = sizeof(struct inotify_event) + e->len;
408 assert(step <= (size_t) n);
410 e = (struct inotify_event*) ((uint8_t*) e + step);
415 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
417 if (errno == EINTR || errno == EAGAIN)
420 /* fanotify sometimes returns EACCES on read()
421 * where it shouldn't. For now let's just
422 * ignore it here (which is safe), but
423 * eventually this should be
424 * dropped when the kernel is fixed.
426 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
430 log_error("Failed to read event: %m");
/* Walk the batch of fanotify events. */
435 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens caused by ourselves or by the replay process, whose pid
 * is published in the shared memory segment. */
442 if (m->pid == my_pid)
445 __sync_synchronize();
446 if (m->pid == shared->replay)
/* Resolve the event fd to a path via /proc/self/fd. */
449 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
452 if ((k = readlink_malloc(fn, &p)) >= 0) {
453 if (startswith(p, "/tmp") ||
454 endswith(p, " (deleted)") ||
455 hashmap_get(files, p))
456 /* Not interesting, or
464 entry = new0(struct item, 1);
470 ul = fd_first_block(m->fd);
472 entrytime = now(CLOCK_MONOTONIC);
475 entry->path = strdup(p);
/* Bin files into 2s buckets of elapsed time since collection start;
 * the bin is the primary sort key in qsort_compare(). */
481 entry->bin = (entrytime - starttime) / 2000000;
483 k = hashmap_put(files, p, entry);
485 log_warning("hashmap_put() failed: %s", strerror(-k));
491 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Each fanotify event carries an open fd we must close. */
495 close_nointr_nofail(m->fd);
/* Collection finished: close fanotify before the write-out phase so our
 * own reads below do not generate further events. */
500 if (fanotify_fd >= 0) {
501 close_nointr_nofail(fanotify_fd);
505 log_debug("Writing Pack File...");
507 on_ssd = fs_on_ssd(root) > 0;
508 log_debug("On SSD: %s", yes_no(on_ssd));
510 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_CMP(sfs.f_type, BTRFS_SUPER_MAGIC);
511 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temporary .readahead.new and rename over the final name
 * below, so readers never see a half-written pack file. */
513 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
518 pack = fopen(pack_fn_new, "we");
520 log_error("Failed to open pack file: %m");
/* Header: host/version banner plus 'S' (SSD, unordered) or 'R'
 * (rotating, block-ordered). */
525 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
526 putc(on_ssd ? 'S' : 'R', pack);
528 if (on_ssd || on_btrfs) {
530 /* On SSD or on btrfs, just write things out in the
531 * order the files were accessed. */
533 HASHMAP_FOREACH_KEY(q, p, files, i)
534 pack_file(pack, p, on_btrfs);
536 struct item *ordered, *j;
539 /* On rotating media, order things by the block
542 log_debug("Ordering...");
/* Copy the hashmap entries into a flat array so they can be qsorted. */
544 n = hashmap_size(files);
545 if (!(ordered = new(struct item, n))) {
551 HASHMAP_FOREACH_KEY(q, p, files, i) {
552 memcpy(j, q, sizeof(struct item));
556 assert(ordered + n == j);
558 qsort(ordered, n, sizeof(struct item), qsort_compare);
560 for (k = 0; k < n; k++)
561 pack_file(pack, ordered[k].path, on_btrfs);
566 log_debug("Finalizing...");
571 log_error("Failed to write pack file.");
576 if (rename(pack_fn_new, pack_fn) < 0) {
577 log_error("Failed to rename readahead file: %m");
/* Cleanup path (labels missing from this excerpt): close fds, free the
 * hashmap contents, restore kernel readahead. */
588 if (fanotify_fd >= 0)
589 close_nointr_nofail(fanotify_fd);
592 close_nointr_nofail(signal_fd);
595 close_nointr_nofail(inotify_fd);
604 while ((p = hashmap_steal_first_key(files)))
609 if (previous_block_readahead_set) {
612 /* Restore the original kernel readahead setting if we
613 * changed it, and nobody has overwritten it since
615 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
616 block_set_readahead(root, previous_block_readahead);
622 int main_collect(const char *root) {
627 /* Skip this step on read-only media. Note that we check the
628 * underlying block device here, not the read-only flag of the
629 * file system on top, since that one is most likely mounted
630 * read-only anyway at boot, even if the underlying block
631 * device is theoretically writable. */
632 if (fs_on_read_only(root) > 0) {
633 log_info("Disabling readahead collector due to read-only media.");
638 log_info("Disabling readahead collector due to low memory.");
642 shared = shared_get();
646 shared->collect = getpid();
647 __sync_synchronize();
649 if (collect(root) < 0)