1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #ifdef HAVE_FANOTIFY_INIT
47 #include <sys/fanotify.h>
50 #include <systemd/sd-daemon.h>
56 #include "readahead-common.h"
61 * - detect ssd on btrfs/lvm...
62 * - read ahead directories
65 * - handle files where nothing is in mincore
66 * - does ioprio_set work with fadvise()?
/* Shared-memory handle used to coordinate with the readahead replay
 * process (published via shared_get() in main_collect; the replay
 * side's PID is read from it in collect() to skip its own accesses). */
69 static ReadaheadShared *shared = NULL;

/* Disk sector numbers are stored as hashmap *values*, i.e. as pointers.
 * Bias by +1 so that sector 0 does not become NULL, which the hashmap
 * would treat as "no entry". PTR_TO_SECTOR undoes the bias. */
71 /* Avoid collisions with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd so later readahead
 * of it is more sequential. Returns the raw ioctl() result: 0 on
 * success, -1 with errno set on failure.
 * NOTE(review): this excerpt skips original lines 77-80; `data` is
 * used without visible initialization here — presumably it is zeroed
 * in the omitted lines. TODO confirm against the full source. */
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Write one file's record into the pack file: (elided: the file name),
 * its inode number, and the list of page ranges currently resident in
 * the page cache per mincore(), so replay can fault in exactly those
 * ranges later. Returns r (0 on success; error paths are mostly elided
 * in this excerpt).
 * NOTE(review): original lines are missing throughout (85, 87-91,
 * 93-96, ...); declarations of st, l, vec, pages, mapped, b, c and
 * several error branches are not visible here. */
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
86 void *start = MAP_FAILED;
92 int r = 0, fd = -1, k;
/* O_NOATIME: don't perturb access times; O_NOFOLLOW: don't chase
 * symlinks for the recorded path. */
97 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission failures are expected for some files and are not fatal
 * (the elided branch presumably skips the file quietly). */
103 if (errno == EPERM || errno == EACCES)
106 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the file (regular, within arg_file_size_max) and fetch
 * its stat buffer. */
111 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file (rounded up to page granularity) so mincore()
 * can report which of its pages are resident. */
120 l = PAGE_ALIGN(st.st_size);
121 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
122 if (start == MAP_FAILED) {
123 log_warning("mmap(%s) failed: %m", fn);
/* One byte of `vec` per page; bit 0 set means the page is resident. */
128 pages = l / page_size();
130 memset(vec, 0, pages);
131 if (mincore(start, l, vec) < 0) {
132 log_warning("mincore(%s) failed: %m", fn);
140 /* Store the inode, so that we notice when the file is deleted */
141 inode = (uint64_t) st.st_ino;
142 fwrite(&inode, sizeof(inode), 1, pack);
/* Run-length encode the residency vector: emit a (begin, end) page
 * pair each time a run of resident pages ends. */
145 for (c = 0; c < pages; c++) {
146 bool new_mapped = !!(vec[c] & 1);
148 if (!mapped && new_mapped)
150 else if (mapped && !new_mapped) {
151 fwrite(&b, sizeof(b), 1, pack);
152 fwrite(&c, sizeof(c), 1, pack);
154 log_debug("%s: page %u to %u", fn, b, c);
160 /* We don't write any range data if we should read the entire file */
/* Flush a trailing run that was still open when the loop ended —
 * except when it starts at page 0, i.e. the whole file is resident. */
161 if (mapped && b > 0) {
162 fwrite(&b, sizeof(b), 1, pack);
163 fwrite(&c, sizeof(c), 1, pack);
165 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: a (b, b) pair (b's value at this point is set
 * in elided lines — presumably 0; TODO confirm). */
170 fwrite(&b, sizeof(b), 1, pack);
171 fwrite(&b, sizeof(b), 1, pack);
/* Cleanup: unmap if mapped (munmap call elided) and close the fd. */
174 if (start != MAP_FAILED)
178 close_nointr_nofail(fd);
/* Return the physical byte offset of the file's first extent, used as
 * a sort key so rotating media can be read in on-disk order. Returns 0
 * (via elided branches) when FIEMAP fails, reports no extents, or the
 * first extent's location is unknown.
 * NOTE(review): the declaration of `data` — presumably a union/struct
 * wrapping `fiemap` plus one inline `fiemap_extent` — is in elided
 * lines 184/187-189; TODO confirm against the full source. */
183 static unsigned long fd_first_block(int fd) {
185 struct fiemap fiemap;
186 struct fiemap_extent extent;
/* Map the whole file but ask for at most one extent — only the first
 * one's physical address matters for ordering. */
190 data.fiemap.fm_length = ~0ULL;
191 data.fiemap.fm_extent_count = 1;
193 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
196 if (data.fiemap.fm_mapped_extents <= 0)
199 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
202 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: order primarily by on-disk block
 * (ascending, so rotating media are read front-to-back), falling back
 * to path comparison for a stable, deterministic order when blocks tie.
 * NOTE(review): the assignments of i/j from a/b and the return
 * statements of the two block comparisons are in elided lines. */
210 static int qsort_compare(const void *a, const void *b) {
211 const struct item *i, *j;
216 if (i->block < j->block)
218 if (i->block > j->block)
221 return strcmp(i->path, j->path);
/* Core of the readahead collector: watch every file opened below
 * `root` (via a fanotify mount mark) until a timeout, a signal, or a
 * cancel/done request arrives, then write the observed set of files —
 * with their resident page ranges — to ROOT/.readahead, ordered by
 * disk block on rotating media. Returns r (0 on success, negative on
 * failure).
 * NOTE(review): this excerpt elides many original lines — enum/variable
 * declarations, goto cleanup targets, loop headers, and most closing
 * braces are not visible. Comments below are grounded only in the
 * visible lines. */
224 static int collect(const char *root) {
226 FD_FANOTIFY, /* Get the actual fs events */
228 FD_INOTIFY, /* We get notifications to quit early via this fd */
231 struct pollfd pollfd[_FD_MAX];
232 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
234 Hashmap *files = NULL;
239 char *pack_fn_new = NULL, *pack_fn = NULL;
240 bool on_ssd, on_btrfs;
243 uint64_t previous_block_readahead;
244 bool previous_block_readahead_set = false;
248 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
253 /* If there's no pack file yet we lower the kernel readahead
254 * so that mincore() is accurate. If there is a pack file
255 * already we assume it is accurate enough so that kernel
256 * readahead is never triggered. */
257 previous_block_readahead_set =
258 access(pack_fn, F_OK) < 0 &&
259 block_get_readahead(root, &previous_block_readahead) >= 0 &&
260 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection doesn't slow down boot. */
262 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
263 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them via a signalfd instead, so
 * they can be multiplexed with the event fds in poll(). */
265 assert_se(sigemptyset(&mask) == 0);
266 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
267 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
269 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
270 log_error("signalfd(): %m");
/* Maps file path -> SECTOR_TO_PTR(first block); also deduplicates. */
275 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
276 log_error("Failed to allocate set.");
/* fanotify reports FAN_OPEN events for the whole mount containing
 * `root`; the event carries an already-open O_NOATIME fd. */
281 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
282 log_error("Failed to create fanotify object: %m");
287 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
288 log_error("Failed to mark %s: %m", root);
/* inotify watch (set up by open_inotify(), elided helper) delivers
 * the early-exit "cancel"/"done" requests. */
293 if ((inotify_fd = open_inotify()) < 0) {
/* Absolute deadline for the collection phase. */
298 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
303 pollfd[FD_FANOTIFY].fd = fanotify_fd;
304 pollfd[FD_FANOTIFY].events = POLLIN;
305 pollfd[FD_SIGNAL].fd = signal_fd;
306 pollfd[FD_SIGNAL].events = POLLIN;
307 pollfd[FD_INOTIFY].fd = inotify_fd;
308 pollfd[FD_INOTIFY].events = POLLIN;
312 "STATUS=Collecting readahead data");
314 log_debug("Collecting...");
/* Handle a cancel/done flag that was created before our inotify
 * watch became active. */
316 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
317 log_debug("Collection canceled");
322 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
323 log_debug("Got termination request");
329 struct fanotify_event_metadata metadata;
333 struct fanotify_event_metadata *m;
/* Stop conditions: too many files collected, or deadline passed. */
337 if (hashmap_size(files) > arg_files_max) {
338 log_debug("Reached maximum number of read ahead files, ending collection.");
342 t = now(CLOCK_MONOTONIC);
343 if (t >= not_after) {
344 log_debug("Reached maximum collection time, ending collection.");
/* Wait for any of the three fds, at most until the deadline. */
348 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
353 log_error("poll(): %m");
359 log_debug("Reached maximum collection time, ending collection.");
361 /* (elided) h == 0 means poll() timed out. */
363 if (pollfd[FD_SIGNAL].revents) {
364 log_debug("Got signal.");
/* Drain inotify: a "cancel" or "done" file created in the runtime
 * directory ends collection early. */
368 if (pollfd[FD_INOTIFY].revents) {
369 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
370 struct inotify_event *e;
372 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
373 if (errno == EINTR || errno == EAGAIN)
376 log_error("Failed to read inotify event: %m");
381 e = (struct inotify_event*) inotify_buffer;
385 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
386 log_debug("Collection canceled");
391 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
392 log_debug("Got termination request");
/* Advance to the next (variable-length) inotify event in the buffer. */
396 step = sizeof(struct inotify_event) + e->len;
397 assert(step <= (size_t) n);
399 e = (struct inotify_event*) ((uint8_t*) e + step);
/* Drain fanotify events. */
404 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
406 if (errno == EINTR || errno == EAGAIN)
409 /* fanotify sometimes returns EACCES on read()
410 * where it shouldn't. For now let's just
411 * ignore it here (which is safe), but
412 * eventually this should be
413 * dropped when the kernel is fixed.
415 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
419 log_error("Failed to read event: %m");
424 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens done by ourselves and by the replay process, so
 * readahead doesn't feed back into itself. */
431 if (m->pid == my_pid)
434 __sync_synchronize();
435 if (m->pid == shared->replay)
/* Recover the opened file's path from the event fd via /proc. */
438 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
441 if ((k = readlink_malloc(fn, &p)) >= 0) {
442 if (startswith(p, "/tmp") ||
443 endswith(p, " (deleted)") ||
444 hashmap_get(files, p))
445 /* Not interesting, or
/* Record the path with its first disk block as sort key. */
451 ul = fd_first_block(m->fd);
453 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
454 log_warning("set_put() failed: %s", strerror(-k));
460 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the event-supplied fd, whatever happened above. */
464 close_nointr_nofail(m->fd);
/* Collection done — stop watching before the write phase, so our own
 * pack-file I/O isn't recorded. */
469 if (fanotify_fd >= 0) {
470 close_nointr_nofail(fanotify_fd);
474 log_debug("Writing Pack File...");
476 on_ssd = fs_on_ssd(root) > 0;
477 log_debug("On SSD: %s", yes_no(on_ssd));
479 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
480 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temp name first, then rename() over the final pack file
 * so readers never see a partial file. */
482 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
/* "we": write + O_CLOEXEC. */
487 pack = fopen(pack_fn_new, "we");
489 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus one byte for media type. */
494 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
495 putc(on_ssd ? 'S' : 'R', pack);
497 if (on_ssd || on_btrfs) {
499 /* On SSD or on btrfs, just write things out in the
500 * order the files were accessed. */
502 HASHMAP_FOREACH_KEY(q, p, files, i)
503 pack_file(pack, p, on_btrfs);
505 struct item *ordered, *j;
508 /* On rotating media, order things by the block
511 log_debug("Ordering...");
513 n = hashmap_size(files);
514 if (!(ordered = new(struct item, n))) {
/* Flatten the hashmap into an array of (path, block) items... */
520 HASHMAP_FOREACH_KEY(q, p, files, i) {
522 j->block = PTR_TO_SECTOR(q);
526 assert(ordered + n == j);
/* ...sort by block, then emit in disk order. */
528 qsort(ordered, n, sizeof(struct item), qsort_compare);
530 for (k = 0; k < n; k++)
531 pack_file(pack, ordered[k].path, on_btrfs);
536 log_debug("Finalizing...");
541 log_error("Failed to write pack file.");
/* Atomically publish the new pack file. */
546 if (rename(pack_fn_new, pack_fn) < 0) {
547 log_error("Failed to rename readahead file: %m");
/* Cleanup path (elided goto target): close whatever is still open. */
558 if (fanotify_fd >= 0)
559 close_nointr_nofail(fanotify_fd);
562 close_nointr_nofail(signal_fd);
565 close_nointr_nofail(inotify_fd);
/* Free the collected path strings still owned by the hashmap. */
574 while ((p = hashmap_steal_first_key(files)))
579 if (previous_block_readahead_set) {
582 /* Restore the original kernel readahead setting if we
583 * changed it, and nobody has overwritten it since
/* Only restore if the current value is still our 8 KiB override. */
585 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
586 block_set_readahead(root, previous_block_readahead);
592 int main_collect(const char *root) {
597 /* Skip this step on read-only media. Note that we check the
598 * underlying block device here, not the read-only flag of the
599 * file system on top, since that one is most likely mounted
600 * read-only anyway at boot, even if the underlying block
601 * device is theoretically writable. */
602 if (fs_on_read_only(root) > 0) {
603 log_info("Disabling readahead collector due to read-only media.");
608 log_info("Disabling readahead collector due to low memory.");
612 shared = shared_get();
616 shared->collect = getpid();
617 __sync_synchronize();
619 if (collect(root) < 0)