1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
51 #include <systemd/sd-daemon.h>
57 #include "readahead-common.h"
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
/* Shared-memory state used to coordinate with the readahead replay
 * process; initialized in main_collect() via shared_get(). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken at the start of collection; used in
 * collect() to assign each accessed file to a time bin. */
71 static usec_t starttime;
73 /* Avoid collisions with the NULL pointer */
/* Disk sectors are stored as hashmap values cast to pointers; the +1/-1
 * shift keeps sector 0 from being confused with a NULL value. */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask the kernel to defragment the open file referred to by 'fd' on
 * btrfs. Passes the fd itself in the ioctl argument struct and returns
 * the raw ioctl() result: 0 on success, -1 with errno set on failure. */
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data = { .fd = fd };
80 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one file's in-page-cache map to the pack file.
 *
 * Opens 'fn' read-only (O_NOATIME so collection doesn't perturb access
 * times, O_NOFOLLOW to skip symlinks), mmap()s it, queries mincore()
 * for which pages are resident in the page cache, and writes to 'pack':
 * the inode number (for later staleness detection), then a list of
 * (begin, end) page-index pairs for each resident range, terminated by
 * a pair with begin == end.
 *
 * NOTE(review): this chunk elides several original lines (the skip/
 * error gotos after open() and file_verify(), the vec/b/c declarations,
 * and presumably a btrfs_defrag() call gated on 'on_btrfs' — confirm
 * against the full source). Returns r: 0 on success or benign skip,
 * negative on hard failure. */
83 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
85 void *start = MAP_FAILED;
91 int r = 0, fd = -1, k;
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
/* Permission errors are expected for files we may not read; treat them
 * as a skip rather than a warning-worthy failure. */
102 if (errno == EPERM || errno == EACCES)
105 log_warning("open(%s) failed: %m", fn);
/* Sanity-check the fd against size limits and file type. */
110 k = file_verify(fd, fn, arg_file_size_max, &st);
/* Map the whole file page-aligned so mincore() can be queried on it. */
119 l = PAGE_ALIGN(st.st_size);
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
122 log_warning("mmap(%s) failed: %m", fn);
/* One vector byte per page; bit 0 of each byte = page resident. */
127 pages = l / page_size();
129 memset(vec, 0, pages);
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
/* Walk the residency vector and emit closed runs of mapped pages as
 * (begin b, end c) pairs. */
144 for (c = 0; c < pages; c++) {
145 bool new_mapped = !!(vec[c] & 1);
147 if (!mapped && new_mapped)
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
153 log_debug("%s: page %u to %u", fn, b, c);
159 /* We don't write any range data if we should read the entire file */
/* Flush a run still open at EOF (b > 0 means it didn't start at page 0,
 * i.e. it is a partial-file range worth recording). */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
164 log_debug("%s: page %u to %u", fn, b, c);
/* End-of-record marker: a degenerate range with begin == end. */
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
173 if (start != MAP_FAILED)
177 close_nointr_nofail(fd);
/* Return the physical disk location of the first extent of 'fd', for
 * sorting files into on-disk order on rotating media. Uses the
 * FS_IOC_FIEMAP ioctl with fm_extent_count = 1 so the kernel fills in
 * at most the first extent. Returns 0 (the elided early-return paths,
 * presumably) when the ioctl fails, no extents are mapped, or the
 * extent's physical location is unknown — confirm against full source.
 * NOTE(review): the declaration of 'data' (a struct embedding fiemap +
 * one fiemap_extent) is partially elided in this chunk. */
182 static unsigned long fd_first_block(int fd) {
184 struct fiemap fiemap;
185 struct fiemap_extent extent;
/* Map from offset 0 over the whole file, asking for one extent only. */
187 .fiemap.fm_length = ~0ULL,
188 .fiemap.fm_extent_count = 1,
191 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
194 if (data.fiemap.fm_mapped_extents <= 0)
/* An extent whose physical location the fs cannot report is useless
 * for sorting. */
197 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
200 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders the readahead list so that
 * files accessed earlier (lower time bin) come first, then by physical
 * disk block to minimize seeks, then by path as a stable tie-breaker.
 * NOTE(review): the assignments of i/j from a/b and the bin comparison
 * itself are elided in this chunk — confirm against full source. */
209 static int qsort_compare(const void *a, const void *b) {
210 const struct item *i, *j;
215 /* sort by bin first */
221 /* then sort by sector */
222 if (i->block < j->block)
224 if (i->block > j->block)
/* Finally disambiguate by path so the sort is deterministic. */
227 return strcmp(i->path, j->path);
/* Main collection pass: watch all file opens below 'root' via fanotify
 * for a bounded time window, record which pages of each opened file are
 * in the page cache, and write the result as a pack file
 * ("<root>/.readahead") that the replay phase can stream back in early
 * at the next boot. Termination is triggered by SIGINT/SIGTERM (via
 * signalfd), by creation of /run/systemd/readahead/{cancel,done}
 * (watched via inotify), by reaching arg_files_max files, or by the
 * arg_timeout deadline. Returns 0 on success, negative on failure.
 *
 * NOTE(review): this chunk elides many original lines (goto finish
 * error paths, several declarations, loop headers); comments below are
 * written against what is visible. */
230 static int collect(const char *root) {
/* Indices into pollfd[] for the three event sources. */
232 FD_FANOTIFY, /* Get the actual fs events */
234 FD_INOTIFY, /* We get notifications to quit early via this fd */
237 struct pollfd pollfd[_FD_MAX] = {};
238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
240 Hashmap *files = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
246 bool on_ssd, on_btrfs;
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
/* Reference point for the per-file time bins computed below. */
259 starttime = now(CLOCK_MONOTONIC);
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
/* Run at idle I/O priority so collection doesn't slow down boot. */
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them through signalfd instead, so
 * they can be multiplexed with the other fds in poll(). */
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
/* path -> struct item map of everything seen so far. */
283 files = hashmap_new(string_hash_func, string_compare_func);
285 log_error("Failed to allocate set.");
/* O_NOATIME on the event fds keeps us from dirtying atimes of the very
 * files we observe. */
290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
291 if (fanotify_fd < 0) {
292 log_error("Failed to create fanotify object: %m");
/* Watch every open() on the whole mount containing 'root'. */
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
/* Absolute deadline for the collection window. */
309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
322 "STATUS=Collecting readahead data");
324 log_debug("Collecting...");
/* A cancel/done flag file may already exist before we started the
 * inotify watch — check once up front. */
326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
327 log_debug("Collection canceled");
332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
333 log_debug("Got termination request");
/* Buffer for a batch of fanotify events (declaration of the enclosing
 * union/struct 'data' is elided in this chunk). */
339 struct fanotify_event_metadata metadata;
343 struct fanotify_event_metadata *m;
347 if (hashmap_size(files) > arg_files_max) {
348 log_debug("Reached maximum number of read ahead files, ending collection.");
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
/* Wait for the next event, but never past the deadline. */
358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
363 log_error("poll(): %m");
369 log_debug("Reached maximum collection time, ending collection.");
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
386 log_error("Failed to read inotify event: %m");
/* Walk all inotify events in the buffer looking for the cancel/done
 * flag files appearing in /run/systemd/readahead. */
391 e = (struct inotify_event*) inotify_buffer;
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
409 e = (struct inotify_event*) ((uint8_t*) e + step);
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
416 if (errno == EINTR || errno == EAGAIN)
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
429 log_error("Failed to read event: %m");
434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
/* Ignore opens caused by ourselves, and by the replay process
 * (whose pid is published in the shared memory segment). */
441 if (m->pid == my_pid)
444 __sync_synchronize();
445 if (m->pid == shared->replay)
/* Resolve the event fd back to a path via /proc/self/fd. */
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
452 if (startswith(p, "/tmp") ||
453 endswith(p, " (deleted)") ||
454 hashmap_get(files, p))
455 /* Not interesting, or
/* New file: record its path, first physical block and the 2s
 * time bin since collection start. */
463 entry = new0(struct item, 1);
469 ul = fd_first_block(m->fd);
471 entrytime = now(CLOCK_MONOTONIC);
474 entry->path = strdup(p);
480 entry->bin = (entrytime - starttime) / 2000000;
482 if ((k = hashmap_put(files, p, entry)) < 0) {
483 log_warning("set_put() failed: %s", strerror(-k));
489 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
/* Always close the fd fanotify handed us, whatever happened. */
493 close_nointr_nofail(m->fd);
/* Collection over: stop receiving events before writing the pack. */
498 if (fanotify_fd >= 0) {
499 close_nointr_nofail(fanotify_fd);
503 log_debug("Writing Pack File...");
505 on_ssd = fs_on_ssd(root) > 0;
506 log_debug("On SSD: %s", yes_no(on_ssd));
508 on_btrfs = statfs(root, &sfs) >= 0 && (unsigned) sfs.f_type == BTRFS_SUPER_MAGIC;
509 log_debug("On btrfs: %s", yes_no(on_btrfs));
/* Write to a temp name first, rename() into place when complete, so a
 * crash never leaves a truncated pack file behind. */
511 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
516 pack = fopen(pack_fn_new, "we");
518 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus one byte for the media type. */
523 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
524 putc(on_ssd ? 'S' : 'R', pack);
526 if (on_ssd || on_btrfs) {
528 /* On SSD or on btrfs, just write things out in the
529 * order the files were accessed. */
531 HASHMAP_FOREACH_KEY(q, p, files, i)
532 pack_file(pack, p, on_btrfs);
534 struct item *ordered, *j;
537 /* On rotating media, order things by the block
540 log_debug("Ordering...");
/* Copy the hashmap entries into a flat array so qsort() can order
 * them by (bin, block, path). */
542 n = hashmap_size(files);
543 if (!(ordered = new(struct item, n))) {
549 HASHMAP_FOREACH_KEY(q, p, files, i) {
550 memcpy(j, q, sizeof(struct item));
554 assert(ordered + n == j);
556 qsort(ordered, n, sizeof(struct item), qsort_compare);
558 for (k = 0; k < n; k++)
559 pack_file(pack, ordered[k].path, on_btrfs);
564 log_debug("Finalizing...");
569 log_error("Failed to write pack file.");
/* Atomically publish the finished pack file. */
574 if (rename(pack_fn_new, pack_fn) < 0) {
575 log_error("Failed to rename readahead file: %m");
/* Cleanup: close whatever fds are still open and free the map. */
586 if (fanotify_fd >= 0)
587 close_nointr_nofail(fanotify_fd);
590 close_nointr_nofail(signal_fd);
593 close_nointr_nofail(inotify_fd);
602 while ((p = hashmap_steal_first_key(files)))
607 if (previous_block_readahead_set) {
610 /* Restore the original kernel readahead setting if we
611 * changed it, and nobody has overwritten it since
/* Only restore if the current value is still our own 8 KiB setting;
 * otherwise someone else changed it and we leave it alone. */
613 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
614 block_set_readahead(root, previous_block_readahead);
/* Entry point for the "collect" verb: run pre-flight checks (writable
 * media, enough memory), register this process as the collector in the
 * shared memory segment, then run collect(). Return value lines are
 * elided in this chunk — presumably EXIT_SUCCESS/EXIT_FAILURE; confirm
 * against full source. */
620 int main_collect(const char *root) {
625 /* Skip this step on read-only media. Note that we check the
626 * underlying block device here, not the read-only flag of the
627 * file system on top, since that one is most likely mounted
628 * read-only anyway at boot, even if the underlying block
629 * device is theoretically writable. */
630 if (fs_on_read_only(root) > 0) {
631 log_info("Disabling readahead collector due to read-only media.");
636 log_info("Disabling readahead collector due to low memory.");
/* Publish our pid in the shared segment so collect() can later filter
 * out events caused by the replay process, and vice versa. */
640 shared = shared_get();
644 shared->collect = getpid();
/* Full barrier so the pid store is visible to the other process before
 * we start collecting. */
645 __sync_synchronize();
647 if (collect(root) < 0)