src/readahead/readahead-collect.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2010 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <inttypes.h>
  24 #include <fcntl.h>
  25 #include <linux/limits.h>
  26 #include <stdbool.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/select.h>
  31 #include <sys/time.h>
  32 #include <sys/types.h>
  33 #include <sys/stat.h>
  34 #include <unistd.h>
  35 #include <linux/fanotify.h>
  36 #include <sys/signalfd.h>
  37 #include <sys/poll.h>
  38 #include <sys/mman.h>
  39 #include <linux/fs.h>
  40 #include <linux/fiemap.h>
  41 #include <sys/ioctl.h>
  42 #include <sys/vfs.h>
  43 #include <getopt.h>
  44 #include <sys/inotify.h>
  45
  46 #include <systemd/sd-daemon.h>
  47
  48 #include "missing.h"
  49 #include "util.h"
  50 #include "set.h"
  51 #include "ioprio.h"
  52 #include "readahead-common.h"
  53 #include "virt.h"
  54
  55 /* fixme:
  56  *
  57  * - detect ssd on btrfs/lvm...
  58  * - read ahead directories
  59  * - gzip?
  60  * - remount rw?
  61  * - handle files where nothing is in mincore
  62  * - does ioprio_set work with fadvise()?
  63  */
  64
  65 static ReadaheadShared *shared = NULL;
  66
  67 /* Avoid collisions with the NULL pointer */
  68 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
  69 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
  70
  71 static int btrfs_defrag(int fd) {
  72         struct btrfs_ioctl_vol_args data;
  73
  74         zero(data);
  75         data.fd = fd;
  76
  77         return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
  78 }
  79
  80 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
  81         struct stat st;
  82         void *start = MAP_FAILED;
  83         uint8_t *vec;
  84         uint32_t b, c;
  85         uint64_t inode;
  86         size_t l, pages;
  87         bool mapped;
  88         int r = 0, fd = -1, k;
  89
  90         assert(pack);
  91         assert(fn);
  92
  93         fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
  94         if (fd < 0) {
  95
  96                 if (errno == ENOENT)
  97                         return 0;
  98
  99                 if (errno == EPERM || errno == EACCES)
 100                         return 0;
 101
 102                 log_warning("open(%s) failed: %m", fn);
 103                 r = -errno;
 104                 goto finish;
 105         }
 106
 107         k = file_verify(fd, fn, arg_file_size_max, &st);
 108         if (k <= 0) {
 109                 r = k;
 110                 goto finish;
 111         }
 112
 113         if (on_btrfs)
 114                 btrfs_defrag(fd);
 115
 116         l = PAGE_ALIGN(st.st_size);
 117         start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
 118         if (start == MAP_FAILED) {
 119                 log_warning("mmap(%s) failed: %m", fn);
 120                 r = -errno;
 121                 goto finish;
 122         }
 123
 124         pages = l / page_size();
 125         vec = alloca(pages);
 126         memset(vec, 0, pages);
 127         if (mincore(start, l, vec) < 0) {
 128                 log_warning("mincore(%s) failed: %m", fn);
 129                 r = -errno;
 130                 goto finish;
 131         }
 132
 133         fputs(fn, pack);
 134         fputc('\n', pack);
 135
 136         /* Store the inode, so that we notice when the file is deleted */
 137         inode = (uint64_t) st.st_ino;
 138         fwrite(&inode, sizeof(inode), 1, pack);
 139
 140         mapped = false;
 141         for (c = 0; c < pages; c++) {
 142                 bool new_mapped = !!(vec[c] & 1);
 143
 144                 if (!mapped && new_mapped)
 145                         b = c;
 146                 else if (mapped && !new_mapped) {
 147                         fwrite(&b, sizeof(b), 1, pack);
 148                         fwrite(&c, sizeof(c), 1, pack);
 149
 150                         log_debug("%s: page %u to %u", fn, b, c);
 151                 }
 152
 153                 mapped = new_mapped;
 154         }
 155
 156         /* We don't write any range data if we should read the entire file */
 157         if (mapped && b > 0) {
 158                 fwrite(&b, sizeof(b), 1, pack);
 159                 fwrite(&c, sizeof(c), 1, pack);
 160
 161                 log_debug("%s: page %u to %u", fn, b, c);
 162         }
 163
 164         /* End marker */
 165         b = 0;
 166         fwrite(&b, sizeof(b), 1, pack);
 167         fwrite(&b, sizeof(b), 1, pack);
 168
 169 finish:
 170         if (start != MAP_FAILED)
 171                 munmap(start, l);
 172
 173         if (fd >= 0)
 174                 close_nointr_nofail(fd);
 175
 176         return r;
 177 }
 178
 179 static unsigned long fd_first_block(int fd) {
 180         struct {
 181                 struct fiemap fiemap;
 182                 struct fiemap_extent extent;
 183         } data;
 184
 185         zero(data);
 186         data.fiemap.fm_length = ~0ULL;
 187         data.fiemap.fm_extent_count = 1;
 188
 189         if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
 190                 return 0;
 191
 192         if (data.fiemap.fm_mapped_extents <= 0)
 193                 return 0;
 194
 195         if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
 196                 return 0;
 197
 198         return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
 199 }
 200
 201 struct item {
 202         const char *path;
 203         unsigned long block;
 204 };
 205
 206 static int qsort_compare(const void *a, const void *b) {
 207         const struct item *i, *j;
 208
 209         i = a;
 210         j = b;
 211
 212         if (i->block < j->block)
 213                 return -1;
 214         if (i->block > j->block)
 215                 return 1;
 216
 217         return strcmp(i->path, j->path);
 218 }
 219
 220 static int collect(const char *root) {
 221         enum {
 222                 FD_FANOTIFY,  /* Get the actual fs events */
 223                 FD_SIGNAL,
 224                 FD_INOTIFY,   /* We get notifications to quit early via this fd */
 225                 _FD_MAX
 226         };
 227         struct pollfd pollfd[_FD_MAX];
 228         int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
 229         pid_t my_pid;
 230         Hashmap *files = NULL;
 231         Iterator i;
 232         char *p, *q;
 233         sigset_t mask;
 234         FILE *pack = NULL;
 235         char *pack_fn_new = NULL, *pack_fn = NULL;
 236         bool on_ssd, on_btrfs;
 237         struct statfs sfs;
 238         usec_t not_after;
 239         uint64_t previous_block_readahead;
 240         bool previous_block_readahead_set = false;
 241
 242         assert(root);
 243
 244         if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
 245                 r = log_oom();
 246                 goto finish;
 247         }
 248
 249         /* If there's no pack file yet we lower the kernel readahead
 250          * so that mincore() is accurate. If there is a pack file
 251          * already we assume it is accurate enough so that kernel
 252          * readahead is never triggered. */
 253         previous_block_readahead_set =
 254                 access(pack_fn, F_OK) < 0 &&
 255                 block_get_readahead(root, &previous_block_readahead) >= 0 &&
 256                 block_set_readahead(root, 8*1024) >= 0;
 257
 258         if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
 259                 log_warning("Failed to set IDLE IO priority class: %m");
 260
 261         assert_se(sigemptyset(&mask) == 0);
 262         sigset_add_many(&mask, SIGINT, SIGTERM, -1);
 263         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
 264
 265         if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
 266                 log_error("signalfd(): %m");
 267                 r = -errno;
 268                 goto finish;
 269         }
 270
 271         if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
 272                 log_error("Failed to allocate set.");
 273                 r = -ENOMEM;
 274                 goto finish;
 275         }
 276
 277         if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0)  {
 278                 log_error("Failed to create fanotify object: %m");
 279                 r = -errno;
 280                 goto finish;
 281         }
 282
 283         if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
 284                 log_error("Failed to mark %s: %m", root);
 285                 r = -errno;
 286                 goto finish;
 287         }
 288
 289         if ((inotify_fd = open_inotify()) < 0) {
 290                 r = inotify_fd;
 291                 goto finish;
 292         }
 293
 294         not_after = now(CLOCK_MONOTONIC) + arg_timeout;
 295
 296         my_pid = getpid();
 297
 298         zero(pollfd);
 299         pollfd[FD_FANOTIFY].fd = fanotify_fd;
 300         pollfd[FD_FANOTIFY].events = POLLIN;
 301         pollfd[FD_SIGNAL].fd = signal_fd;
 302         pollfd[FD_SIGNAL].events = POLLIN;
 303         pollfd[FD_INOTIFY].fd = inotify_fd;
 304         pollfd[FD_INOTIFY].events = POLLIN;
 305
 306         sd_notify(0,
 307                   "READY=1\n"
 308                   "STATUS=Collecting readahead data");
 309
 310         log_debug("Collecting...");
 311
 312         if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
 313                 log_debug("Collection canceled");
 314                 r = -ECANCELED;
 315                 goto finish;
 316         }
 317
 318         if (access("/run/systemd/readahead/done", F_OK) >= 0) {
 319                 log_debug("Got termination request");
 320                 goto done;
 321         }
 322
 323         for (;;) {
 324                 union {
 325                         struct fanotify_event_metadata metadata;
 326                         char buffer[4096];
 327                 } data;
 328                 ssize_t n;
 329                 struct fanotify_event_metadata *m;
 330                 usec_t t;
 331                 int h;
 332
 333                 if (hashmap_size(files) > arg_files_max) {
 334                         log_debug("Reached maximum number of read ahead files, ending collection.");
 335                         break;
 336                 }
 337
 338                 t = now(CLOCK_MONOTONIC);
 339                 if (t >= not_after) {
 340                         log_debug("Reached maximum collection time, ending collection.");
 341                         break;
 342                 }
 343
 344                 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
 345
 346                         if (errno == EINTR)
 347                                 continue;
 348
 349                         log_error("poll(): %m");
 350                         r = -errno;
 351                         goto finish;
 352                 }
 353
 354                 if (h == 0) {
 355                         log_debug("Reached maximum collection time, ending collection.");
 356                         break;
 357                 }
 358
 359                 if (pollfd[FD_SIGNAL].revents) {
 360                         log_debug("Got signal.");
 361                         break;
 362                 }
 363
 364                 if (pollfd[FD_INOTIFY].revents) {
 365                         uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
 366                         struct inotify_event *e;
 367
 368                         if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
 369                                 if (errno == EINTR || errno == EAGAIN)
 370                                         continue;
 371
 372                                 log_error("Failed to read inotify event: %m");
 373                                 r = -errno;
 374                                 goto finish;
 375                         }
 376
 377                         e = (struct inotify_event*) inotify_buffer;
 378                         while (n > 0) {
 379                                 size_t step;
 380
 381                                 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
 382                                         log_debug("Collection canceled");
 383                                         r = -ECANCELED;
 384                                         goto finish;
 385                                 }
 386
 387                                 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
 388                                         log_debug("Got termination request");
 389                                         goto done;
 390                                 }
 391
 392                                 step = sizeof(struct inotify_event) + e->len;
 393                                 assert(step <= (size_t) n);
 394
 395                                 e = (struct inotify_event*) ((uint8_t*) e + step);
 396                                 n -= step;
 397                         }
 398                 }
 399
 400                 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
 401
 402                         if (errno == EINTR || errno == EAGAIN)
 403                                 continue;
 404
 405                         /* fanotify sometimes returns EACCES on read()
 406                          * where it shouldn't. For now let's just
 407                          * ignore it here (which is safe), but
 408                          * eventually this should be
 409                          * dropped when the kernel is fixed.
 410                          *
 411                          * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
 412                         if (errno == EACCES)
 413                                 continue;
 414
 415                         log_error("Failed to read event: %m");
 416                         r = -errno;
 417                         goto finish;
 418                 }
 419
 420                 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
 421                         char fn[PATH_MAX];
 422                         int k;
 423
 424                         if (m->fd < 0)
 425                                 goto next_iteration;
 426
 427                         if (m->pid == my_pid)
 428                                 goto next_iteration;
 429
 430                         __sync_synchronize();
 431                         if (m->pid == shared->replay)
 432                                 goto next_iteration;
 433
 434                         snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
 435                         char_array_0(fn);
 436
 437                         if ((k = readlink_malloc(fn, &p)) >= 0) {
 438                                 if (startswith(p, "/tmp") ||
 439                                     endswith(p, " (deleted)") ||
 440                                     hashmap_get(files, p))
 441                                         /* Not interesting, or
 442                                          * already read */
 443                                         free(p);
 444                                 else {
 445                                         unsigned long ul;
 446
 447                                         ul = fd_first_block(m->fd);
 448
 449                                         if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
 450                                                 log_warning("set_put() failed: %s", strerror(-k));
 451                                                 free(p);
 452                                         }
 453                                 }
 454
 455                         } else
 456                                 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
 457
 458                 next_iteration:
 459                         if (m->fd)
 460                                 close_nointr_nofail(m->fd);
 461                 }
 462         }
 463
 464 done:
 465         if (fanotify_fd >= 0) {
 466                 close_nointr_nofail(fanotify_fd);
 467                 fanotify_fd = -1;
 468         }
 469
 470         log_debug("Writing Pack File...");
 471
 472         on_ssd = fs_on_ssd(root) > 0;
 473         log_debug("On SSD: %s", yes_no(on_ssd));
 474
 475         on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
 476         log_debug("On btrfs: %s", yes_no(on_btrfs));
 477
 478         if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
 479                 r = log_oom();
 480                 goto finish;
 481         }
 482
 483         pack = fopen(pack_fn_new, "we");
 484         if (!pack) {
 485                 log_error("Failed to open pack file: %m");
 486                 r = -errno;
 487                 goto finish;
 488         }
 489
 490         fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
 491         putc(on_ssd ? 'S' : 'R', pack);
 492
 493         if (on_ssd || on_btrfs) {
 494
 495                 /* On SSD or on btrfs, just write things out in the
 496                  * order the files were accessed. */
 497
 498                 HASHMAP_FOREACH_KEY(q, p, files, i)
 499                         pack_file(pack, p, on_btrfs);
 500         } else {
 501                 struct item *ordered, *j;
 502                 unsigned k, n;
 503
 504                 /* On rotating media, order things by the block
 505                  * numbers */
 506
 507                 log_debug("Ordering...");
 508
 509                 n = hashmap_size(files);
 510                 if (!(ordered = new(struct item, n))) {
 511                         r = log_oom();
 512                         goto finish;
 513                 }
 514
 515                 j = ordered;
 516                 HASHMAP_FOREACH_KEY(q, p, files, i) {
 517                         j->path = p;
 518                         j->block = PTR_TO_SECTOR(q);
 519                         j++;
 520                 }
 521
 522                 assert(ordered + n == j);
 523
 524                 qsort(ordered, n, sizeof(struct item), qsort_compare);
 525
 526                 for (k = 0; k < n; k++)
 527                         pack_file(pack, ordered[k].path, on_btrfs);
 528
 529                 free(ordered);
 530         }
 531
 532         log_debug("Finalizing...");
 533
 534         fflush(pack);
 535
 536         if (ferror(pack)) {
 537                 log_error("Failed to write pack file.");
 538                 r = -EIO;
 539                 goto finish;
 540         }
 541
 542         if (rename(pack_fn_new, pack_fn) < 0) {
 543                 log_error("Failed to rename readahead file: %m");
 544                 r = -errno;
 545                 goto finish;
 546         }
 547
 548         fclose(pack);
 549         pack = NULL;
 550
 551         log_debug("Done.");
 552
 553 finish:
 554         if (fanotify_fd >= 0)
 555                 close_nointr_nofail(fanotify_fd);
 556
 557         if (signal_fd >= 0)
 558                 close_nointr_nofail(signal_fd);
 559
 560         if (inotify_fd >= 0)
 561                 close_nointr_nofail(inotify_fd);
 562
 563         if (pack) {
 564                 fclose(pack);
 565                 unlink(pack_fn_new);
 566         }
 567         free(pack_fn_new);
 568         free(pack_fn);
 569
 570         while ((p = hashmap_steal_first_key(files)))
 571                 free(p);
 572
 573         hashmap_free(files);
 574
 575         if (previous_block_readahead_set) {
 576                 uint64_t bytes;
 577
 578                 /* Restore the original kernel readahead setting if we
 579                  * changed it, and nobody has overwritten it since
 580                  * yet. */
 581                 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
 582                         block_set_readahead(root, previous_block_readahead);
 583         }
 584
 585         return r;
 586 }
 587
 588 int main_collect(const char *root) {
 589
 590         if (!root)
 591                 root = "/";
 592
 593         /* Skip this step on read-only media. Note that we check the
 594          * underlying block device here, not he read-only flag of the
 595          * file system on top, since that one is most likely mounted
 596          * read-only anyway at boot, even if the underlying block
 597          * device is theoretically writable. */
 598         if (fs_on_read_only(root) > 0) {
 599                 log_info("Disabling readahead collector due to read-only media.");
 600                 return EXIT_SUCCESS;
 601         }
 602
 603         if (!enough_ram()) {
 604                 log_info("Disabling readahead collector due to low memory.");
 605                 return EXIT_SUCCESS;
 606         }
 607
 608         shared = shared_get();
 609         if (!shared)
 610                 return EXIT_FAILURE;
 611
 612         shared->collect = getpid();
 613         __sync_synchronize();
 614
 615         if (collect(root) < 0)
 616                 return EXIT_FAILURE;
 617
 618         return EXIT_SUCCESS;
 619 }