chiark - git - mdw - mLib/blob - test/bench.c

   1 /* -*-c-*-
   2  *
   3  * Benchmarking support
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <ctype.h>
  33 #include <errno.h>
  34 #include <limits.h>
  35 #include <math.h>
  36 #include <stdarg.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <time.h>
  41
  42 #include "alloc.h"
  43 #include "arena.h"
  44 #include "bench.h"
  45 #include "bits.h"
  46 #include "dstr.h"
  47 #include "linreg.h"
  48 #include "macros.h"
  49
  50 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
  51 #  include <cpuid.h>
  52 #  define CPUID_1D_TSC (1u << 4)
  53 #  define CPUID_1xD_TSCP (1u << 27)
  54 #  define USE_X86_RDTSC 1
  55 #endif
  56
  57 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
  58 #  include <sys/syscall.h>
  59 #  include <sys/types.h>
  60 #  include <unistd.h>
  61 #  include <linux/perf_event.h>
  62 #  ifdef HAVE_VALGRIND_VALGRIND_H
  63 #    include <valgrind/valgrind.h>
  64 #  endif
  65 #  define USE_LINUX_PERFEVENT 1
  66 #  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
  67 #    include <sys/mman.h>
  68 #    define USE_LINUX_PERFEVRDPMC 1
  69 #  endif
  70 #endif
  71
  72 /*----- Data structures ---------------------------------------------------*/
  73
  74 enum { CLK, CY, NTIMER };
  75
  76 struct timer {
  77   struct bench_timer _t;
  78   arena *a;
  79   const struct timer_ops *ops[NTIMER];  /* subtimers for clock and cycles */
  80   union {
  81 #ifdef USE_X86_RDTSC
  82     unsigned tscaux;                    /* `ia32_tsc_aux' for `ldtscp' */
  83 #endif
  84 #ifdef USE_LINUX_PERFEVENT
  85     int fd;                             /* vanilla `perf_event_open' */
  86 #endif
  87 #ifdef USE_LINUX_PERFEVRDPMC
  88     struct {                            /* `perf_event_open' with `rdpmc' */
  89       const volatile void *map; size_t sz; /* memory-mapped info */
  90       pid_t owner;                      /*   owning thread id */
  91     } pmc;
  92 #endif
  93   } u_cy;                               /* state for cycle measurement */
  94 };
  95
  96 struct timer_ops {
  97   const char *name;                     /* timer name */
  98   unsigned f;                           /* flags */
  99 /* ... @BTF_...OK@ flags */             /*   expected results */
 100 #define TF_SECRET 16u                   /*   don't try this automatically */
 101   int (*init)(struct timer */*t*/);     /* initialization function */
 102   int (*preflight)(struct timer */*t*/); /* preflight checks */
 103   int (*now)(struct timer */*t*/,       /* read current */
 104              struct bench_time */*t_out*/, unsigned /*f*/);
 105   void (*diff)(struct timer */*t*/,     /* difference */
 106                struct bench_timing */*t_inout*/,
 107                const struct bench_time */*t0*/,
 108                const struct bench_time */*t1*/);
 109   void (*teardown)(struct timer */*t*/); /* release held resources */
 110 };
 111
 112 /*----- Preliminaries -----------------------------------------------------*/
 113
 114 #define NS_PER_S 1000000000
 115
 116 /* --- @debug@ --- *
 117  *
 118  * Arguments:   @const char *fmt@ = format control string
 119  *              @...@ = format arguemnts
 120  *
 121  * Returns:     ---
 122  *
 123  * Use:         Maybe report a debugging message to standard error.
 124  */
 125
 126 static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
 127 {
 128   const char *p;
 129   va_list ap;
 130
 131   p = getenv("MLIB_BENCH_DEBUG");
 132   if (p && *p != 'n' && *p != '0') {
 133     va_start(ap, fmt);
 134     fputs("mLib BENCH: ", stderr);
 135     vfprintf(stderr, fmt, ap);
 136     fputc('\n', stderr);
 137     va_end(ap);
 138   }
 139 }
 140
 141 #ifdef HAVE_UINT64
 142 #  define FLOATK64(k) ((double)(k).i)
 143 #else
 144 #  define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
 145 #endif
 146
 147 /* --- @diff_ts@ --- *
 148  *
 149  * Arguments:   @struct timer *t@ = timer structure
 150  *              @struct bench_timing *delta_inout@ = where to put the result
 151  *              @const struct time *t0, *t1@ = two input times
 152  *
 153  * Returns:     ---
 154  *
 155  * Use:         Calculates a time difference for timers using the
 156  *              @struct timespec@-like time format.
 157  */
 158
 159 static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
 160                     const struct bench_time *t0, const struct bench_time *t1)
 161 {
 162   unsigned f = t0->f&t1->f;
 163   kludge64 delta_s;
 164   uint32 delta_ns;
 165
 166   if (f&BTF_TIMEOK) {
 167
 168     /* Calculate the integer differences in seconds and nanoseconds
 169      * independently.  To avoid underflow, though, add a second's worth of
 170      * nanoseconds which we'll subtract off later.
 171      */
 172     SUB64(delta_s, t1->t.ts.s, t0->t.ts.s);
 173     delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns;
 174
 175     /* Hack if they're both equal. */
 176     if (ZERO64(delta_s) && !delta_ns) delta_ns = 1;
 177
 178     /* And apply the nanoseconds difference.  To prevent underflow, pre-
 179      * emptively borrow one from the integer difference.
 180      */
 181     delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S;
 182
 183     /* Done. */
 184     delta_inout->f |= BTF_TIMEOK;
 185   }
 186 }
 187
 188 /* --- @diff_cycles@ --- *
 189  *
 190  * Arguments:   @struct timer *t@ = timer structure
 191  *              @struct bench_timing *delta_inout@ = where to put the result
 192  *              @const struct time *t0, *t1@ = two input times
 193  *
 194  * Returns:     ---
 195  *
 196  * Use:         Calculates a time difference for cycle-counting timers.
 197  */
 198
 199 static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
 200                         const struct bench_time *t0,
 201                         const struct bench_time *t1)
 202 {
 203   unsigned f = t0->f&t1->f;
 204   kludge64 delta_cy;
 205
 206   if (f&BTF_CYOK) {
 207     SUB64(delta_cy, t1->cy, t0->cy); delta_inout->cy = FLOATK64(delta_cy);
 208     if (!delta_inout->cy) delta_inout->cy = 1;
 209     delta_inout->f |= BTF_CYOK;
 210   }
 211 }
 212
 213 #undef FLOATK64
 214
 215 /* --- @normalize@ --- *
 216  *
 217  * Arguments:   @double *x_inout@ = address of a value to normalize
 218  *              @const char **unit_out@ = address to store unit prefix
 219  *              @double scale@ = scale factor for unit steps
 220  *
 221  * Returns:     ---
 222  *
 223  * Use:         Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@
 224  *              so that printing the two reflects the original value with an
 225  *              appropriate SI unit scaling.  The @scale@ should be 1024 for
 226  *              binary quantities, most notably memory sizes, or 1000 for
 227  *              other quantities.
 228  */
 229
 230 static void normalize(double *x_inout, const char **unit_out, double scale)
 231 {
 232   static const char
 233     *const nothing = "",
 234     *const big[] = { "k", "M", "G", "T", "P", "E", 0 },
 235     *const little[] = { "m", "µ", "n", "p", "f", "a", 0 };
 236   const char *const *u;
 237   double x = *x_inout, s;
 238
 239   if (x >= 0) s = +1.0;
 240   else { x = -x; s = -1.0; }
 241
 242   if (x && x < 1)
 243     for (u = little, x *= scale; x < 1 && u[1]; u++, x *= scale);
 244   else if (x >= scale)
 245     for (u = big, x /= scale; x >= scale && u[1]; u++, x /= scale);
 246   else
 247     u = &nothing;
 248
 249   *x_inout = s*x; *unit_out = *u;
 250 }
 251
 252 /*----- The null timer ----------------------------------------------------*/
 253
 254 /* This is a timer which does nothing, in case we don't have any better
 255  * ideas.
 256  */
 257
 258 static int null_init(struct timer *t) { return (0); }
 259 static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
 260   { return (0); }
 261 static int null_preflight(struct timer *t) { return (0); }
 262 static void null_diff(struct timer *t, struct bench_timing *delta_inout,
 263                       const struct bench_time *t0,
 264                       const struct bench_time *t1)
 265   { ; }
 266 static void null_teardown(struct timer *t) { ; }
 267
 268 static const struct timer_ops null_ops =
 269   { "null", 0,
 270     null_init, null_preflight, null_now, null_diff, null_teardown };
 271 #define NULL_ENT &null_ops,
 272
 273 /*----- The broken clock --------------------------------------------------*/
 274
 275 /* This is a cycle counter which does nothing, in case we don't have any
 276  * better ideas.
 277  */
 278
 279 static int broken_init(struct timer *t) { return (-1); }
 280
 281 static const struct timer_ops broken_ops =
 282   { "broken", TF_SECRET,
 283     broken_init, null_preflight, null_now, null_diff, null_teardown };
 284 #define BROKEN_ENT &broken_ops,
 285
 286 /*----- Linux performance counters ----------------------------------------*/
 287
 288 /* This is a cycle counter which uses the Linux performance event system,
 289  * which is probably the best choice if it's available.
 290  */
 291
 292 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
 293
 294 /* --- @perfevent_open@ --- *
 295  *
 296  * Arguments:   ---
 297  *
 298  * Returns:     File descriptor, or %$-1$%.
 299  *
 300  * Use:         Open a performance measurement descriptor set up to count CPU
 301  *              cycles.
 302  */
 303
 304 static int perfevent_open(void)
 305 {
 306   struct perf_event_attr attr = { 0 };
 307   int fd;
 308
 309   attr.type = PERF_TYPE_HARDWARE;
 310   attr.size = sizeof(attr);
 311   attr.config = PERF_COUNT_HW_CPU_CYCLES;
 312   attr.disabled = 0;
 313   attr.exclude_kernel = 1;
 314   attr.exclude_hv = 1;
 315
 316   fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
 317   if (fd < 0) {
 318     debug("couldn't open perf event: %s", strerror(errno));
 319     return (-1);
 320   }
 321
 322   return (fd);
 323 }
 324
 325 static int perfevent_now(struct timer *t,
 326                          struct bench_time *t_out, unsigned f)
 327 {
 328   ssize_t n;
 329
 330   n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
 331     if (n != sizeof(t_out->cy.i)) {
 332       debug("failed to read perf-event counter: %s", strerror(errno));
 333       return (0);
 334     }
 335   t_out->f |= BTF_CYOK; return (0);
 336 }
 337
 338 static void perfevent_teardown(struct timer *t)
 339   { close(t->u_cy.fd); }
 340
 341 static int perfevent_init(struct timer *t)
 342 {
 343   int fd = -1, rc;
 344
 345   fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
 346   t->u_cy.fd = fd; fd = -1; rc = 0;
 347 end:
 348   if (fd != -1) close(fd);
 349   return (rc);
 350 }
 351
 352 static const struct timer_ops perfevent_ops =
 353   { "linux-perf-read-hw-cycles", BTF_CYOK,
 354     perfevent_init, null_preflight, perfevent_now,
 355     diff_cycles, perfevent_teardown };
 356 #define PERFEVENT_VANILLA_CYENT &perfevent_ops,
 357
 358 #  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
 359
 360 /* Special syscall-free version for x86 using `rdpmc' instruction. *
 361  *
 362  * This is a bit weird because it does both kinds of measurement in a single
 363  * operation.
 364  */
 365
 366 static int perfevrdpmc_now(struct timer *t,
 367                            struct bench_time *t_out, unsigned f)
 368 {
 369   const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
 370   unsigned long long tsc = tsc, toff = toff, tenb = tenb;
 371   unsigned long long cy = cy, cyoff = cyoff;
 372   unsigned long long m, hi, lo;
 373   unsigned tshift = tshift, tmult = tmult, q0, q1, ff;
 374
 375   /* Repeat until we can complete this job without the buffer changing in the
 376    * middle.
 377    */
 378   q0 = map->lock;
 379   __atomic_thread_fence(__ATOMIC_ACQ_REL);
 380   for (;;) {
 381     ff = 0;
 382
 383     /* Read the passage-of-time information. */
 384     if (map->cap_user_time) {
 385       tenb = map->time_enabled;
 386       tsc = __builtin_ia32_rdtsc();
 387       tshift = map->time_shift;
 388       tmult = map->time_mult;
 389       toff = map->time_offset;
 390       ff |= BTF_TIMEOK;
 391     }
 392
 393     /* Read the performance-counter information. */
 394     if (map->cap_user_rdpmc) {
 395       cy = __builtin_ia32_rdpmc(map->index - 1);
 396       cyoff = map->offset;
 397       ff |= BTF_CYOK;
 398     }
 399
 400     /* Check the sequence number again. */
 401     __atomic_thread_fence(__ATOMIC_ACQ_REL);
 402     q1 = map->lock;
 403     if (q0 == q1) break;
 404     q0 = q1;
 405   }
 406
 407   if (ff&BTF_TIMEOK) {
 408     /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
 409      * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
 410      * nanoseconds.
 411      */
 412
 413     m = (1ull << tshift) - 1;
 414     hi = tsc >> tshift; lo = tsc&m;
 415     t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
 416     t_out->f |= BTF_TIMEOK;
 417   }
 418
 419   if (ff&BTF_CYOK) {
 420     /* We have the cycle count. */
 421
 422     t_out->cy.i = cy + cyoff;
 423     t_out->f |= BTF_CYOK;
 424   }
 425   return (0);
 426 }
 427
 428 static void perfevrdpmc_diff(struct timer *t,
 429                              struct bench_timing *delta_inout,
 430                              const struct bench_time *t0,
 431                              const struct bench_time *t1)
 432 {
 433   unsigned long long delta_ns;
 434   unsigned f = t0->f&t1->f;
 435
 436   if (f&BTF_TIMEOK) {
 437     delta_ns = t1->t.rawns.i - t0->t.rawns.i; if (!delta_ns) delta_ns = 1;
 438     delta_inout->t = delta_ns/(double)NS_PER_S;
 439     delta_inout->f |= BTF_TIMEOK;
 440   }
 441
 442   if (f&BTF_CYOK) {
 443     delta_inout->cy = t1->cy.i - t0->cy.i;
 444     if (!delta_inout->cy) delta_inout->cy = 1;
 445     delta_inout->f |= BTF_CYOK;
 446   }
 447 }
 448
 449 static void perfevrdpmc_unmap
 450   (const volatile struct perf_event_mmap_page *map, size_t mapsz)
 451   { if (map) munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz); }
 452
 453 static void perfevrdpmc_teardown(struct timer *t)
 454   { perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); }
 455
 456 static int perfevrdpmc_setup(struct timer *t)
 457 {
 458   const volatile struct perf_event_mmap_page *map = 0;
 459   int pgsz, mapsz = 0, fd = -1, rc;
 460
 461   /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
 462    * big a page is.
 463    */
 464   pgsz = sysconf(_SC_PAGESIZE);
 465     if (pgsz < 0) {
 466       debug("failed to discover page size!: %s", strerror(errno));
 467       rc = -1; goto end;
 468     }
 469
 470   /* Open the measurement descriptor and map it. */
 471   fd = perfevent_open(); if (!fd) return (-1);
 472   mapsz = 2*pgsz;
 473   map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
 474     if (map == MAP_FAILED) {
 475       debug("failed to map perf event: %s", strerror(errno));
 476       return (-1);
 477     }
 478
 479   t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0;
 480   t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0;
 481 end:
 482   if (fd != -1) close(fd);
 483   perfevrdpmc_unmap(map, mapsz);
 484   return (rc);
 485 }
 486
 487 static int perfevrdpmc_preflight(struct timer *t)
 488 {
 489   if (!t->u_cy.pmc.map) { debug("retry perf event map setup"); goto reopen; }
 490   if (t->u_cy.pmc.owner != syscall(SYS_gettid)) {
 491     debug("pid changed: reopen perf event map");
 492     perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
 493     t->u_cy.pmc.map = 0; goto reopen;
 494   }
 495   return (0);
 496
 497 reopen:
 498   if (perfevrdpmc_setup(t)) return (-1);
 499   return (0);
 500 }
 501
 502 static int perfevrdpmc_cyinit(struct timer *t)
 503 {
 504   unsigned a, b, c, d;
 505
 506 #  ifdef HAVE_VALGRIND_VALGRIND_H
 507   /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
 508   if (RUNNING_ON_VALGRIND) return (-1);
 509 #  endif
 510
 511   /* We need `rdtsc' to do the passage-of-time measurement. */
 512   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
 513     { debug("no `rdtsc' instrunction"); return (-1); }
 514
 515   /* Set things up. */
 516   if (perfevrdpmc_setup(t)) return (-1);
 517   return (0);
 518 }
 519
 520 static const struct timer_ops perfevrdpmc_cyops =
 521   { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK,
 522     perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now,
 523     perfevrdpmc_diff, perfevrdpmc_teardown };
 524
 525 static int perfevrdpmc_clkinit(struct timer *t)
 526 {
 527   if (t->ops[CY] != &perfevrdpmc_cyops) {
 528     debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
 529     return(-1);
 530   }
 531   return (0);
 532 }
 533
 534 static const struct timer_ops perfevrdpmc_clkops =
 535   { "linux-x86-perf-rdpmc-hw-cycles", 0,
 536     perfevrdpmc_clkinit, null_preflight, null_now,
 537     null_diff, null_teardown };
 538
 539 #    define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
 540 #    define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
 541
 542 #  else
 543 #    define PERFEVENT_RDPMC_CLKENT
 544 #    define PERFEVENT_RDPMC_CYENT
 545 #  endif
 546
 547 #  define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
 548 #  define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
 549 #else
 550 #  define PERFEVENT_CLKENT
 551 #  define PERFEVENT_CYENT
 552 #endif
 553
 554 /*----- Intel time-stamp counter ------------------------------------------*/
 555
 556 /* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 557  * really suitable for performance measurement because it gets confused by
 558  * CPU frequency adjustments.
 559  */
 560
 561 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
 562
 563 static int x86rdtsc_now(struct timer *t,
 564                         struct bench_time *t_out, unsigned f)
 565   { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }
 566
 567 static int x86rdtsc_init(struct timer *t)
 568 {
 569   unsigned a, b, c, d;
 570
 571   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
 572     { debug("no `rdtsc' instrunction"); return (-1); }
 573   t->u_cy.tscaux = ~0u;
 574   return (0);
 575 }
 576
 577 static int x86rdtscp_now(struct timer *t,
 578                          struct bench_time *t_out, unsigned f)
 579 {
 580   unsigned tscaux;
 581   unsigned long long n;
 582
 583   n = __builtin_ia32_rdtscp(&tscaux);
 584   if (!(f&BTF_T1))
 585     t->u_cy.tscaux = tscaux;
 586   else if (t->u_cy.tscaux != tscaux) {
 587     debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
 588           tscaux, t->u_cy.tscaux);
 589     return (-1);
 590   }
 591   t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
 592 }
 593
 594 static int x86rdtscp_init(struct timer *t)
 595 {
 596   unsigned a, b, c, d;
 597
 598   if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
 599     { debug("no `rdtscp' instrunction"); return (-1); }
 600   return (0);
 601 }
 602
 603 static const struct timer_ops x86rdtsc_ops =
 604   { "x86-rdtsc", BTF_CYOK,
 605     x86rdtsc_init, null_preflight, x86rdtsc_now,
 606     diff_cycles, null_teardown };
 607 static const struct timer_ops x86rdtscp_ops =
 608   { "x86-rdtscp", BTF_CYOK,
 609     x86rdtscp_init, null_preflight,
 610     x86rdtscp_now, diff_cycles, null_teardown };
 611
 612 #  define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
 613 #else
 614 #  define X86RDTSC_CYENT
 615 #endif
 616
 617 /*----- POSIX `clock_gettime' ---------------------------------------------*/
 618
 619 /* This is a real-time clock based on the POSIX time interface, with up to
 620  * nanosecond precision.
 621  */
 622
 623 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
 624
 625 static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
 626 {
 627   struct timespec now;
 628
 629   if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
 630     { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
 631   ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
 632   t_out->f |= BTF_TIMEOK; return (0);
 633 }
 634
 635 static const struct timer_ops gettime_ops =
 636   { "posix-thread-cputime", BTF_TIMEOK,
 637     null_init, null_preflight, gettime_now, diff_ts, null_teardown };
 638
 639 #  define GETTIME_CLKENT &gettime_ops,
 640 #else
 641 #  define GETTIME_CLKENT
 642 #endif
 643
 644 /*----- Standard C `clock' ------------------------------------------------*/
 645
 646 /* This is a real-time clock based on the C `clock' function which is
 647  * guaranteed to be available, though it's not likely to be very good.
 648  */
 649
 650 static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
 651 {
 652   clock_t now;
 653
 654   now = clock();
 655     if (now == (clock_t)-1) {
 656       debug("error reading standard clock: %s", strerror(errno));
 657       return (0);
 658     }
 659   t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
 660 }
 661
 662 static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
 663                         const struct bench_time *t0,
 664                         const struct bench_time *t1)
 665 {
 666   clock_t delta_clk;
 667   unsigned f = t0->f&t1->f;
 668
 669   if (f&BTF_TIMEOK) {
 670     delta_clk = t1->t.clk - t0->t.clk; if (!delta_clk) delta_clk = 1;
 671     delta_inout->t = delta_clk/(double)CLOCKS_PER_SEC;
 672     delta_inout->f |= BTF_TIMEOK;
 673   }
 674 }
 675
 676 static const struct timer_ops clock_ops =
 677   { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now,
 678     clock_diff, null_teardown };
 679
 680 #define CLOCK_CLKENT &clock_ops,
 681
 682 /*----- Timing setup ------------------------------------------------------*/
 683
 684 /* Tables of timing sources. */
 685 static const struct timer_ops
 686   *const clktab[] = { PERFEVENT_CLKENT
 687                       GETTIME_CLKENT
 688                       CLOCK_CLKENT
 689                       BROKEN_ENT
 690                       0 },
 691   *const cytab[] = { PERFEVENT_CYENT
 692                      X86RDTSC_CYENT
 693                      NULL_ENT
 694                      BROKEN_ENT
 695                      0 };
 696
 697 static const struct timertab {
 698   const char *what;
 699   const char *env;
 700   const struct timer_ops *const *opstab;
 701 } timertab[] = {
 702   { "clock",    "MLIB_BENCH_CLKTIMER",          clktab },
 703   { "cycle",    "MLIB_BENCH_CYCLETIMER",        cytab }
 704 };
 705
 706 /* --- @find_timer@ --- *
 707  *
 708  * Arguments:   @const char *name@ = timer name
 709  *              @size_t sz@ = length of name
 710  *              @unsigned tm@ = which subtimer we're looking for
 711  *
 712  * Returns:     The table entry matching the given name, or null if there
 713  *              isn't one.
 714  */
 715
 716 static const struct timer_ops *find_timer(const char *name, size_t sz,
 717                                           unsigned tm)
 718 {
 719   const struct timer_ops *const *tt;
 720
 721   for (tt = timertab[tm].opstab; *tt; tt++) {
 722     if (strlen((*tt)->name) == sz &&
 723         MEMCMP(name, ==, (*tt)->name, sz))
 724       return (*tt);
 725   }
 726   debug("%s timer `%.*s' not found",
 727         timertab[tm].what, (int)sz, name); return (0);
 728 }
 729
 730 /* --- @try_timer@ --- *
 731  *
 732  * Arguments:   @struct timer *t@ = timer structure
 733  *              @const struct timer_ops *ops@ = timer ops
 734  *              @unsigned tm@ = which subtimer we're setting
 735  *
 736  * Returns:     Zero on success, %$-1$% if timer failed.
 737  *
 738  * Use:         Tries to initialize the timer @t@, reporting a debug message
 739  *              if it worked.
 740  */
 741
 742 static int try_timer(struct timer *t,
 743                      const struct timer_ops *ops, unsigned tm)
 744 {
 745   struct bench_time t0, t1;
 746   struct bench_timing delta;
 747   int rc;
 748   unsigned f = 0;
 749 #define f_teardown 1u
 750
 751   if (ops->init(t)) { rc = -1; goto end; }
 752   f |= f_teardown;
 753
 754   if (ops->preflight(t)) { rc = -1; goto end; }
 755   t0.f = t1.f = 0;
 756   do {
 757     while (ops->now(t, &t0, BTF_T0));
 758   } while (ops->now(t, &t1, BTF_T1));
 759   delta.f = 0; ops->diff(t, &delta, &t0, &t1);
 760   if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; }
 761
 762   debug("selected %s timer `%s'", timertab[tm].what, ops->name);
 763   t->ops[tm] = ops; f &= ~f_teardown; rc = 0;
 764
 765 end:
 766   if (f&f_teardown) ops->teardown(t);
 767   return (rc);
 768
 769 #undef f_teardown
 770 }
 771
 772 /* --- @select_timer@ --- *
 773  *
 774  * Arguments:   @struct timer *t@ = timer structure
 775  *              @unsigned tm@ = which subtimer we're setting
 776  *              @const char *config@, @size_t sz@ = config string
 777  *
 778  * Returns:     Zero on success, %$-1$% if timer failed.
 779  *
 780  * Use:         Select a timer from the table.  If the environment variable
 781  *              is set, then parse a comma-separated list of timer names and
 782  *              use the first one listed that seems to work; otherwise, try
 783  *              the timers in the table in order.
 784  */
 785
 786 static int select_timer(struct timer *t, unsigned tm,
 787                         const char *config, size_t sz)
 788 {
 789   const char *p, *l;
 790   const struct timer_ops *ops, *const *tt;
 791
 792   if (!config) {
 793     for (tt = timertab[tm].opstab; *tt; tt++)
 794       if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
 795   } else {
 796     l = config + sz;
 797     for (;;) {
 798       p = memchr(config, ',', l - config); if (!p) p = l;
 799       ops = find_timer(config, p - config, tm);
 800       if (ops && !try_timer(t, ops, tm)) return (0);
 801       if (p >= l) break;
 802       config = p + 1;
 803     }
 804   }
 805   debug("no suitable %s timer found", timertab[tm].what); return (-1);
 806 }
 807
 808 /* Bench timer operations. */
 809 static void timer_describe(struct bench_timer *tm, dstr *d)
 810 {
 811   struct timer *t = (struct timer *)tm;
 812   unsigned i;
 813
 814   dstr_puts(d, "builtin: ");
 815   for (i = 0; i < NTIMER; i++) {
 816     if (i) dstr_puts(d, ", ");
 817     dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
 818   }
 819 }
 820
 821 static int timer_preflight(struct bench_timer *tm)
 822 {
 823   struct timer *t = (struct timer *)tm;
 824   unsigned i;
 825
 826   for (i = 0; i < NTIMER; i++) if (t->ops[i]->preflight(t)) return (-1);
 827   return (0);
 828 }
 829
 830 static int timer_now(struct bench_timer *tm,
 831                      struct bench_time *t_out, unsigned f)
 832 {
 833   struct timer *t = (struct timer *)tm;
 834   unsigned i;
 835
 836   t_out->f = 0;
 837   for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
 838   return (0);
 839 }
 840
 841 static void timer_diff(struct bench_timer *tm,
 842                        struct bench_timing *t_out,
 843                        const struct bench_time *t0,
 844                        const struct bench_time *t1)
 845 {
 846   struct timer *t = (struct timer *)tm;
 847   unsigned i;
 848
 849   t_out->f = 0;
 850   for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
 851 }
 852
 853 static void timer_destroy(struct bench_timer *tm)
 854 {
 855   struct timer *t = (struct timer *)tm;
 856   unsigned i;
 857
 858   if (!t) return;
 859   for (i = 0; i < NTIMER; i++)
 860     if (t->ops[i]) t->ops[i]->teardown(t);
 861   x_free(t->a, t);
 862 }
 863
 864 static const struct bench_timerops timer_ops =
 865   { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy };
 866
 867 /* --- @bench_createtimer@ --- *
 868  *
 869  * Arguments:   @const char *config@ = timer configuration string
 870  *
 871  * Returns:     A freshly constructed standard timer object.
 872  *
 873  * Use:         Allocate a timer.  Dispose of it by calling
 874  *              @tm->ops->destroy(tm)@ when you're done.
 875  *
 876  *              Applications should not set configuration strings except as
 877  *              established by user action, e.g., from a command-line option,
 878  *              environment variable, or configuration file.
 879  */
 880
 881 struct bench_timer *bench_createtimer(const char *config)
 882 {
 883   struct timer *t = 0;
 884   struct bench_timer *ret = 0;
 885   struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
 886   const struct timer_ops *const *tt;
 887   const char *p, *l; size_t n, nn;
 888   unsigned i;
 889
 890   /* Parse the configuration string. */
 891   if (config) {
 892
 893     /* The first thing to do is find the end of the string. */
 894     l = config + strlen(config);
 895
 896     for (;;) {
 897       /* Process the whitespace-sparated words of the string one by one. */
 898
 899       /* Skip over any initial whitespace.  If we hit the end of the string
 900        * then we're done.
 901        */
 902       for (;;)
 903         if (config >= l) goto done_config;
 904         else if (!ISSPACE(*config)) break;
 905         else config++;
 906
 907       /* There's definitely a word here.  Find the end of it. */
 908       for (p = config; p < l && !ISSPACE(*p); p++);
 909       nn = p - config;
 910
 911       /* Try various simple keywords. */
 912 #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
 913
 914       if (MATCHP("list")) {
 915         /* The `list' keyword requests lists of the available timer
 916          * implementations.
 917          */
 918
 919         for (i = 0; i < NTIMER; i++) {
 920           printf("%s timers:", timertab[i].what);
 921           for (tt = timertab[i].opstab; *tt; tt++)
 922             if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
 923           putchar('\n');
 924         }
 925         goto next_config;
 926       }
 927
 928 #undef MATCHP
 929
 930       /* Otherwise it's an assignment, setting a subtimer list. */
 931       p = memchr(config, '=', nn);
 932       if (!p)
 933         n = nn;
 934       else {
 935         n = p - config;
 936         for (i = 0; i < NTIMER; i++)
 937           if (STRNCMP(config, ==, timertab[i].what, n) &&
 938               !timertab[i].what[n]) {
 939             if (tmconf[i].p)
 940               debug("duplicate %s timer list", timertab[i].what);
 941             tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
 942             goto next_config;
 943           }
 944       }
 945       debug("unrecognized config keyword `%.*s'", (int)n, config);
 946
 947       /* Move on to the next word. */
 948     next_config:
 949       config += nn;
 950     }
 951   done_config:;
 952   }
 953
 954   /* Override these settings from the environment. */
 955   for (i = 0; i < NTIMER; i++) {
 956     p = getenv(timertab[i].env);
 957     if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
 958   }
 959
 960   /* All seems well.  Allocate the timer object. */
 961   XNEW(t); t->a = arena_global;
 962   for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
 963
 964   /* Try to set up the subtimers. */
 965   for (i = NTIMER; i--; )
 966     if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
 967
 968   /* All is done. */
 969   t->_t.ops = &timer_ops; t->_t.ref = 1; ret = &t->_t; t = 0;
 970 end:
 971   if (t) timer_destroy(&t->_t);
 972   return (ret);
 973 }
 974
 975 /*----- Benchmarking ------------------------------------------------------*/
 976
 977 /* --- @bench_init@ --- *
 978  *
 979  * Arguments:   @struct bench_state *b@ = bench state to initialize
 980  *              @struct bench_timer *tm@ = timer to attach, or null
 981  *
 982  * Returns:     Zero on success, %$-1$% on failure.
 983  *
 984  * Use:         Initialize the benchmark state.  On success, the timer state
 985  *              still needs to be calibrated (use @bench_calibrate@) before
 986  *              it can be used, but this will be done automatically by
 987  *              @bench_measure@ if it's not done by hand earlier.  The timer
 988  *              is now owned by the benchmark state and will be destroyed by
 989  *              @bench_destroy@.
 990  *
 991  *              The only reason for failure is if @tm@ was null on entry,
 992  *              and automatic construction of a timer failed.  The state is
 993  *              safe to discard, but calling @bench_destroy@ is safe too.
 994  */
 995
 996 int bench_init(struct bench_state *b, struct bench_timer *tm)
 997 {
 998   int rc;
 999
1000   b->tm = 0;
1001
1002   if (!tm) {
1003     tm = bench_createtimer(0);
1004     if (!tm) { rc = -1; goto end; }
1005   }
1006
1007   b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
1008 end:
1009   return (rc);
1010 }
1011
1012 /* --- @bench_destroy@ --- *
1013  *
1014  * Arguments:   @struct bench_state *b@ = bench state
1015  *
1016  * Returns:     ---
1017  *
1018  * Use:         Destroy the benchmark state, releasing the resources that it
1019  *              holds.
1020  */
1021
1022 void bench_destroy(struct bench_state *b)
1023   { if (b->tm && !--b->tm->ref) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
1024
1025 /* --- @spin@ --- *
1026  *
1027  * Arguments:   @unsigned long n@ = iteration count
1028  *              @void *ctx@ = context pointer (ignored)
1029  *
1030  * Returns:     ---
1031  *
1032  * Use:         Does nothing at all for @n@ iterations.  Used to calibrate
1033  *              the benchmarking state.
1034  */
1035
1036 static void spin(unsigned long n, void *ctx)
1037   { while (n--) RELAX; }
1038
1039 /* --- @bench_calibrate@ --- *
1040  *
1041  * Arguments:   @struct bench_state *b@ = bench state
1042  *              @unsigned f@ = calibration flags
1043  *
1044  * Returns:     Zero on success, %$-1$% if calibration failed.
1045  *
1046  * Use:         Calibrate the benchmark state, so that it can be used to
1047  *              measure performance reasonably accurately.
1048  *
1049  *              Calibration will take into account how the subject code is
1050  *              going to be located.  If you're going to use @BENCH_MEASURE@
1051  *              to measure a piece of literal code, then leave @f@ zero.  If
1052  *              the code to be measured is going to be executed via an
1053  *              indirect branch, e.g., through the @measure@ function, then
1054  *              set @BTF_INDIRECT@.
1055  */
1056
1057 #define T_CLB 0.0625                    /* calibration time limit */
1058
1059 int bench_calibrate(struct bench_state *b, unsigned f)
1060 {
1061   struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
1062   struct bench_timer *tm = b->tm;
1063   struct bench_timing delta;
1064   double n, r;
1065   unsigned i, tf = BTF_ANY;
1066   BENCH_TIMELOOP_DECLS;
1067   int rc;
1068
1069   /* The model here is that a timing loop has a fixed overhead as we enter
1070    * and leave (e.g., to do with the indirect branch into the code), and
1071    * per-iteration overheads as we check the counter and loop back.  We aim
1072    * to split these apart using linear regression.
1073    */
1074
1075   /* If we've already calibrated then there's nothing to do. */
1076   if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
1077
1078   /* Run the timer preflight check. */
1079   if (tm->ops->preflight(tm)) { rc = -1; goto end; }
1080
1081   /* Exercise the inner loop a few times to educate the branch predictor.
1082    * This is only useful if we're executing via an indirect call.
1083    */
1084   if (f&BTF_INDIRECT) {
1085     for (i = 0; i < 50; i++)
1086       BENCH_TIMELOOP_TAG(setup, b->tm, &delta, 10000, ;)
1087         LAUNDER(&spin)(_bench_n, 0);
1088   }
1089
1090   /* Now we measure idle loops until they take sufficiently long -- or we run
1091    * out of counter.
1092    */
1093   debug("calibrating...");
1094   n = 1.0;
1095   for (;;) {
1096
1097     /* Measure @n@ iterations of the idle loop. */
1098     if (f&BTF_INDIRECT)
1099       BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
1100         LAUNDER(&spin)(_bench_n, 0);
1101     else
1102       BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
1103         while (_bench_n--) RELAX;
1104     tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; }
1105
1106     /* Register the timings with the regression machinery. */
1107     linreg_update(&lr_clk, n, delta.t);
1108     if (!(tf&BTF_CYOK))
1109       debug("  n = %10.0f; t = %12g s", n, delta.t);
1110     else {
1111       linreg_update(&lr_cy, n, delta.cy);
1112       debug("  n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
1113     }
1114
1115     /* If we're done then stop. */
1116     if (delta.t >= T_CLB) break;
1117     if (n >= ULONG_MAX - n/3) break;
1118
1119     /* Update the counter and continue. */
1120     n += n/3.0 + 1.0;
1121   }
1122
1123   /* Now run the linear regression to extract the constant and per-iteration
1124    * overheads.
1125    */
1126   linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
1127   debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
1128   if (tf&BTF_CYOK) {
1129     linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
1130     debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
1131   }
1132
1133   /* We're done. */
1134   rc = 0;
1135 end:
1136   b->f |= tf | BTF_CLB;                 /* no point trying again */
1137   return (rc);
1138 }
1139
1140 /* --- @bench_preflight@ --- *
1141  *
1142  * Arguments:   @struct bench_state *b@ = benchmark state
1143  *
1144  * Returns:     Zero on success, %$-1$% on failure.
1145  *
1146  * Use:         Prepares for benchmarking on the current thread.  Current
1147  *              checks are that the timer is calibrated and that it can
1148  *              successfully measure time; the timer preflight is also run.
1149  *
1150  *              Users are unlikely to find this function useful: it's called
1151  *              automatically by the @BENCH_MEASURE@ macro and the
1152  *              @bench_measure@ function.
1153  */
1154
1155 int bench_preflight(struct bench_state *b)
1156 {
1157   struct bench_timer *tm = b->tm;
1158
1159   if (!(b->f&BTF_CLB)) return (-1);
1160   if (!(b->f&BTF_TIMEOK)) return (-1);
1161   if (tm->ops->preflight(tm)) return (-1);
1162   debug("measuring...");
1163   return (0);
1164 }
1165
1166 /* --- @bench_adapt@ --- *
1167  *
1168  * Arguments:   @double *n_inout@ = number of iterations, updated
1169  *              @double target_s@ = target time in seconds
1170  *              @const struct bench_timing *t@ = timing from the previous run
1171  *
1172  * Returns:     Nonzero if the measurement is sufficient; zero to run again.
1173  *
1174  * Use:         This function determines a suitable number of iterations of a
1175  *              benchmark function to perform next.  It is used in a loop
1176  *              such as the following.
1177  *
1178  *                      @double n = 1.0;@
1179  *                      @struct bench_timing t;@
1180  *
1181  *                      @do {@
1182  *                        (run @n@ iterations; set @t@ to the timing)
1183  *                      @} while (!bench_adapt(b, &n, &t));@
1184  *
1185  *              On entry, @*n_inout@ should be the number of iterations
1186  *              performed by the previous pass, and @*t@ the resulting time;
1187  *              the @BTF_TIMEOK@ flag must be set @t->f@.  If the timing is
1188  *              sufficient -- @t->t@ is sufficiently close to @target_s@
1189  *              -- then the function returns nonzero to indicate that
1190  *              measurement is complete.  Otherwise, it sets @*n_inout@ to a
1191  *              new, larger iteration count and returns zero to indicate that
1192  *              a further pass is necessary.
1193  */
1194
1195 int bench_adapt(double *n_inout, double target_s,
1196                 const struct bench_timing *t)
1197 {
1198   double n = *n_inout, nn;
1199
1200   /* Dump the results for debugging. */
1201   if (!(t->f&BTF_CYOK)) debug("  n = %10.0f; t = %12g", n, t->t);
1202   else debug("  n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy);
1203
1204   /* Suppose the timer loop %$n$% iterations in %$t$% seconds.  Our ideal
1205    * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
1206    * Otherwise, we need to scale up the iteration count.  The obvious next
1207    * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
1208    * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
1209    * progress.  We know that %$T/t > \sqrt{2}%, so this can only happen when
1210    * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
1211    * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
1212    * again with %$n' = n + 1$% iterations will very likely work.
1213    */
1214   if (t->t >= 0.707*target_s) return (1);
1215   nn = n*target_s/t->t; modf(nn, &nn);
1216   *n_inout = nn > n ? nn : n + 1;
1217   return (0);
1218 }
1219
1220 /* --- @bench_adjust@ --- *
1221  *
1222  * Arguments:   @struct bench_state *b@ = benchmark state
1223  *              @struct bench_timing *t_inout@ = timing to adjust
1224  *              @double n@ = number of external iterations performed
1225  *              @double base@ = number of internal operations per external
1226  *                      iteration
1227  *
1228  * Returns:     ---
1229  *
1230  * Use:         Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
1231  *              according to the calibration data captured in @b@.
1232  *              On exit, the timing data is updated, and @t->n@ is set to the
1233  *              product @n*base@.
1234  */
1235
1236 void bench_adjust(struct bench_state *b,
1237                   struct bench_timing *t_inout, double n, double base)
1238 {
1239
1240   /* Adjust according to the calibration. */
1241   t_inout->t -= n*b->clk.m + b->clk.c;
1242   if (t_inout->f&BTF_CYOK) t_inout->cy -= n*b->cy.m + b->cy.c;
1243
1244   /* Report the results, if debugging. */
1245   if (!(t_inout->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_inout->t);
1246   else debug("  adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy);
1247   if (!(t_inout->f&BTF_CYOK))
1248     debug("  %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t);
1249   else
1250     debug("  %g s (%g cy) per iter; %g iters/s",
1251           t_inout->t/n, t_inout->cy/n, n/t_inout->t);
1252
1253   /* All done. */
1254   t_inout->n = n*base;
1255 }
1256
1257 /* --- @bench_measure@ --- *
1258  *
1259  * Arguments:   @struct bench_state *b@ = benchmark state
1260  *              @struct bench_timing *t_out@ = where to leave the timing
1261  *              @double base@ = number of internal units per call
1262  *              @bench_fn *fn@, @void *ctx@ = benchmark function to run
1263  *
1264  * Returns:     Zero on success, %$-1$% if timing failed.
1265  *
1266  * Use:         Measure a function.  The function @fn@ is called adaptively
1267  *              with an iteration count @n@ set so as to run for
1268  *              approximately @b->target_s@ seconds.
1269  *
1270  *              The result is left in @*t_out@, with @t_out->n@ counting the
1271  *              final product of the iteration count and @base@ (which might,
1272  *              e.g., reflect the number of inner iterations the function
1273  *              performs, or the number of bytes it processes per iteration).
1274  *
1275  *              To get useful results, the benchmark state should have been
1276  *              calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
1277  */
1278
1279 int bench_measure(struct bench_state *b, struct bench_timing *t_out,
1280                   double base, bench_fn *fn, void *ctx)
1281 {
1282   BENCH_MEASURE_DECLS;
1283   int rc;
1284
1285   BENCH_MEASURE(b, rc, t_out, base) fn(_bench_n, ctx);
1286   return (rc);
1287 }
1288
1289 /*----- Reporting ---------------------------------------------------------*/
1290
1291 /* --- @bench_report@ --- *
1292  *
1293  * Arguments:   @const struct gprintf_ops *gops, void *gp@ = output formatter
1294  *              @unsigned unit@ = unit processed by the benchmark function
1295  *              @const struct bench_timing *t@ = benchmark result
1296  *
1297  * Returns:     ---
1298  *
1299  * Use:         Format, to the output identified by @gops@ and @go@, a
1300  *              human-readable report of the benchmarking result @t@.  No
1301  *              newline is appended.
1302  *
1303  *              The output format is subject to change in later versions.
1304  */
1305
1306 void bench_report(const struct gprintf_ops *gops, void *go,
1307                   unsigned unit, const struct bench_timing *t)
1308 {
1309   double scale, x, n = t->n;
1310   const char *u, *what, *whats;
1311
1312   assert(t->f&BTF_TIMEOK);
1313
1314   switch (unit) {
1315     case BTU_OP:
1316       gprintf(gops, go, "%.0f iterations ", n);
1317       what = "op"; whats = "ops"; scale = 1000;
1318       break;
1319     case BTU_BYTE:
1320       x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u);
1321       what = whats = "B"; scale = 1024;
1322       break;
1323     default:
1324       assert(0);
1325   }
1326
1327   x = t->t; normalize(&x, &u, 1000);
1328   gprintf(gops, go, "in %.3f %ss", x, u);
1329   if (t->f&BTF_CYOK) {
1330     x = t->cy; normalize(&x, &u, 1000);
1331     gprintf(gops, go, " (%.3f %scy)", x, u);
1332   }
1333   gprintf(gops, go, ": ");
1334
1335   x = n/t->t; normalize(&x, &u, scale);
1336     gprintf(gops, go, "%.3f %s%s/s", x, u, whats);
1337   x = t->t/n; normalize(&x, &u, 1000);
1338     gprintf(gops, go, ", %.3f %ss/%s", x, u, what);
1339   if (t->f&BTF_CYOK) {
1340     x = t->cy/n; normalize(&x, &u, 1000);
1341       gprintf(gops, go, " (%.3f %scy/%s)", x, u, what);
1342   }
1343 }
1344
1345 /*----- That's all, folks -------------------------------------------------*/