chiark - git - mdw - mLib/blob - test/bench.c

   1 /* -*-c-*-
   2  *
   3  * Benchmarking support
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <errno.h>
  33 #include <stdarg.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <string.h>
  37 #include <time.h>
  38
  39 #include "alloc.h"
  40 #include "bench.h"
  41 #include "bits.h"
  42 #include "linreg.h"
  43 #include "macros.h"
  44
  45 /*----- Data structures ---------------------------------------------------*/
  46
  47 struct timer {
  48   struct bench_timer _t;
  49   const struct timer_ops *clkops, *cyops; /* time and cycle measurements */
  50   union { int fd; } u_cy;               /* state for cycle measurement */
  51 };
  52
  53 struct timer_ops {
  54   void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
  55   void (*teardown)(struct timer *t);    /* release held resources */
  56 };
  57
  58 /*----- Preliminaries -----------------------------------------------------*/
  59
  60 #define NS_PER_S 1000000000
  61
  62 /* --- @debug@ --- *
  63  *
  64  * Arguments:   @const char *fmt@ = format control string
  65  *              @...@ = format arguemnts
  66  *
  67  * Returns:     ---
  68  *
  69  * Use:         Maybe report a debugging message to standard error.
  70  */
  71
  72 static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
  73 {
  74   const char *p;
  75   va_list ap;
  76
  77   p = getenv("MLIB_BENCH_DEBUG");
  78   if (p && *p != 'n' && *p != '0') {
  79     va_start(ap, fmt);
  80     fputs("mLib BENCH: ", stderr);
  81     vfprintf(stderr, fmt, ap);
  82     fputc('\n', stderr);
  83     va_end(ap);
  84   }
  85 }
  86
  87 /* --- @timer_diff@ --- *
  88  *
  89  * Arguments:   @struct bench_timing *delta_out@ = where to putt the result
  90  *              @const struct bench_time *t0, *t1@ = two times captured by a
  91  *                      timer's @now@ function
  92  *
  93  * Returns:     ---
  94  *
  95  * Use:         Calculates the difference between two captured times.  The
  96  *              flags are set according to whether the differences are
  97  *              meaningful; @delta_out->n@ is left unset.
  98  */
  99
 100 static void timer_diff(struct bench_timing *delta_out,
 101                        const struct bench_time *t0,
 102                        const struct bench_time *t1)
 103 {
 104   unsigned f = t0->f&t1->f;
 105   kludge64 k;
 106
 107 #ifdef HAVE_UINT64
 108 #  define FLOATK64(k) ((double)(k).i)
 109 #else
 110 #  define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi)
 111 #endif
 112
 113   if (!(f&BTF_TIMEOK))
 114     delta_out->t = 0.0;
 115   else {
 116     SUB64(k, t1->s, t0->s);
 117     delta_out->t = FLOATK64(k) - 1 +
 118       (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
 119   }
 120
 121   if (!(f&BTF_CYOK))
 122     delta_out->cy = 0.0;
 123   else {
 124     SUB64(k, t1->cy, t0->cy);
 125     delta_out->cy = FLOATK64(k);
 126   }
 127
 128   delta_out->f = f;
 129
 130 #undef FLOATK64
 131 }
 132
 133 /*----- The null clock ----------------------------------------------------*/
 134
 135 /* This is a cycle counter which does nothing, in case we don't have any
 136  * better ideas.
 137  */
 138
 139 static void null_now(struct bench_time *t_out, struct timer *t) { ; }
 140 static void null_teardown(struct timer *t) { ; }
 141 static const struct timer_ops null_ops = { null_now, null_teardown };
 142
 143 static int null_cyinit(struct timer *t)
 144   { t->cyops = &null_ops; return (0); }
 145
 146 #define NULL_CYENT { "null", null_cyinit },
 147
 148 /*----- Linux performance counters ----------------------------------------*/
 149
 150 /* This is a cycle counter which uses the Linux performance event system,
 151  * which is probably the best choice if it's available.
 152  */
 153
 154 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
 155
 156 #include <sys/types.h>
 157 #include <unistd.h>
 158
 159 #include <linux/perf_event.h>
 160 #include <asm/unistd.h>
 161
 162 static void perfevent_now(struct bench_time *t_out, struct timer *t)
 163 {
 164   ssize_t n;
 165
 166   n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
 167     if (n != sizeof(t_out->cy.i)) {
 168       debug("failed to read perf-event counter: %s", strerror(errno));
 169       return;
 170     }
 171   t_out->f |= BTF_CYOK;
 172 }
 173
 174 static void perfevent_teardown(struct timer *t)
 175   { close(t->u_cy.fd); }
 176
 177 static const struct timer_ops perfevent_ops =
 178   { perfevent_now, perfevent_teardown };
 179
 180 static int perfevent_init(struct timer *t)
 181 {
 182   struct perf_event_attr attr = { 0 };
 183   struct bench_time tm;
 184
 185   attr.type = PERF_TYPE_HARDWARE;
 186   attr.size = sizeof(attr);
 187   attr.config = PERF_COUNT_HW_CPU_CYCLES;
 188   attr.disabled = 0;
 189   attr.exclude_kernel = 1;
 190   attr.exclude_hv = 1;
 191
 192   t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 193   if (t->u_cy.fd < 0) {
 194     debug("couldn't open perf evvent: %s", strerror(errno));
 195     return (-1);
 196   }
 197
 198   tm.f = 0; perfevent_now(&tm, t);
 199   if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }
 200
 201   t->cyops = &perfevent_ops; return (0);
 202 }
 203 #  define PERFEVENT_CYENT { "linux-perf-event", perfevent_init },
 204 #else
 205 #  define PERFEVENT_CYENT
 206 #endif
 207
 208 /*----- Intel time-stamp counter ------------------------------------------*/
 209
 210 /* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 211  * really suitable for performance measurement because it gets confused by
 212  * CPU frequency adjustments.
 213  */
 214
 215 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 216
 217 #define EFLAGS_ID (1u << 21)
 218 #define CPUID_1D_TSC (1u << 4)
 219
 220 static uint32 set_flags(unsigned long m, unsigned long x)
 221 {
 222   unsigned long r;
 223
 224 #ifdef __x86_64__
 225 #  define TMP "%%rcx"
 226 #else
 227 #  define TMP "%%ecx"
 228 #endif
 229
 230   __asm__ ("pushf\n\t"
 231            "pop %0\n\t"
 232            "mov %0, " TMP "\n\t"
 233            "and %1, %0\n\t"
 234            "xor %2, %0\n\t"
 235            "push %0\n\t"
 236            "popf\n\t"
 237            "pushf\n\t"
 238            "pop %0\n\t"
 239            "push " TMP "\n\t"
 240            "popf"
 241            : "=r"(r)
 242            : "g"(m), "g"(x)
 243            : "ecx");
 244   return (r);
 245 }
 246
 247 struct cpuid { uint32 a, b, c, d; };
 248
 249 static void cpuid(struct cpuid *info_out, uint32 a, uint32 c)
 250 {
 251   __asm__ ("movl %1, %%eax\n\t"
 252            "movl %2, %%ecx\n\t"
 253            "cpuid\n\t"
 254            "movl %%eax, 0(%0)\n\t"
 255            "movl %%ebx, 4(%0)\n\t"
 256            "movl %%ecx, 8(%0)\n\t"
 257            "movl %%edx, 12(%0)\n\t"
 258            : /* no outputs */
 259            : "r"(info_out), "g"(a), "g"(c)
 260            : "eax", "ebx", "ecx", "edx", "cc");
 261 }
 262
 263 static void x86rdtsc_now(struct bench_time *t_out, struct timer *t)
 264 {
 265   uint32 lo, hi;
 266
 267   __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
 268   SET64(t_out->cy, hi, lo); t_out->f |= BTF_CYOK;
 269 }
 270
 271 static const struct timer_ops x86rdtsc_ops =
 272   { x86rdtsc_now, null_teardown };
 273
 274 static int x86rdtsc_init(struct timer *t)
 275 {
 276   struct cpuid info;
 277
 278   if ((set_flags(~EFLAGS_ID, 0)&EFLAGS_ID) ||
 279       !(set_flags(~EFLAGS_ID, EFLAGS_ID)&EFLAGS_ID))
 280     { debug("no `cpuid' instruction"); return (-1); }
 281   cpuid(&info, 0, 0);
 282   if (info.a < 1) { debug("no `cpuid' leaf 1"); return (-1); }
 283   cpuid(&info, 1, 0);
 284   if (!(info.d&CPUID_1D_TSC))
 285     { debug("no `rdtsc' instrunction"); return (-1); }
 286   t->cyops = &x86rdtsc_ops; return (0);
 287 }
 288
 289 #  define X86RDTSC_CYENT { "x86-rdtsc", x86rdtsc_init },
 290 #else
 291 #  define X86RDTWC_CYENT
 292 #endif
 293
 294 /*----- POSIX `clock_gettime' ---------------------------------------------*/
 295
 296 /* This is a real-time clock based on the POSIX time interface, with up to
 297  * nanosecond precision.
 298  */
 299
 300 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
 301
 302 static void gettime_now(struct bench_time *t_out, struct timer *t)
 303 {
 304   struct timespec now;
 305
 306   if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
 307     { debug("error reading POSIX clock: %s", strerror(errno)); return; }
 308   ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec;
 309   t_out->f |= BTF_TIMEOK;
 310 }
 311
 312 static const struct timer_ops gettime_ops = { gettime_now, null_teardown };
 313
 314 static int gettime_init(struct timer *t)
 315 {
 316   struct bench_time tm;
 317
 318   tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
 319   t->clkops = &gettime_ops; return (0);
 320 }
 321
 322 #  define GETTIME_CLKENT { "posix-clock_gettime", gettime_init },
 323 #else
 324 #  define GETTIME_CLKENT
 325 #endif
 326
 327 /*----- Standard C `clock' ------------------------------------------------*/
 328
 329 /* This is a real-time clock based on the C `clock' function which is
 330  * guaranteed to be available, though it's not likely to be very good.
 331  */
 332
 333 static void clock_now(struct bench_time *t_out, struct timer *t)
 334 {
 335   clock_t now, x;
 336   unsigned long s; uint32 ns;
 337
 338   now = clock();
 339     if (now == (clock_t)-1) {
 340       debug("error reading standard clock: %s", strerror(errno));
 341       return;
 342     }
 343   x = now/CLOCKS_PER_SEC;
 344     if (x > ULONG_MAX) { debug("standard clock out of range"); return; }
 345
 346   s = x; x = now - CLOCKS_PER_SEC*s;
 347   if (!(NS_PER_S%CLOCKS_PER_SEC))
 348     ns = x*(NS_PER_S/CLOCKS_PER_SEC);
 349   else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC)
 350     ns = (x*NS_PER_S)/CLOCKS_PER_SEC;
 351   else
 352     ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC);
 353   ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK;
 354 }
 355
 356 static const struct timer_ops clock_ops = { clock_now, null_teardown };
 357
 358 static int clock_init(struct timer *t)
 359 {
 360   struct bench_time tm;
 361
 362   tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
 363   t->clkops = &clock_ops; return (0);
 364 }
 365
 366 #define CLOCK_CLKENT { "clock", clock_init },
 367
 368 /*----- Timing setup ------------------------------------------------------*/
 369
 370 /* Tables of timing sources. */
 371 static const struct timerent {
 372   const char *name;
 373   int (*init)(struct timer */*t*/);
 374 }
 375   clktab[] = { GETTIME_CLKENT CLOCK_CLKENT { 0, 0 } },
 376   cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_CYENT { 0, 0 } };
 377
 378 /* --- @find_timer@ --- *
 379  *
 380  * Arguments:   @const char *name@ = timer name
 381  *              @size_t sz@ = length of name
 382  *              @const struct timerent *timers@ = table to search
 383  *              @const char *what@ = adjective describing table
 384  *
 385  * Returns:     The table entry matching the given name, or null if there
 386  *              isn't one.
 387  */
 388
 389 static const struct timerent *find_timer_n(const char *name, size_t sz,
 390                                            const struct timerent *timers,
 391                                            const char *what)
 392 {
 393   while (timers->name) {
 394     if (strlen(timers->name) == sz && MEMCMP(name, ==, timers->name, sz))
 395       return (timers);
 396     timers++;
 397   }
 398   debug("%s timer `%.*s' not found", what, (int)sz, name); return (0);
 399 }
 400
 401 /* --- @try_timer@ --- *
 402  *
 403  * Arguments:   @struct timer *t@ = timer structure
 404  *              @const struct timerent *timer@ = timer table entry
 405  *              @const char *what@ = adjective describing table
 406  *
 407  * Returns:     Zero on success, @-1@ if timer failed.
 408  *
 409  * Use:         Tries to initialize the timer @t@, reporting a debug message
 410  *              if it worked.
 411  */
 412
 413 static int try_timer(struct timer *t,
 414                      const struct timerent *timer, const char *what)
 415 {
 416   if (timer->init(t)) return (-1);
 417   debug("selected %s timer `%s'", what, timer->name); return (0);
 418 }
 419
 420 /* --- @select_timer@ --- *
 421  *
 422  * Arguments:   @struct timer *t@ = timer structure
 423  *              @const struct timerent *timer@ = timer table
 424  *              @const char *varname@ = environment variable to consult
 425  *              @const char *what@ = adjective describing table
 426  *
 427  * Returns:     Zero on success, @-1@ if timer failed.
 428  *
 429  * Use:         Select a timer from the table.  If the environment variable
 430  *              is set, then parse a comma-separated list of timer names and
 431  *              use the first one listed that seems to work; otherwise, try
 432  *              the timers in the table in order.
 433  */
 434
 435 static int select_timer(struct timer *t, const struct timerent *timers,
 436                         const char *varname, const char *what)
 437 {
 438   const char *p; size_t n;
 439   const struct timerent *timer;
 440
 441   p = getenv(varname);
 442   if (!p) {
 443     while (timers->name)
 444       if (!try_timer(t, timers++, what)) return (0);
 445   } else {
 446     for (;;) {
 447       n = strcspn(p, ",");
 448       timer = find_timer_n(p, n, timers, what);
 449       if (timer && !try_timer(t, timer, what)) return (0);
 450       if (!p[n]) break;
 451       p += n + 1;
 452     }
 453   }
 454   debug("no suitable %s timer found", what); return (-1);
 455 }
 456
 457 /* Bench timer operations. */
 458 static void timer_now(struct bench_timer *tm, struct bench_time *t_out)
 459 {
 460   struct timer *t = (struct timer *)tm;
 461
 462   t->clkops->now(t_out, t);
 463   t->cyops->now(t_out, t);
 464 }
 465 static void timer_destroy(struct bench_timer *tm)
 466 {
 467   struct timer *t = (struct timer *)tm;
 468
 469   if (!t) return;
 470   if (t->clkops) t->clkops->teardown(t);
 471   if (t->cyops) t->cyops->teardown(t);
 472   xfree(t);
 473 }
 474
 475 static const struct bench_timerops timer_ops = { timer_now, timer_destroy };
 476
 477 /* --- @bench_createtimer@ --- *
 478  *
 479  * Arguments:   ---
 480  *
 481  * Returns:     A freshly constructed standard timer object.
 482  *
 483  * Use:         Allocate a timer.  Dispose of it by calling
 484  *              @tm->ops->destroy(tm)@ when you're done.
 485  */
 486
 487 struct bench_timer *bench_createtimer(void)
 488 {
 489   struct timer *t = 0;
 490   struct bench_timer *ret = 0;
 491
 492   t = xmalloc(sizeof(*t)); t->cyops = 0; t->clkops = 0;
 493   if (select_timer(t, clktab, "MLIB_BENCH_CLKTIMER", "clock")) goto end;
 494   if (select_timer(t, cytab, "MLIB_BENCH_CYCLETIMER", "cycle")) goto end;
 495   t->_t.ops = &timer_ops; ret = &t->_t; t = 0;
 496 end:
 497   if (t) timer_destroy(&t->_t);
 498   return (ret);
 499 }
 500
 501 /*----- Benchmarking ------------------------------------------------------*/
 502
 503 /* --- @bench_init@ --- *
 504  *
 505  * Arguments:   @struct bench_state *b@ = bench state to initialize
 506  *              @struct bench_timer *tm@ = timer to attach
 507  *
 508  * Returns:     ---
 509  *
 510  * Use:         Initialize the benchmark state.  It still needs to be
 511  *              calibrated (use @bench_calibrate@) before it can be used, but
 512  *              this will be done automatically by @bench_measure@ if it's
 513  *              not done by hand earlier.  The timer is now owned by the
 514  *              benchmark state and will be destroyed by @bench_destroy@.
 515  */
 516
 517 void bench_init(struct bench_state *b, struct bench_timer *tm)
 518   { b->tm = tm; b->target_s = 1.0; b->f = 0; }
 519
 520 /* --- @bench_destroy@ --- *
 521  *
 522  * Arguments:   @struct bench_state *b@ = bench state
 523  *
 524  * Returns:     ---
 525  *
 526  * Use:         Destroy the benchmark state, releasing the resources that it
 527  *              holds.
 528  */
 529
 530 void bench_destroy(struct bench_state *b)
 531   { b->tm->ops->destroy(b->tm); }
 532
 533 /* --- @do_nothing@ --- *
 534  *
 535  * Arguments:   @unsigned long n@ = iteration count
 536  *              @void *ctx@ = context pointer (ignored)
 537  *
 538  * Returns:     ---
 539  *
 540  * Use:         Does nothing at all for @n@ iterations.  Used to calibrate
 541  *              the benchmarking state.
 542  */
 543
 544 static void do_nothing(unsigned long n, void *ctx)
 545   { while (n--) RELAX; }
 546
 547 /* --- @bench_calibrate@ --- *
 548  *
 549  * Arguments:   @struct bench_state *b@ = bench state
 550  *
 551  * Returns:     Zero on success, @-1@ if calibration failed.
 552  *
 553  * Use:         Calibrate the benchmark state, so that it can be used to
 554  *              measure performance reasonably accurately.
 555  */
 556
 557 int bench_calibrate(struct bench_state *b)
 558 {
 559   struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
 560   unsigned long n;
 561   unsigned i;
 562   struct bench_timer *tm = b->tm;
 563   struct bench_time t0, t1;
 564   struct bench_timing delta;
 565   bench_fn *fn = LAUNDER(&do_nothing);
 566   unsigned f = BTF_ANY;
 567   int rc;
 568
 569   /* The model here is that a timing loop has a fixed overhead as we enter
 570    * and leave (e.g., to do with the indirect branch into the code), and
 571    * per-iteration overheads as we check the counter and loop back.  We aim
 572    * to split these apart using linear regression.
 573    */
 574
 575   /* If we've already calibrated then there's nothing to do. */
 576   if (b->f&BTF_ANY) return (0);
 577
 578   /* Exercise the inner loop a few times to educate the branch predictor. */
 579   for (i = 0; i < 10; i++)
 580     { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); }
 581
 582   /* Now we measure idle loops until they take sufficiently long -- or we run
 583    * out of counter.
 584    */
 585   debug("calibrating...");
 586   n = 1;
 587   for (;;) {
 588
 589     /* Measure @n@ iterations of the idle loop. */
 590     tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1);
 591     timer_diff(&delta, &t0, &t1); f &= delta.f;
 592     if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
 593
 594     /* Register the timings with the regression machinery. */
 595     linreg_update(&lr_clk, n, delta.t);
 596     if (!(f&BTF_CYOK))
 597       debug("  n = %10lu; t = %12g s", n, delta.t);
 598     else {
 599       linreg_update(&lr_cy, n, delta.cy);
 600       debug("  n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
 601     }
 602
 603     /* If we're done then stop. */
 604     if (delta.t >= b->target_s/20.0) break;
 605     if (n >= ULONG_MAX - n/3) break;
 606
 607     /* Update the counter and continue. */
 608     n += n/3 + 1;
 609   }
 610
 611   /* Now run the linear regression to extract the constant and per-iteration
 612    * overheads.
 613    */
 614   linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0);
 615   debug("clock overhead = (%g n + %g) s", b->clk.m, b->clk.c);
 616   if (f&BTF_CYOK) {
 617     linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0);
 618     debug("cycle overhead = (%g n + %g) cy", b->cy.m, b->cy.c);
 619   }
 620
 621   /* We're done. */
 622   b->f |= f; rc = 0;
 623 end:
 624   return (rc);
 625 }
 626
 627 /* --- @bench_measure@ --- *
 628  *
 629  * Arguments:   @struct bench_timing *t_out@ = where to leave the timing
 630  *              @struct bench_state *b@ = benchmark state
 631  *              @double base@ = number of internal units per call
 632  *              @bench_fn *fn@, @void *ctx@ = benchmark function to run
 633  *
 634  * Returns:     Zero on success, @-1@ if timing failed.
 635  *
 636  * Use:         Measure a function.  The function @fn@ is called adaptively
 637  *              with an iteration count @n@ set so as to run for
 638  *              approximately @b->target_s@ seconds.
 639  *
 640  *              The result is left in @*t_out@, with @t_out->n@ counting the
 641  *              final product of the iteration count and @base@ (which might,
 642  *              e.g., reflect the number of inner iterations the function
 643  *              performs, or the number of bytes it processes per iteration).
 644  */
 645
 646 int bench_measure(struct bench_timing *t_out, struct bench_state *b,
 647                   double base, bench_fn *fn, void *ctx)
 648 {
 649   struct bench_timer *tm = b->tm;
 650   struct bench_time t0, t1;
 651   unsigned long n, nn;
 652
 653   /* Make sure the state is calibrated. */
 654   if (bench_calibrate(b)) return (-1);
 655
 656   /* Main adaptive measurement loop.
 657    *
 658    * Suppose the timer loop %$n$% iterations in %$t$% seconds.  Our ideal
 659    * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
 660    * Otherwise, we need to scale up the iteration count.  The obvious next
 661    * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
 662    * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
 663    * progress.  We know that %$T/t > \sqrt{2}%, so this can only happen when
 664    * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
 665    * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
 666    * again with %$n' = n + 1$% iterations will very likely work.
 667    */
 668   debug("measuring..."); n = 1;
 669   for (;;) {
 670     tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
 671     timer_diff(t_out, &t0, &t1);
 672     if (!(t_out->f&BTF_TIMEOK)) return (-1);
 673     if (!(t_out->f&BTF_CYOK)) debug("  n = %10lu; t = %12g", n, t_out->t);
 674     else debug("  n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
 675     if (t_out->t >= 0.707*b->target_s) break;
 676     nn = n*b->target_s/t_out->t;
 677     if (nn > n) n = nn;
 678     else n++;
 679   }
 680
 681   /* Adjust according to the calibration. */
 682   t_out->t -= n*b->clk.m + b->clk.c;
 683   if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;
 684
 685   /* Report the results, if debugging. */
 686   if (!(t_out->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_out->t);
 687   else debug("  adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
 688   if (!(t_out->f&BTF_CYOK))
 689     debug("  %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
 690   else
 691     debug("  %g s (%g cy) per op; %g ops/s",
 692           t_out->t/n, t_out->cy/n, n/t_out->t);
 693
 694   /* All done. */
 695   t_out->n = n*base; return (0);
 696 }
 697
 698 /*----- That's all, folks -------------------------------------------------*/