chiark - git - mdw - mLib/blob - test/bench.c

   1 /* -*-c-*-
   2  *
   3  * Benchmarking support
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <ctype.h>
  33 #include <errno.h>
  34 #include <stdarg.h>
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 #include <time.h>
  39
  40 #include "alloc.h"
  41 #include "bench.h"
  42 #include "bits.h"
  43 #include "dstr.h"
  44 #include "linreg.h"
  45 #include "macros.h"
  46
  47 /*----- Data structures ---------------------------------------------------*/
  48
  49 enum { CLK, CY, NTIMER };
  50
  51 struct timer {
  52   struct bench_timer _t;
  53   const struct timer_ops *ops[NTIMER];  /* subtimers for clock and cycles */
  54   union { int fd; } u_cy;               /* state for cycle measurement */
  55 };
  56
  57 struct timer_ops {
  58   const char *name;                     /* timer name */
  59   unsigned f;                           /* flags */
  60 #define TF_SECRET 1u                    /*   don't try this automatically */
  61   int (*init)(struct timer */*t*/);     /* initialization function */
  62   void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
  63   void (*teardown)(struct timer *t);    /* release held resources */
  64 };
  65
  66 /*----- Preliminaries -----------------------------------------------------*/
  67
  68 #define NS_PER_S 1000000000
  69
  70 /* --- @debug@ --- *
  71  *
  72  * Arguments:   @const char *fmt@ = format control string
  73  *              @...@ = format arguemnts
  74  *
  75  * Returns:     ---
  76  *
  77  * Use:         Maybe report a debugging message to standard error.
  78  */
  79
  80 static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
  81 {
  82   const char *p;
  83   va_list ap;
  84
  85   p = getenv("MLIB_BENCH_DEBUG");
  86   if (p && *p != 'n' && *p != '0') {
  87     va_start(ap, fmt);
  88     fputs("mLib BENCH: ", stderr);
  89     vfprintf(stderr, fmt, ap);
  90     fputc('\n', stderr);
  91     va_end(ap);
  92   }
  93 }
  94
  95 /* --- @timer_diff@ --- *
  96  *
  97  * Arguments:   @struct bench_timing *delta_out@ = where to putt the result
  98  *              @const struct bench_time *t0, *t1@ = two times captured by a
  99  *                      timer's @now@ function
 100  *
 101  * Returns:     ---
 102  *
 103  * Use:         Calculates the difference between two captured times.  The
 104  *              flags are set according to whether the differences are
 105  *              meaningful; @delta_out->n@ is left unset.
 106  */
 107
 108 static void timer_diff(struct bench_timing *delta_out,
 109                        const struct bench_time *t0,
 110                        const struct bench_time *t1)
 111 {
 112   unsigned f = t0->f&t1->f;
 113   kludge64 k;
 114
 115 #ifdef HAVE_UINT64
 116 #  define FLOATK64(k) ((double)(k).i)
 117 #else
 118 #  define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi)
 119 #endif
 120
 121   if (!(f&BTF_TIMEOK))
 122     delta_out->t = 0.0;
 123   else {
 124     SUB64(k, t1->s, t0->s);
 125     delta_out->t = FLOATK64(k) - 1 +
 126       (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
 127   }
 128
 129   if (!(f&BTF_CYOK))
 130     delta_out->cy = 0.0;
 131   else {
 132     SUB64(k, t1->cy, t0->cy);
 133     delta_out->cy = FLOATK64(k);
 134   }
 135
 136   delta_out->f = f;
 137
 138 #undef FLOATK64
 139 }
 140
 141 /*----- The null timer ----------------------------------------------------*/
 142
 143 /* This is a timer which does nothing, in case we don't have any better
 144  * ideas.
 145  */
 146
 147 static int null_init(struct timer *t) { return (0); }
 148 static void null_now(struct bench_time *t_out, struct timer *t) { ; }
 149 static void null_teardown(struct timer *t) { ; }
 150
 151 static const struct timer_ops null_ops =
 152   { "null", 0, null_init, null_now, null_teardown };
 153 #define NULL_ENT &null_ops,
 154
 155 /*----- The broken clock --------------------------------------------------*/
 156
 157 /* This is a cycle counter which does nothing, in case we don't have any
 158  * better ideas.
 159  */
 160
 161 static int broken_init(struct timer *t) { return (-1); }
 162
 163 static const struct timer_ops broken_ops =
 164   { "broken", TF_SECRET, broken_init, null_now, null_teardown };
 165 #define BROKEN_ENT &broken_ops,
 166
 167 /*----- Linux performance counters ----------------------------------------*/
 168
 169 /* This is a cycle counter which uses the Linux performance event system,
 170  * which is probably the best choice if it's available.
 171  */
 172
 173 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
 174
 175 #include <sys/types.h>
 176 #include <unistd.h>
 177
 178 #include <linux/perf_event.h>
 179 #include <asm/unistd.h>
 180
 181 static void perfevent_now(struct bench_time *t_out, struct timer *t)
 182 {
 183   ssize_t n;
 184
 185   n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
 186     if (n != sizeof(t_out->cy.i)) {
 187       debug("failed to read perf-event counter: %s", strerror(errno));
 188       return;
 189     }
 190   t_out->f |= BTF_CYOK;
 191 }
 192
 193 static void perfevent_teardown(struct timer *t)
 194   { close(t->u_cy.fd); }
 195
 196 static int perfevent_init(struct timer *t)
 197 {
 198   struct perf_event_attr attr = { 0 };
 199   struct bench_time tm;
 200
 201   attr.type = PERF_TYPE_HARDWARE;
 202   attr.size = sizeof(attr);
 203   attr.config = PERF_COUNT_HW_CPU_CYCLES;
 204   attr.disabled = 0;
 205   attr.exclude_kernel = 1;
 206   attr.exclude_hv = 1;
 207
 208   t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 209   if (t->u_cy.fd < 0) {
 210     debug("couldn't open perf evvent: %s", strerror(errno));
 211     return (-1);
 212   }
 213
 214   tm.f = 0; perfevent_now(&tm, t);
 215   if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }
 216
 217   return (0);
 218 }
 219
 220 static const struct timer_ops perfevent_ops =
 221   { "linux-perf-hw-cycles", 0,
 222     perfevent_init, perfevent_now, perfevent_teardown };
 223
 224 #  define PERFEVENT_CYENT &perfevent_ops,
 225 #else
 226 #  define PERFEVENT_CYENT
 227 #endif
 228
 229 /*----- Intel time-stamp counter ------------------------------------------*/
 230
 231 /* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 232  * really suitable for performance measurement because it gets confused by
 233  * CPU frequency adjustments.
 234  */
 235
 236 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 237
 238 #include <cpuid.h>
 239
 240 #define CPUID_1D_TSC (1u << 4)
 241
 242 static void x86rdtsc_now(struct bench_time *t_out, struct timer *t)
 243   { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; }
 244
 245 static int x86rdtsc_init(struct timer *t)
 246 {
 247   unsigned a, b, c, d;
 248
 249   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
 250     { debug("no `rdtsc' instrunction"); return (-1); }
 251   return (0);
 252 }
 253
 254 static const struct timer_ops x86rdtsc_ops =
 255   { "x86-rdtsc", 0, x86rdtsc_init, x86rdtsc_now, null_teardown };
 256
 257 #  define X86RDTSC_CYENT &x86rdtsc_ops,
 258 #else
 259 #  define X86RDTSC_CYENT
 260 #endif
 261
 262 /*----- POSIX `clock_gettime' ---------------------------------------------*/
 263
 264 /* This is a real-time clock based on the POSIX time interface, with up to
 265  * nanosecond precision.
 266  */
 267
 268 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
 269
 270 static void gettime_now(struct bench_time *t_out, struct timer *t)
 271 {
 272   struct timespec now;
 273
 274   if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
 275     { debug("error reading POSIX clock: %s", strerror(errno)); return; }
 276   ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec;
 277   t_out->f |= BTF_TIMEOK;
 278 }
 279
 280 static int gettime_init(struct timer *t)
 281 {
 282   struct bench_time tm;
 283
 284   tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
 285   return (0);
 286 }
 287
 288 static const struct timer_ops gettime_ops =
 289   { "posix-thread-cputime", 0, gettime_init, gettime_now, null_teardown };
 290
 291 #  define GETTIME_CLKENT &gettime_ops,
 292 #else
 293 #  define GETTIME_CLKENT
 294 #endif
 295
 296 /*----- Standard C `clock' ------------------------------------------------*/
 297
 298 /* This is a real-time clock based on the C `clock' function which is
 299  * guaranteed to be available, though it's not likely to be very good.
 300  */
 301
 302 static void clock_now(struct bench_time *t_out, struct timer *t)
 303 {
 304   clock_t now, x;
 305   unsigned long s; uint32 ns;
 306
 307   now = clock();
 308     if (now == (clock_t)-1) {
 309       debug("error reading standard clock: %s", strerror(errno));
 310       return;
 311     }
 312   x = now/CLOCKS_PER_SEC;
 313     if (x > ULONG_MAX) { debug("standard clock out of range"); return; }
 314
 315   s = x; x = now - CLOCKS_PER_SEC*s;
 316   if (!(NS_PER_S%CLOCKS_PER_SEC))
 317     ns = x*(NS_PER_S/CLOCKS_PER_SEC);
 318   else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC)
 319     ns = (x*NS_PER_S)/CLOCKS_PER_SEC;
 320   else
 321     ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC);
 322   ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK;
 323 }
 324
 325 static int clock_init(struct timer *t)
 326 {
 327   struct bench_time tm;
 328
 329   tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
 330   return (0);
 331 }
 332
 333 static const struct timer_ops clock_ops =
 334   { "stdc-clock", 0, clock_init, clock_now, null_teardown };
 335
 336 #define CLOCK_CLKENT &clock_ops,
 337
 338 /*----- Timing setup ------------------------------------------------------*/
 339
 340 /* Tables of timing sources. */
 341 static const struct timer_ops
 342   *const clktab[] = { GETTIME_CLKENT CLOCK_CLKENT BROKEN_ENT 0 },
 343   *const cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_ENT BROKEN_ENT 0 };
 344
 345 static const struct timertab {
 346   const char *what;
 347   const char *env;
 348   const struct timer_ops *const *opstab;
 349 } timertab[] = {
 350   { "clock", "MLIB_BENCH_CLKTIMER", clktab },
 351   { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
 352 };
 353
 354 /* --- @find_timer@ --- *
 355  *
 356  * Arguments:   @const char *name@ = timer name
 357  *              @size_t sz@ = length of name
 358  *              @unsigned tm@ = which subtimer we're looking for
 359  *
 360  * Returns:     The table entry matching the given name, or null if there
 361  *              isn't one.
 362  */
 363
 364 static const struct timer_ops *find_timer(const char *name, size_t sz,
 365                                           unsigned tm)
 366 {
 367   const struct timer_ops *const *tt;
 368
 369   for (tt = timertab[tm].opstab; *tt; tt++) {
 370     if (strlen((*tt)->name) == sz &&
 371         MEMCMP(name, ==, (*tt)->name, sz))
 372       return (*tt);
 373   }
 374   debug("%s timer `%.*s' not found",
 375         timertab[tm].what, (int)sz, name); return (0);
 376 }
 377
 378 /* --- @try_timer@ --- *
 379  *
 380  * Arguments:   @struct timer *t@ = timer structure
 381  *              @const struct timer_ops *ops@ = timer ops
 382  *              @unsigned tm@ = which subtimer we're setting
 383  *
 384  * Returns:     Zero on success, %$-1$% if timer failed.
 385  *
 386  * Use:         Tries to initialize the timer @t@, reporting a debug message
 387  *              if it worked.
 388  */
 389
 390 static int try_timer(struct timer *t,
 391                      const struct timer_ops *ops, unsigned tm)
 392 {
 393   if (ops->init(t)) return (-1);
 394   debug("selected %s timer `%s'", timertab[tm].what, ops->name);
 395   t->ops[tm] = ops; return (0);
 396 }
 397
 398 /* --- @select_timer@ --- *
 399  *
 400  * Arguments:   @struct timer *t@ = timer structure
 401  *              @unsigned tm@ = which subtimer we're setting
 402  *              @const char *config@, @size_t sz@ = config string
 403  *
 404  * Returns:     Zero on success, %$-1$% if timer failed.
 405  *
 406  * Use:         Select a timer from the table.  If the environment variable
 407  *              is set, then parse a comma-separated list of timer names and
 408  *              use the first one listed that seems to work; otherwise, try
 409  *              the timers in the table in order.
 410  */
 411
 412 static int select_timer(struct timer *t, unsigned tm,
 413                         const char *config, size_t sz)
 414 {
 415   const char *p, *l;
 416   const struct timer_ops *ops, *const *tt;
 417
 418   if (!config) {
 419     for (tt = timertab[tm].opstab; *tt; tt++)
 420       if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
 421   } else {
 422     l = config + sz;
 423     for (;;) {
 424       p = memchr(config, ',', l - config); if (!p) p = l;
 425       ops = find_timer(config, p - config, tm);
 426       if (ops && !try_timer(t, ops, tm)) return (0);
 427       if (p >= l) break;
 428       config = p + 1;
 429     }
 430   }
 431   debug("no suitable %s timer found", timertab[tm].what); return (-1);
 432 }
 433
 434 /* Bench timer operations. */
 435 static void timer_describe(struct bench_timer *tm, dstr *d)
 436 {
 437   struct timer *t = (struct timer *)tm;
 438   unsigned i;
 439
 440   dstr_puts(d, "builtin: ");
 441   for (i = 0; i < NTIMER; i++) {
 442     if (i) dstr_puts(d, ", ");
 443     dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
 444   }
 445 }
 446
 447 static void timer_now(struct bench_timer *tm, struct bench_time *t_out)
 448 {
 449   struct timer *t = (struct timer *)tm;
 450   unsigned i;
 451
 452   for (i = 0; i < NTIMER; i++) t->ops[i]->now(t_out, t);
 453 }
 454
 455 static void timer_destroy(struct bench_timer *tm)
 456 {
 457   struct timer *t = (struct timer *)tm;
 458   unsigned i;
 459
 460   if (!t) return;
 461   for (i = 0; i < NTIMER; i++)
 462     if (t->ops[i]) t->ops[i]->teardown(t);
 463   xfree(t);
 464 }
 465
 466 static const struct bench_timerops timer_ops =
 467   { timer_describe, timer_now, timer_destroy };
 468
 469 /* --- @bench_createtimer@ --- *
 470  *
 471  * Arguments:   @const char *config@ = timer configuration string
 472  *
 473  * Returns:     A freshly constructed standard timer object.
 474  *
 475  * Use:         Allocate a timer.  Dispose of it by calling
 476  *              @tm->ops->destroy(tm)@ when you're done.
 477  *
 478  *              Applications should not set configuration strings except as
 479  *              established by user action, e.g., from a command-line option,
 480  *              environment variable, or configuration file.
 481  */
 482
 483 struct bench_timer *bench_createtimer(const char *config)
 484 {
 485   struct timer *t = 0;
 486   struct bench_timer *ret = 0;
 487   struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
 488   const struct timer_ops *const *tt;
 489   const char *p, *l; size_t n, nn;
 490   unsigned i;
 491
 492   /* Parse the configuration string. */
 493   if (config) {
 494
 495     /* The first thing to do is find the end of the string. */
 496     l = config + strlen(config);
 497
 498     for (;;) {
 499       /* Process the whitespace-sparated words of the string one by one. */
 500
 501       /* Skip over any initial whitespace.  If we hit the end of the string
 502        * then we're done.
 503        */
 504       for (;;)
 505         if (config >= l) goto done_config;
 506         else if (!ISSPACE(*config)) break;
 507         else config++;
 508
 509       /* There's definitely a word here.  Find the end of it. */
 510       for (p = config; p < l && !ISSPACE(*p); p++);
 511       nn = p - config;
 512
 513       /* Try various simple keywords. */
 514 #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
 515
 516       if (MATCHP("list")) {
 517         /* The `list' keyword requests lists of the available timer
 518          * implementations.
 519          */
 520
 521         for (i = 0; i < NTIMER; i++) {
 522           printf("%s timers:", timertab[i].what);
 523           for (tt = timertab[i].opstab; *tt; tt++)
 524             if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name);
 525           putchar('\n');
 526         }
 527         goto next_config;
 528       }
 529
 530 #undef MATCHP
 531
 532       /* Otherwise it's an assignment, setting a subtimer list. */
 533       p = memchr(config, '=', nn);
 534       if (!p)
 535         n = nn;
 536       else {
 537         n = p - config;
 538         for (i = 0; i < NTIMER; i++)
 539           if (STRNCMP(config, ==, timertab[i].what, n) &&
 540               !timertab[i].what[n]) {
 541             if (tmconf[i].p)
 542               debug("duplicate %s timer list", timertab[i].what);
 543             tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
 544             goto next_config;
 545           }
 546       }
 547       debug("unrecognized config keyword `%.*s'", (int)n, config);
 548
 549       /* Move on to the next word. */
 550     next_config:
 551       config += nn;
 552     }
 553   done_config:;
 554   }
 555
 556   /* Override these settings from the environment. */
 557   for (i = 0; i < NTIMER; i++) {
 558     p = getenv(timertab[i].env);
 559     if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
 560   }
 561
 562   /* All seems well.  Allocate the timer object. */
 563   t = xmalloc(sizeof(*t));
 564   for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
 565
 566   /* Try to set up the subtimers. */
 567   for (i = 0; i < NTIMER; i++)
 568     if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
 569
 570   /* All is done. */
 571   t->_t.ops = &timer_ops; ret = &t->_t; t = 0;
 572 end:
 573   if (t) timer_destroy(&t->_t);
 574   return (ret);
 575 }
 576
 577 /*----- Benchmarking ------------------------------------------------------*/
 578
 579 /* --- @bench_init@ --- *
 580  *
 581  * Arguments:   @struct bench_state *b@ = bench state to initialize
 582  *              @struct bench_timer *tm@ = timer to attach, or null
 583  *
 584  * Returns:     Zero on success, %$-1$% on failure.
 585  *
 586  * Use:         Initialize the benchmark state.  On success, the timer state
 587  *              still needs to be calibrated (use @bench_calibrate@) before
 588  *              it can be used, but this will be done automatically by
 589  *              @bench_measure@ if it's not done by hand earlier.  The timer
 590  *              is now owned by the benchmark state and will be destroyed by
 591  *              @bench_destroy@.
 592  *
 593  *              The only reason for failure is if @tm@ was null on entry,
 594  *              and automatic construction of a timer failed.  The state is
 595  *              safe to discard, but calling @bench_destroy@ is safe too.
 596  */
 597
 598 int bench_init(struct bench_state *b, struct bench_timer *tm)
 599 {
 600   int rc;
 601
 602   b->tm = 0;
 603
 604   if (!tm) {
 605     tm = bench_createtimer(0);
 606     if (!tm) { rc = -1; goto end; }
 607   }
 608
 609   b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
 610 end:
 611   return (rc);
 612 }
 613
 614 /* --- @bench_destroy@ --- *
 615  *
 616  * Arguments:   @struct bench_state *b@ = bench state
 617  *
 618  * Returns:     ---
 619  *
 620  * Use:         Destroy the benchmark state, releasing the resources that it
 621  *              holds.
 622  */
 623
 624 void bench_destroy(struct bench_state *b)
 625   { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
 626
 627 /* --- @do_nothing@ --- *
 628  *
 629  * Arguments:   @unsigned long n@ = iteration count
 630  *              @void *ctx@ = context pointer (ignored)
 631  *
 632  * Returns:     ---
 633  *
 634  * Use:         Does nothing at all for @n@ iterations.  Used to calibrate
 635  *              the benchmarking state.
 636  */
 637
 638 static void do_nothing(unsigned long n, void *ctx)
 639   { while (n--) RELAX; }
 640
 641 /* --- @bench_calibrate@ --- *
 642  *
 643  * Arguments:   @struct bench_state *b@ = bench state
 644  *
 645  * Returns:     Zero on success, %$-1$% if calibration failed.
 646  *
 647  * Use:         Calibrate the benchmark state, so that it can be used to
 648  *              measure performance reasonably accurately.
 649  */
 650
 651 #define T_CLB 0.0625                    /* calibration time limit */
 652
 653 int bench_calibrate(struct bench_state *b)
 654 {
 655   struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
 656   unsigned long n;
 657   unsigned i;
 658   struct bench_timer *tm = b->tm;
 659   struct bench_time t0, t1;
 660   struct bench_timing delta;
 661   double r;
 662   bench_fn *fn = LAUNDER(&do_nothing);
 663   unsigned f = BTF_ANY;
 664   int rc;
 665
 666   /* The model here is that a timing loop has a fixed overhead as we enter
 667    * and leave (e.g., to do with the indirect branch into the code), and
 668    * per-iteration overheads as we check the counter and loop back.  We aim
 669    * to split these apart using linear regression.
 670    */
 671
 672   /* If we've already calibrated then there's nothing to do. */
 673   if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
 674
 675   /* Exercise the inner loop a few times to educate the branch predictor. */
 676   for (i = 0; i < 10; i++)
 677     { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); }
 678
 679   /* Now we measure idle loops until they take sufficiently long -- or we run
 680    * out of counter.
 681    */
 682   debug("calibrating...");
 683   n = 1;
 684   for (;;) {
 685
 686     /* Measure @n@ iterations of the idle loop. */
 687     tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1);
 688     timer_diff(&delta, &t0, &t1); f &= delta.f;
 689     if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
 690
 691     /* Register the timings with the regression machinery. */
 692     linreg_update(&lr_clk, n, delta.t);
 693     if (!(f&BTF_CYOK))
 694       debug("  n = %10lu; t = %12g s", n, delta.t);
 695     else {
 696       linreg_update(&lr_cy, n, delta.cy);
 697       debug("  n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
 698     }
 699
 700     /* If we're done then stop. */
 701     if (delta.t >= T_CLB) break;
 702     if (n >= ULONG_MAX - n/3) break;
 703
 704     /* Update the counter and continue. */
 705     n += n/3 + 1;
 706   }
 707
 708   /* Now run the linear regression to extract the constant and per-iteration
 709    * overheads.
 710    */
 711   linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
 712   debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
 713   if (f&BTF_CYOK) {
 714     linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
 715     debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
 716   }
 717
 718   /* We're done. */
 719   rc = 0;
 720 end:
 721   b->f |= f | BTF_CLB;                  /* no point trying again */
 722   return (rc);
 723 }
 724
 725 /* --- @bench_measure@ --- *
 726  *
 727  * Arguments:   @struct bench_timing *t_out@ = where to leave the timing
 728  *              @struct bench_state *b@ = benchmark state
 729  *              @double base@ = number of internal units per call
 730  *              @bench_fn *fn@, @void *ctx@ = benchmark function to run
 731  *
 732  * Returns:     Zero on success, %$-1$% if timing failed.
 733  *
 734  * Use:         Measure a function.  The function @fn@ is called adaptively
 735  *              with an iteration count @n@ set so as to run for
 736  *              approximately @b->target_s@ seconds.
 737  *
 738  *              The result is left in @*t_out@, with @t_out->n@ counting the
 739  *              final product of the iteration count and @base@ (which might,
 740  *              e.g., reflect the number of inner iterations the function
 741  *              performs, or the number of bytes it processes per iteration).
 742  */
 743
 744 int bench_measure(struct bench_timing *t_out, struct bench_state *b,
 745                   double base, bench_fn *fn, void *ctx)
 746 {
 747   struct bench_timer *tm = b->tm;
 748   struct bench_time t0, t1;
 749   unsigned long n, nn;
 750
 751   /* Make sure the state is calibrated. */
 752   if (bench_calibrate(b)) return (-1);
 753
 754   /* Main adaptive measurement loop.
 755    *
 756    * Suppose the timer loop %$n$% iterations in %$t$% seconds.  Our ideal
 757    * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
 758    * Otherwise, we need to scale up the iteration count.  The obvious next
 759    * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
 760    * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
 761    * progress.  We know that %$T/t > \sqrt{2}%, so this can only happen when
 762    * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
 763    * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
 764    * again with %$n' = n + 1$% iterations will very likely work.
 765    */
 766   debug("measuring..."); n = 1;
 767   for (;;) {
 768     tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
 769     timer_diff(t_out, &t0, &t1);
 770     if (!(t_out->f&BTF_TIMEOK)) return (-1);
 771     if (!(t_out->f&BTF_CYOK)) debug("  n = %10lu; t = %12g", n, t_out->t);
 772     else debug("  n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
 773     if (t_out->t >= 0.707*b->target_s) break;
 774     nn = n*b->target_s/t_out->t;
 775     if (nn > n) n = nn;
 776     else n++;
 777   }
 778
 779   /* Adjust according to the calibration. */
 780   t_out->t -= n*b->clk.m + b->clk.c;
 781   if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;
 782
 783   /* Report the results, if debugging. */
 784   if (!(t_out->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_out->t);
 785   else debug("  adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
 786   if (!(t_out->f&BTF_CYOK))
 787     debug("  %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
 788   else
 789     debug("  %g s (%g cy) per op; %g ops/s",
 790           t_out->t/n, t_out->cy/n, n/t_out->t);
 791
 792   /* All done. */
 793   t_out->n = n*base; return (0);
 794 }
 795
 796 /*----- That's all, folks -------------------------------------------------*/