chiark - git - mdw - mLib/blob - test/bench.c

   1 /* -*-c-*-
   2  *
   3  * Benchmarking support
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <ctype.h>
  33 #include <errno.h>
  34 #include <limits.h>
  35 #include <stdarg.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <time.h>
  40
  41 #include "alloc.h"
  42 #include "bench.h"
  43 #include "bits.h"
  44 #include "dstr.h"
  45 #include "linreg.h"
  46 #include "macros.h"
  47
  48 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
  49 #  include <cpuid.h>
  50 #  define CPUID_1D_TSC (1u << 4)
  51 #  define CPUID_1xD_TSCP (1u << 27)
  52 #endif
  53
  54 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
  55 #  include <sys/types.h>
  56 #  include <unistd.h>
  57 #  include <linux/perf_event.h>
  58 #  include <asm/unistd.h>
  59 #  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
  60 #    include <sys/mman.h>
  61 #  endif
  62 #endif
  63
  64 /*----- Data structures ---------------------------------------------------*/
  65
  66 enum { CLK, CY, NTIMER };
  67
  68 struct timer {
  69   struct bench_timer _t;
  70   const struct timer_ops *ops[NTIMER];  /* subtimers for clock and cycles */
  71   union {
  72     unsigned tscaux;                    /* `ia32_tsc_aux' for `ldtscp' */
  73     int fd;                             /* vanilla `perf_event_open' */
  74     struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open'
  75                                          * with `rdpmc' */
  76   } u_cy;                               /* state for cycle measurement */
  77 };
  78
  79 struct timer_ops {
  80   const char *name;                     /* timer name */
  81   unsigned f;                           /* flags */
  82 #define TF_SECRET 1u                    /*   don't try this automatically */
  83   int (*init)(struct timer */*t*/);     /* initialization function */
  84   int (*now)(struct timer */*t*/,       /* read current */
  85              struct bench_time */*t_out*/, unsigned /*f*/);
  86   void (*diff)(struct timer */*t*/,     /* difference */
  87                struct bench_timing */*t_inout*/,
  88                const struct bench_time */*t0*/,
  89                const struct bench_time */*t1*/);
  90   void (*teardown)(struct timer */*t*/); /* release held resources */
  91 };
  92
  93 /*----- Preliminaries -----------------------------------------------------*/
  94
  95 #define NS_PER_S 1000000000
  96
  97 /* --- @debug@ --- *
  98  *
  99  * Arguments:   @const char *fmt@ = format control string
 100  *              @...@ = format arguemnts
 101  *
 102  * Returns:     ---
 103  *
 104  * Use:         Maybe report a debugging message to standard error.
 105  */
 106
 107 static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
 108 {
 109   const char *p;
 110   va_list ap;
 111
 112   p = getenv("MLIB_BENCH_DEBUG");
 113   if (p && *p != 'n' && *p != '0') {
 114     va_start(ap, fmt);
 115     fputs("mLib BENCH: ", stderr);
 116     vfprintf(stderr, fmt, ap);
 117     fputc('\n', stderr);
 118     va_end(ap);
 119   }
 120 }
 121
 122 /*----- Difference utilities ----------------------------------------------*/
 123
 124 #ifdef HAVE_UINT64
 125 #  define FLOATK64(k) ((double)(k).i)
 126 #else
 127 #  define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
 128 #endif
 129
 130 /* --- @diff_ts@ --- *
 131  *
 132  * Arguments:   @struct timer *t@ = timer structure
 133  *              @struct bench_timing *delta_inout@ = where to put the result
 134  *              @const struct time *t0, *t1@ = two input times
 135  *
 136  * Returns:     ---
 137  *
 138  * Use:         Calculates a time difference for timers using the
 139  *              @struct timespec@-like time format.
 140  */
 141
 142 static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
 143                     const struct bench_time *t0, const struct bench_time *t1)
 144 {
 145   unsigned f = t0->f&t1->f;
 146   kludge64 k;
 147
 148   if (f&BTF_TIMEOK) {
 149
 150     /* Calculate the integer difference in seconds. */
 151     SUB64(k, t1->t.ts.s, t0->t.ts.s);
 152
 153     /* And apply the nanoseconds difference.  To prevent underflow,
 154      * pre-emptively borrow one from the integer difference.
 155      */
 156     delta_inout->t =
 157       FLOATK64(k) - 1.0 +
 158       (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;
 159
 160     /* Done. */
 161     delta_inout->f |= BTF_TIMEOK;
 162   }
 163 }
 164
 165 /* --- @diff_cycles@ --- *
 166  *
 167  * Arguments:   @struct timer *t@ = timer structure
 168  *              @struct bench_timing *delta_inout@ = where to put the result
 169  *              @const struct time *t0, *t1@ = two input times
 170  *
 171  * Returns:     ---
 172  *
 173  * Use:         Calculates a time difference for cycle-counting timers.
 174  */
 175
 176 static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
 177                         const struct bench_time *t0,
 178                         const struct bench_time *t1)
 179 {
 180   unsigned f = t0->f&t1->f;
 181   kludge64 k;
 182
 183   if (f&BTF_CYOK) {
 184     SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k);
 185     delta_inout->f |= BTF_CYOK;
 186   }
 187 }
 188
 189 #undef FLOATK64
 190
 191 /*----- The null timer ----------------------------------------------------*/
 192
 193 /* This is a timer which does nothing, in case we don't have any better
 194  * ideas.
 195  */
 196
 197 static int null_init(struct timer *t) { return (0); }
 198 static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
 199   { return (0); }
 200 static void null_diff(struct timer *t, struct bench_timing *delta_inout,
 201                       const struct bench_time *t0,
 202                       const struct bench_time *t1)
 203   { ; }
 204 static void null_teardown(struct timer *t) { ; }
 205
 206 static const struct timer_ops null_ops =
 207   { "null", 0, null_init, null_now, null_diff, null_teardown };
 208 #define NULL_ENT &null_ops,
 209
 210 /*----- The broken clock --------------------------------------------------*/
 211
 212 /* This is a cycle counter which does nothing, in case we don't have any
 213  * better ideas.
 214  */
 215
 216 static int broken_init(struct timer *t) { return (-1); }
 217
 218 static const struct timer_ops broken_ops =
 219   { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
 220 #define BROKEN_ENT &broken_ops,
 221
 222 /*----- Linux performance counters ----------------------------------------*/
 223
 224 /* This is a cycle counter which uses the Linux performance event system,
 225  * which is probably the best choice if it's available.
 226  */
 227
 228 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
 229
 230 /* --- @perfevent_open@ --- *
 231  *
 232  * Arguments:   ---
 233  *
 234  * Returns:     File descriptor, or %$-1$%.
 235  *
 236  * Use:         Open a performance measurement descriptor set up to count CPU
 237  *              cycles.
 238  */
 239
 240 static int perfevent_open(void)
 241 {
 242   struct perf_event_attr attr = { 0 };
 243   int fd;
 244
 245   attr.type = PERF_TYPE_HARDWARE;
 246   attr.size = sizeof(attr);
 247   attr.config = PERF_COUNT_HW_CPU_CYCLES;
 248   attr.disabled = 0;
 249   attr.exclude_kernel = 1;
 250   attr.exclude_hv = 1;
 251
 252   fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 253   if (fd < 0) {
 254     debug("couldn't open perf event: %s", strerror(errno));
 255     return (-1);
 256   }
 257
 258   return (fd);
 259 }
 260
 261 static int perfevent_now(struct timer *t,
 262                          struct bench_time *t_out, unsigned f)
 263 {
 264   ssize_t n;
 265
 266   n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
 267     if (n != sizeof(t_out->cy.i)) {
 268       debug("failed to read perf-event counter: %s", strerror(errno));
 269       return (0);
 270     }
 271   t_out->f |= BTF_CYOK; return (0);
 272 }
 273
 274 static void perfevent_teardown(struct timer *t)
 275   { close(t->u_cy.fd); }
 276
 277 static int perfevent_init(struct timer *t)
 278 {
 279   struct bench_time tm;
 280   int fd = -1, rc;
 281
 282   fd = perfevent_open(); if (!fd) { rc = -1; goto end; }
 283
 284   t->u_cy.fd = fd; tm.f = 0; perfevent_now(t, &tm, 0);
 285   if (!(tm.f&BTF_CYOK)) { rc = -1; goto end; }
 286   fd = -1; rc = 0;
 287 end:
 288   if (fd != -1) close(fd);
 289   return (rc);
 290 }
 291
 292 static const struct timer_ops perfevent_ops =
 293   { "linux-perf-read-hw-cycles", 0,
 294     perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
 295 #define PERFEVENT_VANILLA_CYENT &perfevent_ops,
 296
 297 #  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
 298
 299 /* Special syscall-free version for x86 using `rdpmc' instruction. *
 300  *
 301  * This is a bit weird because it does both kinds of measurement in a single
 302  * operation.
 303  */
 304
 305 static int perfevrdpmc_now(struct timer *t,
 306                            struct bench_time *t_out, unsigned f)
 307 {
 308   const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
 309   unsigned long long tsc = tsc, toff = toff, tenb = tenb;
 310   unsigned long long cy = cy, cyoff = cyoff;
 311   unsigned long long m, hi, lo;
 312   unsigned tshift = tshift, tmult = tmult, q0, q1, ff;
 313
 314   /* Repeat until we can complete this job without the buffer changing in the
 315    * middle.
 316    */
 317   q0 = map->lock;
 318   __atomic_thread_fence(__ATOMIC_ACQ_REL);
 319   for (;;) {
 320     ff = 0;
 321
 322     /* Read the passage-of-time information. */
 323     if (map->cap_user_time) {
 324       tenb = map->time_enabled;
 325       tsc = __builtin_ia32_rdtsc();
 326       tshift = map->time_shift;
 327       tmult = map->time_mult;
 328       toff = map->time_offset;
 329       ff |= BTF_TIMEOK;
 330     }
 331
 332     /* Read the performance-counter information. */
 333     if (map->cap_user_rdpmc) {
 334       cy = __builtin_ia32_rdpmc(map->index - 1);
 335       cyoff = map->offset;
 336       ff |= BTF_CYOK;
 337     }
 338
 339     /* Check the sequence number again. */
 340     __atomic_thread_fence(__ATOMIC_ACQ_REL);
 341     q1 = map->lock;
 342     if (q0 == q1) break;
 343     q0 = q1;
 344   }
 345
 346   if (ff&BTF_TIMEOK) {
 347     /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
 348      * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
 349      * nanoseconds.
 350      */
 351
 352     m = (1ull << tshift) - 1;
 353     hi = tsc >> tshift; lo = tsc&m;
 354     t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
 355     t_out->f |= BTF_TIMEOK;
 356   }
 357
 358   if (ff&BTF_CYOK) {
 359     /* We have the cycle count. */
 360
 361     t_out->cy.i = cy + cyoff;
 362     t_out->f |= BTF_CYOK;
 363   }
 364   return (0);
 365 }
 366
 367 static void perfevrdpmc_diff(struct timer *t,
 368                              struct bench_timing *delta_inout,
 369                              const struct bench_time *t0,
 370                              const struct bench_time *t1)
 371 {
 372   unsigned f = t0->f&t1->f;
 373
 374   if (f&BTF_TIMEOK) {
 375     delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S;
 376     delta_inout->f |= BTF_TIMEOK;
 377   }
 378
 379   if (f&BTF_CYOK) {
 380     delta_inout->cy = t1->cy.i - t0->cy.i;
 381     delta_inout->f |= BTF_CYOK;
 382   }
 383 }
 384
 385 static void perfevrdpmc_teardown(struct timer *t)
 386   { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }
 387
 388 static int perfevrdpmc_cyinit(struct timer *t)
 389 {
 390   const volatile struct perf_event_mmap_page *map = 0;
 391   unsigned a, b, c, d, q0, q1, f;
 392   int pgsz, mapsz, fd = -1, rc;
 393
 394   /* We need `rdtsc' to do the passage-of-time measurement. */
 395   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
 396     { debug("no `rdtsc' instrunction"); return (-1); }
 397
 398   /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
 399    * big a page is.
 400    */
 401   pgsz = sysconf(_SC_PAGESIZE);
 402     if (pgsz < 0) {
 403       debug("failed to discover page size!: %s", strerror(errno));
 404       rc = -1; goto end;
 405     }
 406
 407   /* Open the measurement descriptor and map it. */
 408   fd = perfevent_open(); if (!fd) return (-1);
 409   mapsz = 2*pgsz;
 410   map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
 411     if (map == MAP_FAILED) {
 412       debug("failed to map perf event: %s", strerror(errno));
 413       return (-1);
 414     }
 415
 416   /* Check that it's revealed the necessary information. */
 417   q0 = map->lock;
 418   __atomic_thread_fence(__ATOMIC_ACQ_REL);
 419   for (;;) {
 420     f = 0;
 421     if (map->cap_user_time) f |= BTF_TIMEOK;
 422     if (map->cap_user_rdpmc) f |= BTF_CYOK;
 423     __atomic_thread_fence(__ATOMIC_ACQ_REL);
 424     q1 = map->lock;
 425     if (q0 == q1) break;
 426     q0 = q1;
 427   }
 428   if (!(f&BTF_TIMEOK))
 429     { debug("kernel refused user time measurement"); rc = -1; goto end; }
 430   if (!(f&BTF_TIMEOK))
 431     { debug("kernel refused user cycle measurement"); rc = -1; goto end; }
 432
 433   /* All done.  We can close the descriptor here: the mapping will keep the
 434    * performance-measurement machinery alive.
 435    */
 436   t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0;
 437 end:
 438   if (fd != -1) close(fd);
 439   if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz);
 440   return (rc);
 441 }
 442
 443 static const struct timer_ops perfevrdpmc_cyops =
 444   { "linux-x86-perf-rdpmc-hw-cycles", 0,
 445     perfevrdpmc_cyinit, perfevrdpmc_now,
 446     perfevrdpmc_diff, perfevrdpmc_teardown };
 447
 448 static int perfevrdpmc_clkinit(struct timer *t)
 449 {
 450   if (t->ops[CLK] != &perfevrdpmc_cyops) {
 451     debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
 452     return(-1);
 453   }
 454   return (0);
 455 }
 456
 457 static const struct timer_ops perfevrdpmc_clkops =
 458   { "linux-x86-perf-rdpmc-hw-cycles", 0,
 459     perfevrdpmc_clkinit, null_now,
 460     null_diff, null_teardown };
 461
 462 #    define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
 463 #    define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
 464
 465 #  else
 466 #    define PERFEVENT_RDPMC_CLKENT
 467 #    define PERFEVENT_RDPMC_CYENT
 468 #  endif
 469
 470 #  define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
 471 #  define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
 472 #else
 473 #  define PERFEVENT_CLKENT
 474 #  define PERFEVENT_CYENT
 475 #endif
 476
 477 /*----- Intel time-stamp counter ------------------------------------------*/
 478
 479 /* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 480  * really suitable for performance measurement because it gets confused by
 481  * CPU frequency adjustments.
 482  */
 483
 484 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
 485
 486 static int x86rdtsc_now(struct timer *t,
 487                         struct bench_time *t_out, unsigned f)
 488   { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }
 489
 490 static int x86rdtsc_init(struct timer *t)
 491 {
 492   unsigned a, b, c, d;
 493
 494   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
 495     { debug("no `rdtsc' instrunction"); return (-1); }
 496   t->u_cy.tscaux = ~0u;
 497   return (0);
 498 }
 499
 500 static int x86rdtscp_now(struct timer *t,
 501                          struct bench_time *t_out, unsigned f)
 502 {
 503   unsigned tscaux;
 504   unsigned long long n;
 505
 506   n = __builtin_ia32_rdtscp(&tscaux);
 507   if (!(f&BTF_T1))
 508     t->u_cy.tscaux = tscaux;
 509   else if (t->u_cy.tscaux != tscaux) {
 510     debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
 511           tscaux, t->u_cy.tscaux);
 512     return (-1);
 513   }
 514   t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
 515 }
 516
 517 static int x86rdtscp_init(struct timer *t)
 518 {
 519   unsigned a, b, c, d;
 520
 521   if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
 522     { debug("no `rdtscp' instrunction"); return (-1); }
 523   return (0);
 524 }
 525
 526 static const struct timer_ops x86rdtsc_ops =
 527   { "x86-rdtsc", 0,
 528     x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
 529 static const struct timer_ops x86rdtscp_ops =
 530   { "x86-rdtscp", 0,
 531     x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };
 532
 533 #  define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
 534 #else
 535 #  define X86RDTSC_CYENT
 536 #endif
 537
 538 /*----- POSIX `clock_gettime' ---------------------------------------------*/
 539
 540 /* This is a real-time clock based on the POSIX time interface, with up to
 541  * nanosecond precision.
 542  */
 543
 544 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
 545
 546 static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
 547 {
 548   struct timespec now;
 549
 550   if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
 551     { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
 552   ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
 553   t_out->f |= BTF_TIMEOK; return (0);
 554 }
 555
 556 static int gettime_init(struct timer *t)
 557 {
 558   struct bench_time tm;
 559
 560   tm.f = 0; gettime_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1);
 561   return (0);
 562 }
 563
 564 static const struct timer_ops gettime_ops =
 565   { "posix-thread-cputime", 0,
 566     gettime_init, gettime_now, diff_ts, null_teardown };
 567
 568 #  define GETTIME_CLKENT &gettime_ops,
 569 #else
 570 #  define GETTIME_CLKENT
 571 #endif
 572
 573 /*----- Standard C `clock' ------------------------------------------------*/
 574
 575 /* This is a real-time clock based on the C `clock' function which is
 576  * guaranteed to be available, though it's not likely to be very good.
 577  */
 578
 579 static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
 580 {
 581   clock_t now;
 582
 583   now = clock();
 584     if (now == (clock_t)-1) {
 585       debug("error reading standard clock: %s", strerror(errno));
 586       return (0);
 587     }
 588   t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
 589 }
 590
 591 static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
 592                         const struct bench_time *t0,
 593                         const struct bench_time *t1)
 594 {
 595   unsigned f = t0->f&t1->f;
 596
 597   if (f&BTF_TIMEOK) {
 598     delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC;
 599     delta_inout->f |= BTF_TIMEOK;
 600   }
 601 }
 602
 603 static int clock_init(struct timer *t)
 604 {
 605   struct bench_time tm;
 606
 607   tm.f = 0; clock_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1);
 608   return (0);
 609 }
 610
 611 static const struct timer_ops clock_ops =
 612   { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };
 613
 614 #define CLOCK_CLKENT &clock_ops,
 615
 616 /*----- Timing setup ------------------------------------------------------*/
 617
 618 /* Tables of timing sources. */
 619 static const struct timer_ops
 620   *const clktab[] = { PERFEVENT_CLKENT
 621                       GETTIME_CLKENT
 622                       CLOCK_CLKENT
 623                       BROKEN_ENT
 624                       0 },
 625   *const cytab[] = { PERFEVENT_CYENT
 626                      X86RDTSC_CYENT
 627                      NULL_ENT
 628                      BROKEN_ENT
 629                      0 };
 630
 631 static const struct timertab {
 632   const char *what;
 633   const char *env;
 634   const struct timer_ops *const *opstab;
 635 } timertab[] = {
 636   { "clock", "MLIB_BENCH_CLKTIMER", clktab },
 637   { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
 638 };
 639
 640 /* --- @find_timer@ --- *
 641  *
 642  * Arguments:   @const char *name@ = timer name
 643  *              @size_t sz@ = length of name
 644  *              @unsigned tm@ = which subtimer we're looking for
 645  *
 646  * Returns:     The table entry matching the given name, or null if there
 647  *              isn't one.
 648  */
 649
 650 static const struct timer_ops *find_timer(const char *name, size_t sz,
 651                                           unsigned tm)
 652 {
 653   const struct timer_ops *const *tt;
 654
 655   for (tt = timertab[tm].opstab; *tt; tt++) {
 656     if (strlen((*tt)->name) == sz &&
 657         MEMCMP(name, ==, (*tt)->name, sz))
 658       return (*tt);
 659   }
 660   debug("%s timer `%.*s' not found",
 661         timertab[tm].what, (int)sz, name); return (0);
 662 }
 663
 664 /* --- @try_timer@ --- *
 665  *
 666  * Arguments:   @struct timer *t@ = timer structure
 667  *              @const struct timer_ops *ops@ = timer ops
 668  *              @unsigned tm@ = which subtimer we're setting
 669  *
 670  * Returns:     Zero on success, %$-1$% if timer failed.
 671  *
 672  * Use:         Tries to initialize the timer @t@, reporting a debug message
 673  *              if it worked.
 674  */
 675
 676 static int try_timer(struct timer *t,
 677                      const struct timer_ops *ops, unsigned tm)
 678 {
 679   if (ops->init(t)) return (-1);
 680   debug("selected %s timer `%s'", timertab[tm].what, ops->name);
 681   t->ops[tm] = ops; return (0);
 682 }
 683
 684 /* --- @select_timer@ --- *
 685  *
 686  * Arguments:   @struct timer *t@ = timer structure
 687  *              @unsigned tm@ = which subtimer we're setting
 688  *              @const char *config@, @size_t sz@ = config string
 689  *
 690  * Returns:     Zero on success, %$-1$% if timer failed.
 691  *
 692  * Use:         Select a timer from the table.  If the environment variable
 693  *              is set, then parse a comma-separated list of timer names and
 694  *              use the first one listed that seems to work; otherwise, try
 695  *              the timers in the table in order.
 696  */
 697
 698 static int select_timer(struct timer *t, unsigned tm,
 699                         const char *config, size_t sz)
 700 {
 701   const char *p, *l;
 702   const struct timer_ops *ops, *const *tt;
 703
 704   if (!config) {
 705     for (tt = timertab[tm].opstab; *tt; tt++)
 706       if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
 707   } else {
 708     l = config + sz;
 709     for (;;) {
 710       p = memchr(config, ',', l - config); if (!p) p = l;
 711       ops = find_timer(config, p - config, tm);
 712       if (ops && !try_timer(t, ops, tm)) return (0);
 713       if (p >= l) break;
 714       config = p + 1;
 715     }
 716   }
 717   debug("no suitable %s timer found", timertab[tm].what); return (-1);
 718 }
 719
 720 /* Bench timer operations. */
 721 static void timer_describe(struct bench_timer *tm, dstr *d)
 722 {
 723   struct timer *t = (struct timer *)tm;
 724   unsigned i;
 725
 726   dstr_puts(d, "builtin: ");
 727   for (i = 0; i < NTIMER; i++) {
 728     if (i) dstr_puts(d, ", ");
 729     dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
 730   }
 731 }
 732
 733 static int timer_now(struct bench_timer *tm,
 734                      struct bench_time *t_out, unsigned f)
 735 {
 736   struct timer *t = (struct timer *)tm;
 737   unsigned i;
 738
 739   t_out->f = 0;
 740   for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
 741   return (0);
 742 }
 743
 744 static void timer_diff(struct bench_timer *tm,
 745                        struct bench_timing *t_out,
 746                        const struct bench_time *t0,
 747                        const struct bench_time *t1)
 748 {
 749   struct timer *t = (struct timer *)tm;
 750   unsigned i;
 751
 752   t_out->f = 0;
 753   for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
 754 }
 755
 756 static void timer_destroy(struct bench_timer *tm)
 757 {
 758   struct timer *t = (struct timer *)tm;
 759   unsigned i;
 760
 761   if (!t) return;
 762   for (i = 0; i < NTIMER; i++)
 763     if (t->ops[i]) t->ops[i]->teardown(t);
 764   xfree(t);
 765 }
 766
 767 static const struct bench_timerops timer_ops =
 768   { timer_describe, timer_now, timer_diff, timer_destroy };
 769
 770 /* --- @bench_createtimer@ --- *
 771  *
 772  * Arguments:   @const char *config@ = timer configuration string
 773  *
 774  * Returns:     A freshly constructed standard timer object.
 775  *
 776  * Use:         Allocate a timer.  Dispose of it by calling
 777  *              @tm->ops->destroy(tm)@ when you're done.
 778  *
 779  *              Applications should not set configuration strings except as
 780  *              established by user action, e.g., from a command-line option,
 781  *              environment variable, or configuration file.
 782  */
 783
 784 struct bench_timer *bench_createtimer(const char *config)
 785 {
 786   struct timer *t = 0;
 787   struct bench_timer *ret = 0;
 788   struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
 789   const struct timer_ops *const *tt;
 790   const char *p, *l; size_t n, nn;
 791   unsigned i;
 792
 793   /* Parse the configuration string. */
 794   if (config) {
 795
 796     /* The first thing to do is find the end of the string. */
 797     l = config + strlen(config);
 798
 799     for (;;) {
 800       /* Process the whitespace-sparated words of the string one by one. */
 801
 802       /* Skip over any initial whitespace.  If we hit the end of the string
 803        * then we're done.
 804        */
 805       for (;;)
 806         if (config >= l) goto done_config;
 807         else if (!ISSPACE(*config)) break;
 808         else config++;
 809
 810       /* There's definitely a word here.  Find the end of it. */
 811       for (p = config; p < l && !ISSPACE(*p); p++);
 812       nn = p - config;
 813
 814       /* Try various simple keywords. */
 815 #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
 816
 817       if (MATCHP("list")) {
 818         /* The `list' keyword requests lists of the available timer
 819          * implementations.
 820          */
 821
 822         for (i = 0; i < NTIMER; i++) {
 823           printf("%s timers:", timertab[i].what);
 824           for (tt = timertab[i].opstab; *tt; tt++)
 825             if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name);
 826           putchar('\n');
 827         }
 828         goto next_config;
 829       }
 830
 831 #undef MATCHP
 832
 833       /* Otherwise it's an assignment, setting a subtimer list. */
 834       p = memchr(config, '=', nn);
 835       if (!p)
 836         n = nn;
 837       else {
 838         n = p - config;
 839         for (i = 0; i < NTIMER; i++)
 840           if (STRNCMP(config, ==, timertab[i].what, n) &&
 841               !timertab[i].what[n]) {
 842             if (tmconf[i].p)
 843               debug("duplicate %s timer list", timertab[i].what);
 844             tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
 845             goto next_config;
 846           }
 847       }
 848       debug("unrecognized config keyword `%.*s'", (int)n, config);
 849
 850       /* Move on to the next word. */
 851     next_config:
 852       config += nn;
 853     }
 854   done_config:;
 855   }
 856
 857   /* Override these settings from the environment. */
 858   for (i = 0; i < NTIMER; i++) {
 859     p = getenv(timertab[i].env);
 860     if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
 861   }
 862
 863   /* All seems well.  Allocate the timer object. */
 864   t = xmalloc(sizeof(*t));
 865   for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
 866
 867   /* Try to set up the subtimers. */
 868   for (i = NTIMER; i--; )
 869     if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
 870
 871   /* All is done. */
 872   t->_t.ops = &timer_ops; ret = &t->_t; t = 0;
 873 end:
 874   if (t) timer_destroy(&t->_t);
 875   return (ret);
 876 }
 877
 878 /*----- Benchmarking ------------------------------------------------------*/
 879
 880 /* --- @bench_init@ --- *
 881  *
 882  * Arguments:   @struct bench_state *b@ = bench state to initialize
 883  *              @struct bench_timer *tm@ = timer to attach, or null
 884  *
 885  * Returns:     Zero on success, %$-1$% on failure.
 886  *
 887  * Use:         Initialize the benchmark state.  On success, the timer state
 888  *              still needs to be calibrated (use @bench_calibrate@) before
 889  *              it can be used, but this will be done automatically by
 890  *              @bench_measure@ if it's not done by hand earlier.  The timer
 891  *              is now owned by the benchmark state and will be destroyed by
 892  *              @bench_destroy@.
 893  *
 894  *              The only reason for failure is if @tm@ was null on entry,
 895  *              and automatic construction of a timer failed.  The state is
 896  *              safe to discard, but calling @bench_destroy@ is safe too.
 897  */
 898
 899 int bench_init(struct bench_state *b, struct bench_timer *tm)
 900 {
 901   int rc;
 902
 903   b->tm = 0;
 904
 905   if (!tm) {
 906     tm = bench_createtimer(0);
 907     if (!tm) { rc = -1; goto end; }
 908   }
 909
 910   b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
 911 end:
 912   return (rc);
 913 }
 914
 915 /* --- @bench_destroy@ --- *
 916  *
 917  * Arguments:   @struct bench_state *b@ = bench state
 918  *
 919  * Returns:     ---
 920  *
 921  * Use:         Destroy the benchmark state, releasing the resources that it
 922  *              holds.
 923  */
 924
 925 void bench_destroy(struct bench_state *b)
 926   { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
 927
 928 /* --- @do_nothing@ --- *
 929  *
 930  * Arguments:   @unsigned long n@ = iteration count
 931  *              @void *ctx@ = context pointer (ignored)
 932  *
 933  * Returns:     ---
 934  *
 935  * Use:         Does nothing at all for @n@ iterations.  Used to calibrate
 936  *              the benchmarking state.
 937  */
 938
 939 static void do_nothing(unsigned long n, void *ctx)
 940   { while (n--) RELAX; }
 941
 942 /* --- @measure@ --- *
 943  *
 944  * Arguments:   @struct bench_state *b@ = bench state
 945  *              @struct bench_timing *delta_out@ = where to leave the timing
 946  *              @bench_fn *fn@ = function to measure
 947  *              @void *ctx@ = context for the function
 948  *              @double n@ = number of iterations
 949  *
 950  * Returns:     ---
 951  *
 952  * Use:         Run the function @n@ times, and report how long it took.
 953  *
 954  *              This function deals with retrying the measurements if the
 955  *              timer reports a temporary failure, and all of the
 956  *              difficulties if @n@ is too large to fit in a machine integer.
 957  */
 958
 959 static void measure(struct bench_state *b, struct bench_timing *delta_out,
 960                     bench_fn *fn, void *ctx, double n)
 961 {
 962   struct bench_timer *tm = b->tm;
 963   struct bench_time t0, t1;
 964   unsigned long n0, n1;
 965   double R = ULONG_MAX;
 966
 967   if (n <= R) {
 968     n0 = n;
 969     do {
 970       while (tm->ops->now(tm, &t0, BTF_T0));
 971       fn(n0, ctx);
 972     } while (tm->ops->now(tm, &t1, BTF_T1));
 973   } else {
 974     n1 = n/R; n0 = n - n1*R;
 975     do {
 976       while (tm->ops->now(tm, &t0, BTF_T0));
 977       while (n1--) fn(ULONG_MAX, ctx);
 978       fn(n0, ctx);
 979     } while (tm->ops->now(tm, &t1, BTF_T1));
 980   }
 981   tm->ops->diff(tm, delta_out, &t0, &t1);
 982 }
 983
 984 /* --- @bench_calibrate@ --- *
 985  *
 986  * Arguments:   @struct bench_state *b@ = bench state
 987  *
 988  * Returns:     Zero on success, %$-1$% if calibration failed.
 989  *
 990  * Use:         Calibrate the benchmark state, so that it can be used to
 991  *              measure performance reasonably accurately.
 992  */
 993
 994 #define T_CLB 0.0625                    /* calibration time limit */
 995
 996 int bench_calibrate(struct bench_state *b)
 997 {
 998   struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
 999   struct bench_timing delta;
1000   double n, r;
1001   bench_fn *fn = LAUNDER(&do_nothing);
1002   unsigned i, f = BTF_ANY;
1003   int rc;
1004
1005   /* The model here is that a timing loop has a fixed overhead as we enter
1006    * and leave (e.g., to do with the indirect branch into the code), and
1007    * per-iteration overheads as we check the counter and loop back.  We aim
1008    * to split these apart using linear regression.
1009    */
1010
1011   /* If we've already calibrated then there's nothing to do. */
1012   if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
1013
1014   /* Exercise the inner loop a few times to educate the branch predictor. */
1015   for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000);
1016
1017   /* Now we measure idle loops until they take sufficiently long -- or we run
1018    * out of counter.
1019    */
1020   debug("calibrating...");
1021   n = 1.0;
1022   for (;;) {
1023
1024     /* Measure @n@ iterations of the idle loop. */
1025     measure(b, &delta, fn, 0, n); f &= delta.f;
1026     if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
1027
1028     /* Register the timings with the regression machinery. */
1029     linreg_update(&lr_clk, n, delta.t);
1030     if (!(f&BTF_CYOK))
1031       debug("  n = %10.0f; t = %12g s", n, delta.t);
1032     else {
1033       linreg_update(&lr_cy, n, delta.cy);
1034       debug("  n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
1035     }
1036
1037     /* If we're done then stop. */
1038     if (delta.t >= T_CLB) break;
1039     if (n >= ULONG_MAX - n/3) break;
1040
1041     /* Update the counter and continue. */
1042     n += n/3.0 + 1.0;
1043   }
1044
1045   /* Now run the linear regression to extract the constant and per-iteration
1046    * overheads.
1047    */
1048   linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
1049   debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
1050   if (f&BTF_CYOK) {
1051     linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
1052     debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
1053   }
1054
1055   /* We're done. */
1056   rc = 0;
1057 end:
1058   b->f |= f | BTF_CLB;                  /* no point trying again */
1059   return (rc);
1060 }
1061
1062 /* --- @bench_measure@ --- *
1063  *
1064  * Arguments:   @struct bench_state *b@ = benchmark state
1065  *              @struct bench_timing *t_out@ = where to leave the timing
1066  *              @double base@ = number of internal units per call
1067  *              @bench_fn *fn@, @void *ctx@ = benchmark function to run
1068  *
1069  * Returns:     Zero on success, %$-1$% if timing failed.
1070  *
1071  * Use:         Measure a function.  The function @fn@ is called adaptively
1072  *              with an iteration count @n@ set so as to run for
1073  *              approximately @b->target_s@ seconds.
1074  *
1075  *              The result is left in @*t_out@, with @t_out->n@ counting the
1076  *              final product of the iteration count and @base@ (which might,
1077  *              e.g., reflect the number of inner iterations the function
1078  *              performs, or the number of bytes it processes per iteration).
1079  */
1080
1081 int bench_measure(struct bench_state *b, struct bench_timing *t_out,
1082                   double base, bench_fn *fn, void *ctx)
1083 {
1084   double n, nn;
1085
1086   /* Make sure the state is calibrated and usable. */
1087   if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
1088   if (!(b->f&BTF_TIMEOK)) return (-1);
1089
1090   /* Main adaptive measurement loop.
1091    *
1092    * Suppose the timer loop %$n$% iterations in %$t$% seconds.  Our ideal
1093    * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
1094    * Otherwise, we need to scale up the iteration count.  The obvious next
1095    * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
1096    * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
1097    * progress.  We know that %$T/t > \sqrt{2}%, so this can only happen when
1098    * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
1099    * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
1100    * again with %$n' = n + 1$% iterations will very likely work.
1101    */
1102   debug("measuring..."); n = 1.0;
1103   for (;;) {
1104     measure(b, t_out, fn, ctx, n); t_out->f &= b->f;
1105     if (!(t_out->f&BTF_TIMEOK)) return (-1);
1106     if (!(t_out->f&BTF_CYOK))
1107       debug("  n = %10.0f; t = %12g", n, t_out->t);
1108     else
1109       debug("  n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
1110
1111     if (t_out->t >= 0.707*b->target_s) break;
1112     nn = n*b->target_s/t_out->t;
1113     if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn;
1114     else n++;
1115   }
1116
1117   /* Adjust according to the calibration. */
1118   t_out->t -= n*b->clk.m + b->clk.c;
1119   if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;
1120
1121   /* Report the results, if debugging. */
1122   if (!(t_out->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_out->t);
1123   else debug("  adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
1124   if (!(t_out->f&BTF_CYOK))
1125     debug("  %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
1126   else
1127     debug("  %g s (%g cy) per op; %g ops/s",
1128           t_out->t/n, t_out->cy/n, n/t_out->t);
1129
1130   /* All done. */
1131   t_out->n = n*base; return (0);
1132 }
1133
1134 /*----- That's all, folks -------------------------------------------------*/