 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/
#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
# include <cpuid.h>
# define CPUID_1D_TSC (1u << 4)
# define CPUID_1xD_TSCP (1u << 27)
# define USE_X86_RDTSC 1
#endif

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
# include <sys/syscall.h>
# include <sys/types.h>
# include <unistd.h>
# include <linux/perf_event.h>
# ifdef HAVE_VALGRIND_VALGRIND_H
#  include <valgrind/valgrind.h>
# endif
# define USE_LINUX_PERFEVENT 1
# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
#  include <sys/mman.h>
#  define USE_LINUX_PERFEVRDPMC 1
# endif
#endif
/*----- Data structures ---------------------------------------------------*/

enum { CLK, CY, NTIMER };

struct timer {
  struct bench_timer _t;
  arena *a;
  const struct timer_ops *ops[NTIMER];	/* subtimers for clock and cycles */
  union {
#ifdef USE_X86_RDTSC
    unsigned tscaux;			/* `ia32_tsc_aux' for `rdtscp' */
#endif
#ifdef USE_LINUX_PERFEVENT
    int fd;				/* vanilla `perf_event_open' */
#endif
#ifdef USE_LINUX_PERFEVRDPMC
    struct {				/* `perf_event_open' with `rdpmc' */
      const volatile void *map; size_t sz; /* memory-mapped info */
      pid_t owner;			/* owning thread id */
    } pmc;
#endif
  } u_cy;				/* state for cycle measurement */
};

struct timer_ops {
  const char *name;			/* timer name */
  unsigned f;				/* flags */
/* ... @BTF_...OK@ flags */		/* expected results */
#define TF_SECRET 16u			/* don't try this automatically */
  int (*init)(struct timer */*t*/);	/* initialization function */
  int (*preflight)(struct timer */*t*/); /* preflight checks */
  int (*now)(struct timer */*t*/,	/* read current */
	     struct bench_time */*t_out*/, unsigned /*f*/);
  void (*diff)(struct timer */*t*/,	/* difference */
	       struct bench_timing */*t_inout*/,
	       const struct bench_time */*t0*/,
	       const struct bench_time */*t1*/);
  void (*teardown)(struct timer */*t*/); /* release held resources */
};
/*----- Preliminaries -----------------------------------------------------*/

#define NS_PER_S 1000000000

/* --- @debug@ --- *
 *
 * Arguments:	@const char *fmt@ = format control string
 *		@...@ = format arguments
 *
 * Returns:	---
 *
 * Use:		Maybe report a debugging message to standard error.
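 *
 *		The message is emitted only if the @MLIB_BENCH_DEBUG@
 *		environment variable is set to something other than `n' or
 *		`0'.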
 */

static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
{
  va_list ap;
  const char *p;

  p = getenv("MLIB_BENCH_DEBUG");
  if (p && *p != 'n' && *p != '0') {
    va_start(ap, fmt);
    fputs("mLib BENCH: ", stderr);
    vfprintf(stderr, fmt, ap);
    fputc('\n', stderr);
    va_end(ap);
  }
}
#ifdef HAVE_UINT64
# define FLOATK64(k) ((double)(k).i)
#else
# define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
#endif
/* --- @diff_ts@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@struct bench_timing *delta_inout@ = where to put the result
 *		@const struct bench_time *t0, *t1@ = two input times
 *
 * Returns:	---
 *
 * Use:		Calculates a time difference for timers using the
 *		@struct timespec@-like time format.
 */

static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
		    const struct bench_time *t0, const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;
  kludge64 delta_s;
  uint32 delta_ns;

  if (f&BTF_TIMEOK) {

    /* Calculate the integer differences in seconds and nanoseconds
     * independently.  To avoid underflow, though, add a second's worth of
     * nanoseconds which we'll subtract off later.
     */
    SUB64(delta_s, t1->t.ts.s, t0->t.ts.s);
    delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns;

    /* Hack if they're both equal. */
    if (ZERO64(delta_s) && !delta_ns) delta_ns = 1;

    /* And apply the nanoseconds difference.  To prevent underflow, pre-
     * emptively borrow one from the integer difference.
     */
    delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S;

    delta_inout->f |= BTF_TIMEOK;
  }
}
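
/* A worked example of the borrow trick above (illustrative only): if @*t0@
 * is 3 s + 999999999 ns and @*t1@ is 5 s + 1 ns, then @delta_s@ is 2 and
 * @delta_ns@ is 1 + 1000000000 - 999999999 = 2, so the result is
 * 2 - 1 + 2e-9 = 1.000000002 s, as expected, without the nanoseconds
 * difference ever going negative.
 */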

/* --- @diff_cycles@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@struct bench_timing *delta_inout@ = where to put the result
 *		@const struct bench_time *t0, *t1@ = two input times
 *
 * Returns:	---
 *
 * Use:		Calculates a time difference for cycle-counting timers.
 */

static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
			const struct bench_time *t0,
			const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;
  kludge64 delta_cy;

  if (f&BTF_CYOK) {
    SUB64(delta_cy, t1->cy, t0->cy); delta_inout->cy = FLOATK64(delta_cy);
    if (!delta_inout->cy) delta_inout->cy = 1;
    delta_inout->f |= BTF_CYOK;
  }
}

/* --- @normalize@ --- *
 *
 * Arguments:	@double *x_inout@ = address of a value to normalize
 *		@const char **unit_out@ = address to store unit prefix
 *		@double scale@ = scale factor for unit steps
 *
 * Returns:	---
 *
 * Use:		Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@
 *		so that printing the two reflects the original value with an
 *		appropriate SI unit scaling.  The @scale@ should be 1024 for
 *		binary quantities, most notably memory sizes, or 1000 for
 *		other quantities.
 */

static void normalize(double *x_inout, const char **unit_out, double scale)
{
  static const char
    *const nothing[] = { "", 0 },
    *const big[] = { "k", "M", "G", "T", "P", "E", 0 },
    *const little[] = { "m", "µ", "n", "p", "f", "a", 0 };
  const char *const *u;
  double x = *x_inout, s;

  if (x >= 0) s = +1.0;
  else { x = -x; s = -1.0; }

  if (x < 1)
    for (u = little, x *= scale; x < 1 && u[1]; u++, x *= scale);
  else if (x >= scale)
    for (u = big, x /= scale; x >= scale && u[1]; u++, x /= scale);
  else
    u = nothing;

  *x_inout = s*x; *unit_out = *u;
}
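
/* For example (illustrative only): after
 *
 *	double x = 1234567.0; const char *u;
 *	normalize(&x, &u, 1000);
 *
 * we have @x@ = 1.234567 and @u@ = "M", suitable for printing as
 * `1.235 MB' if the unit is bytes.
 */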

/*----- The null timer ----------------------------------------------------*/

/* This is a timer which does nothing, in case we don't have any better
 * ideas.
 */

static int null_init(struct timer *t) { return (0); }
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
  { return (0); }
static int null_preflight(struct timer *t) { return (0); }
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
		      const struct bench_time *t0,
		      const struct bench_time *t1)
  { ; }
static void null_teardown(struct timer *t) { ; }

static const struct timer_ops null_ops =
  { "null", 0,
    null_init, null_preflight, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,

/*----- The broken clock --------------------------------------------------*/

/* This is a cycle counter which does nothing, in case we don't have any
 * better ideas.
 */

static int broken_init(struct timer *t) { return (-1); }

static const struct timer_ops broken_ops =
  { "broken", TF_SECRET,
    broken_init, null_preflight, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,

/*----- Linux performance counters ----------------------------------------*/

/* This is a cycle counter which uses the Linux performance event system,
 * which is probably the best choice if it's available.
 */

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)

/* --- @perfevent_open@ --- *
 *
 * Arguments:	---
 *
 * Returns:	File descriptor, or %$-1$%.
 *
 * Use:		Open a performance measurement descriptor set up to count CPU
 *		cycles.
 */

static int perfevent_open(void)
{
  struct perf_event_attr attr = { 0 };
  int fd;

  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof(attr);
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.exclude_kernel = 1;
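
  /* Syscall arguments: measure the calling thread (pid 0) on any CPU
   * (cpu -1), with no group leader (group_fd -1) and no flags.
   */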
  fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
  if (fd < 0) {
    debug("couldn't open perf event: %s", strerror(errno));
    return (-1);
  }
  return (fd);
}

static int perfevent_now(struct timer *t,
			 struct bench_time *t_out, unsigned f)
{
  ssize_t n;

  n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
  if (n != sizeof(t_out->cy.i)) {
    debug("failed to read perf-event counter: %s", strerror(errno));
    return (0);
  }
  t_out->f |= BTF_CYOK; return (0);
}

static void perfevent_teardown(struct timer *t)
  { close(t->u_cy.fd); }

static int perfevent_init(struct timer *t)
{
  int fd = -1, rc;

  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
  t->u_cy.fd = fd; fd = -1; rc = 0;
end:
  if (fd != -1) close(fd);
  return (rc);
}

static const struct timer_ops perfevent_ops =
  { "linux-perf-read-hw-cycles", BTF_CYOK,
    perfevent_init, null_preflight, perfevent_now,
    diff_cycles, perfevent_teardown };
#define PERFEVENT_VANILLA_CYENT &perfevent_ops,

# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

/* Special syscall-free version for x86 using `rdpmc' instruction. *
 *
 * This is a bit weird because it does both kinds of measurement in a single
 * operation.
 */

static int perfevrdpmc_now(struct timer *t,
			   struct bench_time *t_out, unsigned f)
{
  const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
  unsigned long long tsc = tsc, toff = toff, tenb = tenb;
  unsigned long long cy = cy, cyoff = cyoff;
  unsigned long long m, hi, lo;
  unsigned tshift = tshift, tmult = tmult, q0, q1, ff;

  /* Repeat until we can complete this job without the buffer changing in the
   * middle.
   */
  q0 = map->lock;
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
  for (;;) {
    ff = 0;

    /* Read the passage-of-time information. */
    if (map->cap_user_time) {
      ff |= BTF_TIMEOK;
      tenb = map->time_enabled;
      tsc = __builtin_ia32_rdtsc();
      tshift = map->time_shift;
      tmult = map->time_mult;
      toff = map->time_offset;
    }

    /* Read the performance-counter information. */
    if (map->cap_user_rdpmc) {
      ff |= BTF_CYOK;
      cy = __builtin_ia32_rdpmc(map->index - 1);
      cyoff = map->offset;
    }

    /* Check the sequence number again. */
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    q1 = map->lock;
    if (q0 == q1) break;
    q0 = q1;
  }

  if (ff&BTF_TIMEOK) {

    /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
     * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
     * nanoseconds.
     */
    m = (1ull << tshift) - 1;
    hi = tsc >> tshift; lo = tsc&m;
    t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
    t_out->f |= BTF_TIMEOK;
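
    /* (Aside: the split computation above evaluates @tsc*tmult >> tshift@
     * exactly -- with @hi = tsc >> tshift@ and @lo = tsc&m@ we have
     * @hi*tmult + (lo*tmult >> tshift)@ equal to
     * @((hi << tshift) + lo)*tmult >> tshift@ -- while avoiding the full
     * 64-bit product @tsc*tmult@, which could overflow.)
     */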
  }

  if (ff&BTF_CYOK) {
    /* We have the cycle count. */
    t_out->cy.i = cy + cyoff;
    t_out->f |= BTF_CYOK;
  }

  return (0);
}

static void perfevrdpmc_diff(struct timer *t,
			     struct bench_timing *delta_inout,
			     const struct bench_time *t0,
			     const struct bench_time *t1)
{
  unsigned long long delta_ns;
  unsigned f = t0->f&t1->f;

  if (f&BTF_TIMEOK) {
    delta_ns = t1->t.rawns.i - t0->t.rawns.i; if (!delta_ns) delta_ns = 1;
    delta_inout->t = delta_ns/(double)NS_PER_S;
    delta_inout->f |= BTF_TIMEOK;
  }

  if (f&BTF_CYOK) {
    delta_inout->cy = t1->cy.i - t0->cy.i;
    if (!delta_inout->cy) delta_inout->cy = 1;
    delta_inout->f |= BTF_CYOK;
  }
}

static void perfevrdpmc_unmap
  (const volatile struct perf_event_mmap_page *map, size_t mapsz)
  { if (map) munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz); }

static void perfevrdpmc_teardown(struct timer *t)
  { perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); }

static int perfevrdpmc_setup(struct timer *t)
{
  const volatile struct perf_event_mmap_page *map = 0;
  int pgsz, mapsz = 0, fd = -1, rc;

  /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
   * big a page is.
   */
  pgsz = sysconf(_SC_PAGESIZE);
  if (pgsz < 0) {
    debug("failed to discover page size!: %s", strerror(errno));
    rc = -1; goto end;
  }

  /* Open the measurement descriptor and map it. */
  fd = perfevent_open(); if (fd < 0) return (-1);
  mapsz = 2*pgsz;
  map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
  if (map == MAP_FAILED) {
    debug("failed to map perf event: %s", strerror(errno));
    map = 0; rc = -1; goto end;
  }

  t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0;
  t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0;
end:
  if (fd != -1) close(fd);
  perfevrdpmc_unmap(map, mapsz);
  return (rc);
}

static int perfevrdpmc_preflight(struct timer *t)
{
  if (!t->u_cy.pmc.map) { debug("retry perf event map setup"); goto reopen; }
  if (t->u_cy.pmc.owner != syscall(SYS_gettid)) {
    debug("pid changed: reopen perf event map");
    perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
    t->u_cy.pmc.map = 0; goto reopen;
  }
  return (0);

reopen:
  if (perfevrdpmc_setup(t)) return (-1);
  return (0);
}

static int perfevrdpmc_cyinit(struct timer *t)
{
  unsigned a, b, c, d;

# ifdef HAVE_VALGRIND_VALGRIND_H
  /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
  if (RUNNING_ON_VALGRIND) return (-1);
# endif

  /* We need `rdtsc' to do the passage-of-time measurement. */
  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
    { debug("no `rdtsc' instruction"); return (-1); }

  if (perfevrdpmc_setup(t)) return (-1);
  return (0);
}

static const struct timer_ops perfevrdpmc_cyops =
  { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK,
    perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now,
    perfevrdpmc_diff, perfevrdpmc_teardown };

static int perfevrdpmc_clkinit(struct timer *t)
{
  if (t->ops[CY] != &perfevrdpmc_cyops) {
    debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
    return (-1);
  }
  return (0);
}

static const struct timer_ops perfevrdpmc_clkops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_clkinit, null_preflight, null_now,
    null_diff, null_teardown };

#  define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
#  define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
# else
#  define PERFEVENT_RDPMC_CLKENT
#  define PERFEVENT_RDPMC_CYENT
# endif

# define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
# define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
#else
# define PERFEVENT_CLKENT
# define PERFEVENT_CYENT
#endif

/*----- Intel time-stamp counter ------------------------------------------*/

/* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 * really suitable for performance measurement because it gets confused by
 * CPU frequency adjustments.
 */

#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

static int x86rdtsc_now(struct timer *t,
			struct bench_time *t_out, unsigned f)
  { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }

static int x86rdtsc_init(struct timer *t)
{
  unsigned a, b, c, d;

  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
    { debug("no `rdtsc' instruction"); return (-1); }
  t->u_cy.tscaux = ~0u;
  return (0);
}

static int x86rdtscp_now(struct timer *t,
			 struct bench_time *t_out, unsigned f)
{
  unsigned tscaux;
  unsigned long long n;

  n = __builtin_ia32_rdtscp(&tscaux);
  if (t->u_cy.tscaux == ~0u)
    t->u_cy.tscaux = tscaux;
  else if (t->u_cy.tscaux != tscaux) {
    debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
	  tscaux, t->u_cy.tscaux);
    return (0);
  }
  t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
}

static int x86rdtscp_init(struct timer *t)
{
  unsigned a, b, c, d;

  if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
    { debug("no `rdtscp' instruction"); return (-1); }
  t->u_cy.tscaux = ~0u;
  return (0);
}

static const struct timer_ops x86rdtsc_ops =
  { "x86-rdtsc", BTF_CYOK,
    x86rdtsc_init, null_preflight, x86rdtsc_now,
    diff_cycles, null_teardown };
static const struct timer_ops x86rdtscp_ops =
  { "x86-rdtscp", BTF_CYOK,
    x86rdtscp_init, null_preflight,
    x86rdtscp_now, diff_cycles, null_teardown };

# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
#else
# define X86RDTSC_CYENT
#endif

/*----- POSIX `clock_gettime' ---------------------------------------------*/

/* This is a real-time clock based on the POSIX time interface, with up to
 * nanosecond precision.
 */

#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)

static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  struct timespec now;

  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
    { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
  ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
  t_out->f |= BTF_TIMEOK; return (0);
}

static const struct timer_ops gettime_ops =
  { "posix-thread-cputime", BTF_TIMEOK,
    null_init, null_preflight, gettime_now, diff_ts, null_teardown };

# define GETTIME_CLKENT &gettime_ops,
#else
# define GETTIME_CLKENT
#endif

/*----- Standard C `clock' ------------------------------------------------*/

/* This is a real-time clock based on the C `clock' function which is
 * guaranteed to be available, though it's not likely to be very good.
 */

static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  clock_t now;

  now = clock();
  if (now == (clock_t)-1) {
    debug("error reading standard clock: %s", strerror(errno));
    return (0);
  }
  t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
}

static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
		       const struct bench_time *t0,
		       const struct bench_time *t1)
{
  clock_t delta_clk;
  unsigned f = t0->f&t1->f;

  if (f&BTF_TIMEOK) {
    delta_clk = t1->t.clk - t0->t.clk; if (!delta_clk) delta_clk = 1;
    delta_inout->t = delta_clk/(double)CLOCKS_PER_SEC;
    delta_inout->f |= BTF_TIMEOK;
  }
}

static const struct timer_ops clock_ops =
  { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now,
    clock_diff, null_teardown };

#define CLOCK_CLKENT &clock_ops,

/*----- Timing setup ------------------------------------------------------*/

/* Tables of timing sources. */
static const struct timer_ops
  *const clktab[] = { PERFEVENT_CLKENT
		      GETTIME_CLKENT
		      CLOCK_CLKENT
		      BROKEN_ENT
		      0 },
  *const cytab[] = { PERFEVENT_CYENT
		     X86RDTSC_CYENT
		     NULL_ENT
		     BROKEN_ENT
		     0 };

static const struct timertab {
  const char *what;
  const char *env;
  const struct timer_ops *const *opstab;
} timertab[] = {
  { "clock",	"MLIB_BENCH_CLKTIMER",		clktab },
  { "cycle",	"MLIB_BENCH_CYCLETIMER",	cytab }
};

/* --- @find_timer@ --- *
 *
 * Arguments:	@const char *name@ = timer name
 *		@size_t sz@ = length of name
 *		@unsigned tm@ = which subtimer we're looking for
 *
 * Returns:	The table entry matching the given name, or null if there
 *		isn't one.
 */

static const struct timer_ops *find_timer(const char *name, size_t sz,
					  unsigned tm)
{
  const struct timer_ops *const *tt;

  for (tt = timertab[tm].opstab; *tt; tt++) {
    if (strlen((*tt)->name) == sz &&
	MEMCMP(name, ==, (*tt)->name, sz))
      return (*tt);
  }
  debug("%s timer `%.*s' not found",
	timertab[tm].what, (int)sz, name); return (0);
}

/* --- @try_timer@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@const struct timer_ops *ops@ = timer ops
 *		@unsigned tm@ = which subtimer we're setting
 *
 * Returns:	Zero on success, %$-1$% if timer failed.
 *
 * Use:		Tries to initialize the timer @t@, reporting a debug message
 *		if it fails.
 */

static int try_timer(struct timer *t,
		     const struct timer_ops *ops, unsigned tm)
{
  struct bench_time t0, t1;
  struct bench_timing delta;
  int rc;
  unsigned f = 0;
#define f_teardown 1u

  if (ops->init(t)) { rc = -1; goto end; }
  f |= f_teardown;

  if (ops->preflight(t)) { rc = -1; goto end; }
  do {
    while (ops->now(t, &t0, BTF_T0));
  } while (ops->now(t, &t1, BTF_T1));
  delta.f = 0; ops->diff(t, &delta, &t0, &t1);
  if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; }

  debug("selected %s timer `%s'", timertab[tm].what, ops->name);
  t->ops[tm] = ops; f &= ~f_teardown; rc = 0;

end:
  if (f&f_teardown) ops->teardown(t);
  return (rc);

#undef f_teardown
}

/* --- @select_timer@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@unsigned tm@ = which subtimer we're setting
 *		@const char *config@, @size_t sz@ = config string
 *
 * Returns:	Zero on success, %$-1$% if timer failed.
 *
 * Use:		Select a timer from the table.  If the environment variable
 *		is set, then parse a comma-separated list of timer names and
 *		use the first one listed that seems to work; otherwise, try
 *		the timers in the table in order.
 */

static int select_timer(struct timer *t, unsigned tm,
			const char *config, size_t sz)
{
  const char *p, *l = config + sz;
  const struct timer_ops *ops, *const *tt;

  if (!config) {
    for (tt = timertab[tm].opstab; *tt; tt++)
      if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
  } else {
    for (;;) {
      p = memchr(config, ',', l - config); if (!p) p = l;
      ops = find_timer(config, p - config, tm);
      if (ops && !try_timer(t, ops, tm)) return (0);
      if (p >= l) break;
      config = p + 1;
    }
  }
  debug("no suitable %s timer found", timertab[tm].what); return (-1);
}
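
/* For example (illustrative only), running a program under
 *
 *	MLIB_BENCH_CYCLETIMER=x86-rdtscp,x86-rdtsc mybench
 *
 * (where @mybench@ stands for your program) asks for the `rdtscp'-based
 * cycle counter, falling back to plain `rdtsc' if that doesn't work,
 * instead of whatever would have been selected automatically.
 */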

/* Bench timer operations. */

static void timer_describe(struct bench_timer *tm, dstr *d)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  dstr_puts(d, "builtin: ");
  for (i = 0; i < NTIMER; i++) {
    if (i) dstr_puts(d, ", ");
    dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
  }
}

static int timer_preflight(struct bench_timer *tm)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  for (i = 0; i < NTIMER; i++) if (t->ops[i]->preflight(t)) return (-1);
  return (0);
}

static int timer_now(struct bench_timer *tm,
		     struct bench_time *t_out, unsigned f)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
  return (0);
}

static void timer_diff(struct bench_timer *tm,
		       struct bench_timing *t_out,
		       const struct bench_time *t0,
		       const struct bench_time *t1)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
}

static void timer_destroy(struct bench_timer *tm)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  for (i = 0; i < NTIMER; i++)
    if (t->ops[i]) t->ops[i]->teardown(t);
}

static const struct bench_timerops timer_ops =
  { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy };

/* --- @bench_createtimer@ --- *
 *
 * Arguments:	@const char *config@ = timer configuration string
 *
 * Returns:	A freshly constructed standard timer object.
 *
 * Use:		Allocate a timer.  Dispose of it by calling
 *		@tm->ops->destroy(tm)@ when you're done.
 *
 *		Applications should not set configuration strings except as
 *		established by user action, e.g., from a command-line option,
 *		environment variable, or configuration file.
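 *
 *		As an illustration (not an exhaustive description), a
 *		configuration string such as
 *		`clock=posix-thread-cputime cycle=x86-rdtscp,x86-rdtsc'
 *		lists the subtimers to try, in order, for each of the
 *		`clock' and `cycle' slots, and the single keyword `list'
 *		prints the available timer names to standard output.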
 */

struct bench_timer *bench_createtimer(const char *config)
{
  struct timer *t = 0;
  struct bench_timer *ret = 0;
  struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
  const struct timer_ops *const *tt;
  const char *p, *l; size_t n, nn;
  unsigned i;

  /* Parse the configuration string. */
  if (config) {

    /* The first thing to do is find the end of the string. */
    l = config + strlen(config);

    /* Process the whitespace-separated words of the string one by one. */
    for (;;) {

      /* Skip over any initial whitespace.  If we hit the end of the string
       * then we're done.
       */
      for (;;)
	if (config >= l) goto done_config;
	else if (!ISSPACE(*config)) break;
	else config++;

      /* There's definitely a word here.  Find the end of it. */
      for (p = config; p < l && !ISSPACE(*p); p++);
      nn = p - config;

      /* Try various simple keywords. */
#define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))

      if (MATCHP("list")) {
	/* The `list' keyword requests lists of the available timer
	 * implementations.
	 */
	for (i = 0; i < NTIMER; i++) {
	  printf("%s timers:", timertab[i].what);
	  for (tt = timertab[i].opstab; *tt; tt++)
	    if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
	  putchar('\n');
	}
	goto next_word;
      }

      /* Otherwise it's an assignment, setting a subtimer list. */
      p = memchr(config, '=', nn);
      if (p) {
	n = p - config;
	for (i = 0; i < NTIMER; i++)
	  if (STRNCMP(config, ==, timertab[i].what, n) &&
	      !timertab[i].what[n]) {
	    if (tmconf[i].p)
	      debug("duplicate %s timer list", timertab[i].what);
	    tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
	    goto next_word;
	  }
      } else
	n = nn;
      debug("unrecognized config keyword `%.*s'", (int)n, config);

      /* Move on to the next word. */
    next_word:
      config += nn;
    }
  }
done_config:
  /* Override these settings from the environment. */
  for (i = 0; i < NTIMER; i++) {
    p = getenv(timertab[i].env);
    if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
  }

  /* All seems well.  Allocate the timer object. */
  XNEW(t); t->a = arena_global;
  for (i = 0; i < NTIMER; i++) t->ops[i] = 0;

  /* Try to set up the subtimers. */
  for (i = NTIMER; i--; )
    if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;

  /* Done. */
  t->_t.ops = &timer_ops; t->_t.ref = 1; ret = &t->_t; t = 0;

end:
  if (t) timer_destroy(&t->_t);
  return (ret);
}

/*----- Benchmarking ------------------------------------------------------*/

/* --- @bench_init@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state to initialize
 *		@struct bench_timer *tm@ = timer to attach, or null
 *
 * Returns:	Zero on success, %$-1$% on failure.
 *
 * Use:		Initialize the benchmark state.  On success, the timer state
 *		still needs to be calibrated (use @bench_calibrate@) before
 *		it can be used, but this will be done automatically by
 *		@bench_measure@ if it's not done by hand earlier.  The timer
 *		is now owned by the benchmark state and will be destroyed by
 *		@bench_destroy@.
 *
 *		The only reason for failure is if @tm@ was null on entry,
 *		and automatic construction of a timer failed.  The state is
 *		safe to discard, but calling @bench_destroy@ is safe too.
 */

int bench_init(struct bench_state *b, struct bench_timer *tm)
{
  int rc;

  b->tm = 0;
  if (!tm) {
    tm = bench_createtimer(0);
    if (!tm) { rc = -1; goto end; }
  }

  b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
end:
  return (rc);
}

/* --- @bench_destroy@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state
 *
 * Returns:	---
 *
 * Use:		Destroy the benchmark state, releasing the resources that it
 *		holds.
 */

void bench_destroy(struct bench_state *b)
  { if (b->tm && !--b->tm->ref) { b->tm->ops->destroy(b->tm); b->tm = 0; } }

/* --- @spin@ --- *
 *
 * Arguments:	@unsigned long n@ = iteration count
 *		@void *ctx@ = context pointer (ignored)
 *
 * Returns:	---
 *
 * Use:		Does nothing at all for @n@ iterations.  Used to calibrate
 *		the benchmarking state.
 */

static void spin(unsigned long n, void *ctx)
  { while (n--) RELAX; }

/* --- @bench_calibrate@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state
 *		@unsigned f@ = calibration flags
 *
 * Returns:	Zero on success, %$-1$% if calibration failed.
 *
 * Use:		Calibrate the benchmark state, so that it can be used to
 *		measure performance reasonably accurately.
 *
 *		Calibration will take into account how the subject code is
 *		going to be invoked.  If you're going to use @BENCH_MEASURE@
 *		to measure a piece of literal code, then leave @f@ zero.  If
 *		the code to be measured is going to be executed via an
 *		indirect branch, e.g., through the @bench_measure@ function,
 *		then set @BTF_INDIRECT@.
 */

#define T_CLB 0.0625			/* calibration time limit */

int bench_calibrate(struct bench_state *b, unsigned f)
{
  struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
  struct bench_timer *tm = b->tm;
  struct bench_timing delta;
  double n, r;
  unsigned i, tf = BTF_ANY;
  BENCH_TIMELOOP_DECLS;
  int rc;

  /* The model here is that a timing loop has a fixed overhead as we enter
   * and leave (e.g., to do with the indirect branch into the code), and
   * per-iteration overheads as we check the counter and loop back.  We aim
   * to split these apart using linear regression.
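   *
   * Concretely, if a calibration run of %$n$% iterations takes %$t$%
   * seconds, we fit %$t \approx m n + c$%; the same slope and intercept
   * are later subtracted from real measurements by @bench_adjust@.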
   */

  /* If we've already calibrated then there's nothing to do. */
  if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);

  /* Run the timer preflight check. */
  if (tm->ops->preflight(tm)) { rc = -1; goto end; }

  /* Exercise the inner loop a few times to educate the branch predictor.
   * This is only useful if we're executing via an indirect call.
   */
  if (f&BTF_INDIRECT) {
    for (i = 0; i < 50; i++)
      BENCH_TIMELOOP_TAG(setup, b->tm, &delta, 10000, ;)
	LAUNDER(&spin)(_bench_n, 0);
  }

  /* Now we measure idle loops until they take sufficiently long -- or we run
   * out of iterations.
   */
  debug("calibrating...");
  n = 1.0;
  for (;;) {

    /* Measure @n@ iterations of the idle loop. */
    if (f&BTF_INDIRECT)
      BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
	LAUNDER(&spin)(_bench_n, 0);
    else
      BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
	while (_bench_n--) RELAX;
    tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; }

    /* Register the timings with the regression machinery. */
    linreg_update(&lr_clk, n, delta.t);
    if (!(tf&BTF_CYOK))
      debug(" n = %10.0f; t = %12g s", n, delta.t);
    else {
      linreg_update(&lr_cy, n, delta.cy);
      debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
    }

    /* If we're done then stop. */
    if (delta.t >= T_CLB) break;
    if (n >= ULONG_MAX - n/3) break;

    /* Update the counter and continue. */
    n += n/3 + 1;
  }

  /* Now run the linear regression to extract the constant and per-iteration
   * overheads.
   */
  linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
  debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
  if (tf&BTF_CYOK) {
    linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
    debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
  }
  rc = 0;

end:
  b->f |= tf | BTF_CLB;			/* no point trying again */
  return (rc);
}

/* --- @bench_preflight@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *
 * Returns:	Zero on success, %$-1$% on failure.
 *
 * Use:		Prepares for benchmarking on the current thread.  Current
 *		checks are that the timer is calibrated and that it can
 *		successfully measure time; the timer preflight is also run.
 *
 *		Users are unlikely to find this function useful: it's called
 *		automatically by the @BENCH_MEASURE@ macro and the
 *		@bench_measure@ function.
 */

int bench_preflight(struct bench_state *b)
{
  struct bench_timer *tm = b->tm;

  if (!(b->f&BTF_CLB)) return (-1);
  if (!(b->f&BTF_TIMEOK)) return (-1);
  if (tm->ops->preflight(tm)) return (-1);
  debug("measuring...");
  return (0);
}

/* --- @bench_adapt@ --- *
 *
 * Arguments:	@double *n_inout@ = number of iterations, updated
 *		@double target_s@ = target time in seconds
 *		@const struct bench_timing *t@ = timing from the previous run
 *
 * Returns:	Nonzero if the measurement is sufficient; zero to run again.
 *
 * Use:		This function determines a suitable number of iterations of a
 *		benchmark function to perform next.  It is used in a loop
 *		such as the following.
 *
 *		@struct bench_timing t;@
 *
 *		@do {@
 *		  (run @n@ iterations; set @t@ to the timing)
 *		@} while (!bench_adapt(&n, b->target_s, &t));@
 *
 *		On entry, @*n_inout@ should be the number of iterations
 *		performed by the previous pass, and @*t@ the resulting time;
 *		the @BTF_TIMEOK@ flag must be set in @t->f@.  If the timing
 *		is sufficient -- @t->t@ is sufficiently close to @target_s@
 *		-- then the function returns nonzero to indicate that
 *		measurement is complete.  Otherwise, it sets @*n_inout@ to a
 *		new, larger iteration count and returns zero to indicate that
 *		a further pass is necessary.
 */

int bench_adapt(double *n_inout, double target_s,
		const struct bench_timing *t)
{
  double n = *n_inout, nn;

  /* Dump the results for debugging. */
  if (!(t->f&BTF_CYOK)) debug(" n = %10.0f; t = %12g", n, t->t);
  else debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy);

  /* Suppose the timed loop ran %$n$% iterations in %$t$% seconds.  Our ideal
   * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
   * Otherwise, we need to scale up the iteration count.  The obvious next
   * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
   * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
   * progress.  We know that %$T/t > \sqrt{2}$%, so this can only happen when
   * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
   * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
   * again with %$n' = n + 1$% iterations will very likely work.
   */
  if (t->t >= 0.707*target_s) return (1);
  nn = n*target_s/t->t; modf(nn, &nn);
  *n_inout = nn > n ? nn : n + 1;
  return (0);
}
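
/* For instance (illustrative only): with a one-second target, if a run of
 * 100 iterations took 0.01 s, the next pass will use %$100 \cdot 1/0.01 =
 * 10000$% iterations; once a pass takes at least %$1/\sqrt{2} \approx
 * 0.707$% s, @bench_adapt@ reports that the measurement is sufficient.
 */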

/* --- @bench_adjust@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *		@struct bench_timing *t_inout@ = timing to adjust
 *		@double n@ = number of external iterations performed
 *		@double base@ = number of internal operations per external
 *			iteration
 *
 * Returns:	---
 *
 * Use:		Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
 *		according to the calibration data captured in @b@.
 *		On exit, the timing data is updated, and @t->n@ is set to the
 *		product @n base@.
 */

void bench_adjust(struct bench_state *b,
		  struct bench_timing *t_inout, double n, double base)
{
  /* Adjust according to the calibration. */
  t_inout->t -= n*b->clk.m + b->clk.c;
  if (t_inout->f&BTF_CYOK) t_inout->cy -= n*b->cy.m + b->cy.c;

  /* Report the results, if debugging. */
  if (!(t_inout->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_inout->t);
  else debug(" adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy);
  if (!(t_inout->f&BTF_CYOK))
    debug(" %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t);
  else
    debug(" %g s (%g cy) per iter; %g iters/s",
	  t_inout->t/n, t_inout->cy/n, n/t_inout->t);

  t_inout->n = n*base;
}

/* --- @bench_measure@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *		@struct bench_timing *t_out@ = where to leave the timing
 *		@double base@ = number of internal units per call
 *		@bench_fn *fn@, @void *ctx@ = benchmark function to run
 *
 * Returns:	Zero on success, %$-1$% if timing failed.
 *
 * Use:		Measure a function.  The function @fn@ is called adaptively
 *		with an iteration count @n@ set so as to run for
 *		approximately @b->target_s@ seconds.
 *
 *		The result is left in @*t_out@, with @t_out->n@ counting the
 *		final product of the iteration count and @base@ (which might,
 *		e.g., reflect the number of inner iterations the function
 *		performs, or the number of bytes it processes per iteration).
 *
 *		To get useful results, the benchmark state should have been
 *		calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
 */

int bench_measure(struct bench_state *b, struct bench_timing *t_out,
		  double base, bench_fn *fn, void *ctx)
{
  BENCH_MEASURE_DECLS;
  int rc;

  BENCH_MEASURE(b, rc, t_out, base) fn(_bench_n, ctx);
  return (rc);
}
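
/* A sketch of typical use (illustrative only; @do_something_interesting@
 * stands for whatever is being measured):
 *
 *	static void run(unsigned long n, void *ctx)
 *	  { while (n--) do_something_interesting(); }
 *
 *	struct bench_state b;
 *	struct bench_timing t;
 *	if (!bench_init(&b, 0) && !bench_measure(&b, &t, 1, run, 0))
 *	  printf("%g s/op\n", t.t/t.n);
 *	bench_destroy(&b);
 */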

/*----- Reporting ---------------------------------------------------------*/

/* --- @bench_report@ --- *
 *
 * Arguments:	@const struct gprintf_ops *gops, void *go@ = output formatter
 *		@unsigned unit@ = unit processed by the benchmark function
 *		@const struct bench_timing *t@ = benchmark result
 *
 * Returns:	---
 *
 * Use:		Format, to the output identified by @gops@ and @go@, a
 *		human-readable report of the benchmarking result @t@.  No
 *		newline is appended.
 *
 *		The output format is subject to change in later versions.
 */

void bench_report(const struct gprintf_ops *gops, void *go,
		  unsigned unit, const struct bench_timing *t)
{
  double scale, x, n = t->n;
  const char *u, *what, *whats;

  assert(t->f&BTF_TIMEOK);

  switch (unit) {
    case BTU_OP:
      gprintf(gops, go, "%.0f iterations ", n);
      what = "op"; whats = "ops"; scale = 1000;
      break;
    case BTU_BYTE:
      x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u);
      what = whats = "B"; scale = 1024;
      break;
  }

  x = t->t; normalize(&x, &u, 1000);
  gprintf(gops, go, "in %.3f %ss", x, u);
  if (t->f&BTF_CYOK) {
    x = t->cy; normalize(&x, &u, 1000);
    gprintf(gops, go, " (%.3f %scy)", x, u);
  }
  gprintf(gops, go, ": ");

  x = n/t->t; normalize(&x, &u, scale);
  gprintf(gops, go, "%.3f %s%s/s", x, u, whats);
  x = t->t/n; normalize(&x, &u, 1000);
  gprintf(gops, go, ", %.3f %ss/%s", x, u, what);
  if (t->f&BTF_CYOK) {
    x = t->cy/n; normalize(&x, &u, 1000);
    gprintf(gops, go, " (%.3f %scy/%s)", x, u, what);
  }
}
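
/* For the operation-count unit, the output looks something like this
 * (illustrative only):
 *
 *	1000000 iterations in 1.234 s: 810.373 kops/s, 1.234 µs/op
 *
 * with an additional cycle figure in parentheses if a cycle counter was
 * available.
 */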

/*----- That's all, folks -------------------------------------------------*/