#include <ctype.h>
#include <errno.h>
+#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include "linreg.h"
#include "macros.h"
+#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+# include <cpuid.h>
+# define CPUID_1D_TSC (1u << 4)
+# define CPUID_1xD_TSCP (1u << 27)
+#endif
+
+#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
+# include <sys/types.h>
+# include <unistd.h>
+# include <linux/perf_event.h>
+# include <asm/unistd.h>
+# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+# include <sys/mman.h>
+# endif
+#endif
+
/*----- Data structures ---------------------------------------------------*/
enum { CLK, CY, NTIMER };
struct timer {
struct bench_timer _t;
const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */
- union { int fd; } u_cy; /* state for cycle measurement */
+ union {
+ unsigned tscaux; /* `ia32_tsc_aux' for `ldtscp' */
+ int fd; /* vanilla `perf_event_open' */
+ struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open'
+ * with `rdpmc' */
+ } u_cy; /* state for cycle measurement */
};
struct timer_ops {
unsigned f; /* flags */
#define TF_SECRET 1u /* don't try this automatically */
int (*init)(struct timer */*t*/); /* initialization function */
- void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
- void (*teardown)(struct timer *t); /* release held resources */
+ int (*now)(struct timer */*t*/, /* read current */
+ struct bench_time */*t_out*/, unsigned /*f*/);
+ void (*diff)(struct timer */*t*/, /* difference */
+ struct bench_timing */*t_inout*/,
+ const struct bench_time */*t0*/,
+ const struct bench_time */*t1*/);
+ void (*teardown)(struct timer */*t*/); /* release held resources */
};
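+/* A subtimer's @now@ function returns zero on success, or nonzero if the
+ * reading should be retried (for example, because the thread migrated to a
+ * different CPU partway through); the flag argument distinguishes the
+ * starting (@BTF_T0@) and finishing (@BTF_T1@) readings of a measurement.
+ */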
/*----- Preliminaries -----------------------------------------------------*/
}
}
-/* --- @timer_diff@ --- *
+/*----- Difference utilities ----------------------------------------------*/
+
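+/* Convert a @kludge64@ to a @double@: when there's no native 64-bit integer
+ * type, recombine the two 32-bit halves (4294967296.0 is %$2^{32}$%).
+ */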
+#ifdef HAVE_UINT64
+# define FLOATK64(k) ((double)(k).i)
+#else
+# define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
+#endif
+
+/* --- @diff_ts@ --- *
*
- * Arguments:	@struct bench_timing *delta_out@ = where to put the result
- * @const struct bench_time *t0, *t1@ = two times captured by a
- * timer's @now@ function
+ * Arguments: @struct timer *t@ = timer structure
+ * @struct bench_timing *delta_inout@ = where to put the result
+ *		@const struct bench_time *t0, *t1@ = two input times
*
* Returns: ---
*
- * Use: Calculates the difference between two captured times. The
- * flags are set according to whether the differences are
- * meaningful; @delta_out->n@ is left unset.
+ * Use: Calculates a time difference for timers using the
+ * @struct timespec@-like time format.
*/
-static void timer_diff(struct bench_timing *delta_out,
- const struct bench_time *t0,
- const struct bench_time *t1)
+static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
+ const struct bench_time *t0, const struct bench_time *t1)
{
unsigned f = t0->f&t1->f;
kludge64 k;
-#ifdef HAVE_UINT64
-# define FLOATK64(k) ((double)(k).i)
-#else
-#  define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
-#endif
+ if (f&BTF_TIMEOK) {
- if (!(f&BTF_TIMEOK))
- delta_out->t = 0.0;
- else {
- SUB64(k, t1->s, t0->s);
- delta_out->t = FLOATK64(k) - 1 +
- (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
- }
+ /* Calculate the integer difference in seconds. */
+ SUB64(k, t1->t.ts.s, t0->t.ts.s);
- if (!(f&BTF_CYOK))
- delta_out->cy = 0.0;
- else {
- SUB64(k, t1->cy, t0->cy);
- delta_out->cy = FLOATK64(k);
+ /* And apply the nanoseconds difference. To prevent underflow,
+ * pre-emptively borrow one from the integer difference.
+ */
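+    /* (For example, 5.2s - 3.7s: the whole-seconds difference is 2; take
+     * off the borrowed 1 and add (0.2 + 1.0 - 0.7) = 0.5 to get 1.5s.)
+     */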
+ delta_inout->t =
+ FLOATK64(k) - 1.0 +
+ (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;
+
+ /* Done. */
+ delta_inout->f |= BTF_TIMEOK;
}
+}
- delta_out->f = f;
+/* --- @diff_cycles@ --- *
+ *
+ * Arguments: @struct timer *t@ = timer structure
+ * @struct bench_timing *delta_inout@ = where to put the result
+ *		@const struct bench_time *t0, *t1@ = two input times
+ *
+ * Returns: ---
+ *
+ * Use: Calculates a time difference for cycle-counting timers.
+ */
-#undef FLOATK64
+static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
+ const struct bench_time *t0,
+ const struct bench_time *t1)
+{
+ unsigned f = t0->f&t1->f;
+ kludge64 k;
+
+ if (f&BTF_CYOK) {
+ SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k);
+ delta_inout->f |= BTF_CYOK;
+ }
}
+#undef FLOATK64
+
/*----- The null timer ----------------------------------------------------*/
/* This is a timer which does nothing, in case we don't have any better
*/
static int null_init(struct timer *t) { return (0); }
-static void null_now(struct bench_time *t_out, struct timer *t) { ; }
+static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
+ { return (0); }
+static void null_diff(struct timer *t, struct bench_timing *delta_inout,
+ const struct bench_time *t0,
+ const struct bench_time *t1)
+ { ; }
static void null_teardown(struct timer *t) { ; }
static const struct timer_ops null_ops =
- { "null", 0, null_init, null_now, null_teardown };
+ { "null", 0, null_init, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,
/*----- The broken clock --------------------------------------------------*/
static int broken_init(struct timer *t) { return (-1); }
static const struct timer_ops broken_ops =
- { "broken", TF_SECRET, broken_init, null_now, null_teardown };
+ { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,
/*----- Linux performance counters ----------------------------------------*/
#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
-#include <sys/types.h>
-#include <unistd.h>
+/* --- @perfevent_open@ --- *
+ *
+ * Arguments: ---
+ *
+ * Returns: File descriptor, or %$-1$%.
+ *
+ * Use: Open a performance measurement descriptor set up to count CPU
+ * cycles.
+ */
-#include <linux/perf_event.h>
-#include <asm/unistd.h>
+static int perfevent_open(void)
+{
+ struct perf_event_attr attr = { 0 };
+ int fd;
-static void perfevent_now(struct bench_time *t_out, struct timer *t)
+ attr.type = PERF_TYPE_HARDWARE;
+ attr.size = sizeof(attr);
+ attr.config = PERF_COUNT_HW_CPU_CYCLES;
+ attr.disabled = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+
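+  /* There's no C library wrapper for this system call; `pid = 0, cpu = -1'
+   * asks to count the calling process/thread on whichever CPU it happens to
+   * run.
+   */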
+ fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+ if (fd < 0) {
+ debug("couldn't open perf event: %s", strerror(errno));
+ return (-1);
+ }
+
+ return (fd);
+}
+
+static int perfevent_now(struct timer *t,
+ struct bench_time *t_out, unsigned f)
{
ssize_t n;
n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
if (n != sizeof(t_out->cy.i)) {
debug("failed to read perf-event counter: %s", strerror(errno));
- return;
+ return (0);
}
- t_out->f |= BTF_CYOK;
+ t_out->f |= BTF_CYOK; return (0);
}
static void perfevent_teardown(struct timer *t)
static int perfevent_init(struct timer *t)
{
- struct perf_event_attr attr = { 0 };
struct bench_time tm;
+ int fd = -1, rc;
- attr.type = PERF_TYPE_HARDWARE;
- attr.size = sizeof(attr);
- attr.config = PERF_COUNT_HW_CPU_CYCLES;
- attr.disabled = 0;
- attr.exclude_kernel = 1;
- attr.exclude_hv = 1;
+  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
- t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
- if (t->u_cy.fd < 0) {
- debug("couldn't open perf evvent: %s", strerror(errno));
- return (-1);
+ t->u_cy.fd = fd; tm.f = 0; perfevent_now(t, &tm, 0);
+ if (!(tm.f&BTF_CYOK)) { rc = -1; goto end; }
+ fd = -1; rc = 0;
+end:
+ if (fd != -1) close(fd);
+ return (rc);
+}
+
+static const struct timer_ops perfevent_ops =
+ { "linux-perf-read-hw-cycles", 0,
+ perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
+#define PERFEVENT_VANILLA_CYENT &perfevent_ops,
+
+# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+
+/* Special syscall-free version for x86 using `rdpmc' instruction. *
+ *
+ * This is a bit weird because it does both kinds of measurement in a single
+ * operation.
+ */
+
+static int perfevrdpmc_now(struct timer *t,
+ struct bench_time *t_out, unsigned f)
+{
+ const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
+ unsigned long long tsc = tsc, toff = toff, tenb = tenb;
+ unsigned long long cy = cy, cyoff = cyoff;
+ unsigned long long m, hi, lo;
+ unsigned tshift = tshift, tmult = tmult, q0, q1, ff;
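+  /* (The self-assignments merely placate GCC's uninitialized-variable
+   * warning: each variable is written before use whenever the corresponding
+   * capability flag is set.)
+   */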
+
+ /* Repeat until we can complete this job without the buffer changing in the
+ * middle.
+ */
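+  /* The kernel bumps @map->lock@ before and after it updates the page, so
+   * reading the same value on both sides of our accesses means we saw a
+   * consistent snapshot.
+   */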
+ q0 = map->lock;
+ __atomic_thread_fence(__ATOMIC_ACQ_REL);
+ for (;;) {
+ ff = 0;
+
+ /* Read the passage-of-time information. */
+ if (map->cap_user_time) {
+ tenb = map->time_enabled;
+ tsc = __builtin_ia32_rdtsc();
+ tshift = map->time_shift;
+ tmult = map->time_mult;
+ toff = map->time_offset;
+ ff |= BTF_TIMEOK;
+ }
+
+ /* Read the performance-counter information. */
+ if (map->cap_user_rdpmc) {
+ cy = __builtin_ia32_rdpmc(map->index - 1);
+ cyoff = map->offset;
+ ff |= BTF_CYOK;
+ }
+
+ /* Check the sequence number again. */
+ __atomic_thread_fence(__ATOMIC_ACQ_REL);
+ q1 = map->lock;
+ if (q0 == q1) break;
+ q0 = q1;
+ }
+
+ if (ff&BTF_TIMEOK) {
+ /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
+ * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
+ * nanoseconds.
+ */
+
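+    /* Split %$n = 2^w n_1 + n_0$%, so that
+     * %$a n/2^w = a n_1 + a n_0/2^w$%: computing the product piecewise like
+     * this keeps it from overflowing 64 bits.
+     */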
+ m = (1ull << tshift) - 1;
+ hi = tsc >> tshift; lo = tsc&m;
+ t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
+ t_out->f |= BTF_TIMEOK;
}
- tm.f = 0; perfevent_now(&tm, t);
- if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }
+ if (ff&BTF_CYOK) {
+ /* We have the cycle count. */
+ t_out->cy.i = cy + cyoff;
+ t_out->f |= BTF_CYOK;
+ }
return (0);
}
-static const struct timer_ops perfevent_ops =
- { "linux-perf-hw-cycles", 0,
- perfevent_init, perfevent_now, perfevent_teardown };
+static void perfevrdpmc_diff(struct timer *t,
+ struct bench_timing *delta_inout,
+ const struct bench_time *t0,
+ const struct bench_time *t1)
+{
+ unsigned f = t0->f&t1->f;
-# define PERFEVENT_CYENT &perfevent_ops,
+ if (f&BTF_TIMEOK) {
+ delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S;
+ delta_inout->f |= BTF_TIMEOK;
+ }
+
+ if (f&BTF_CYOK) {
+ delta_inout->cy = t1->cy.i - t0->cy.i;
+ delta_inout->f |= BTF_CYOK;
+ }
+}
+
+static void perfevrdpmc_teardown(struct timer *t)
+ { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }
+
+static int perfevrdpmc_cyinit(struct timer *t)
+{
+ const volatile struct perf_event_mmap_page *map = 0;
+ unsigned a, b, c, d, q0, q1, f;
+ int pgsz, mapsz, fd = -1, rc;
+
+ /* We need `rdtsc' to do the passage-of-time measurement. */
+ if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
+    { debug("no `rdtsc' instruction"); return (-1); }
+
+ /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
+ * big a page is.
+ */
+ pgsz = sysconf(_SC_PAGESIZE);
+ if (pgsz < 0) {
+ debug("failed to discover page size!: %s", strerror(errno));
+ rc = -1; goto end;
+ }
+
+ /* Open the measurement descriptor and map it. */
+  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
+ mapsz = 2*pgsz;
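+  /* (Here %$n = 0$%: we only need the header page's timekeeping fields, so
+   * the metadata page plus a single data page is the smallest legal size.)
+   */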
+ map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
+ if (map == MAP_FAILED) {
+ debug("failed to map perf event: %s", strerror(errno));
+    map = 0; rc = -1; goto end;
+ }
+
+ /* Check that it's revealed the necessary information. */
+ q0 = map->lock;
+ __atomic_thread_fence(__ATOMIC_ACQ_REL);
+ for (;;) {
+ f = 0;
+ if (map->cap_user_time) f |= BTF_TIMEOK;
+ if (map->cap_user_rdpmc) f |= BTF_CYOK;
+ __atomic_thread_fence(__ATOMIC_ACQ_REL);
+ q1 = map->lock;
+ if (q0 == q1) break;
+ q0 = q1;
+ }
+ if (!(f&BTF_TIMEOK))
+ { debug("kernel refused user time measurement"); rc = -1; goto end; }
+  if (!(f&BTF_CYOK))
+ { debug("kernel refused user cycle measurement"); rc = -1; goto end; }
+
+ /* All done. We can close the descriptor here: the mapping will keep the
+ * performance-measurement machinery alive.
+ */
+ t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0;
+end:
+ if (fd != -1) close(fd);
+ if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz);
+ return (rc);
+}
+
+static const struct timer_ops perfevrdpmc_cyops =
+ { "linux-x86-perf-rdpmc-hw-cycles", 0,
+ perfevrdpmc_cyinit, perfevrdpmc_now,
+ perfevrdpmc_diff, perfevrdpmc_teardown };
+
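+/* The cycle subtimer's @now@ function reports the time as well as the cycle
+ * count, so the clock `subtimer' need only check that it's actually
+ * installed, and otherwise does nothing at all.
+ */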
+static int perfevrdpmc_clkinit(struct timer *t)
+{
+  if (t->ops[CY] != &perfevrdpmc_cyops) {
+    debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
+    return (-1);
+ }
+ return (0);
+}
+
+static const struct timer_ops perfevrdpmc_clkops =
+ { "linux-x86-perf-rdpmc-hw-cycles", 0,
+ perfevrdpmc_clkinit, null_now,
+ null_diff, null_teardown };
+
+# define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
+# define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
+
+# else
+# define PERFEVENT_RDPMC_CLKENT
+# define PERFEVENT_RDPMC_CYENT
+# endif
+
+# define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
+# define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
#else
+# define PERFEVENT_CLKENT
# define PERFEVENT_CYENT
#endif
* CPU frequency adjustments.
*/
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
-#include <cpuid.h>
-
-#define CPUID_1D_TSC (1u << 4)
-
-static void x86rdtsc_now(struct bench_time *t_out, struct timer *t)
- { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; }
+static int x86rdtsc_now(struct timer *t,
+ struct bench_time *t_out, unsigned f)
+ { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }
static int x86rdtsc_init(struct timer *t)
{
if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
{ debug("no `rdtsc' instrunction"); return (-1); }
+ t->u_cy.tscaux = ~0u;
+ return (0);
+}
+
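+/* `rdtscp' also reads `ia32_tsc_aux', which Linux fills in with the current
+ * CPU number.  If it changes between the start and end readings then we
+ * migrated to a different CPU mid-measurement, so report a failure and let
+ * the caller retry.
+ */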
+static int x86rdtscp_now(struct timer *t,
+ struct bench_time *t_out, unsigned f)
+{
+ unsigned tscaux;
+ unsigned long long n;
+
+ n = __builtin_ia32_rdtscp(&tscaux);
+ if (!(f&BTF_T1))
+ t->u_cy.tscaux = tscaux;
+ else if (t->u_cy.tscaux != tscaux) {
+ debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
+ tscaux, t->u_cy.tscaux);
+ return (-1);
+ }
+ t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
+}
+
+static int x86rdtscp_init(struct timer *t)
+{
+ unsigned a, b, c, d;
+
+ if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
+    { debug("no `rdtscp' instruction"); return (-1); }
return (0);
}
static const struct timer_ops x86rdtsc_ops =
- { "x86-rdtsc", 0, x86rdtsc_init, x86rdtsc_now, null_teardown };
+ { "x86-rdtsc", 0,
+ x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
+static const struct timer_ops x86rdtscp_ops =
+ { "x86-rdtscp", 0,
+ x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };
-# define X86RDTSC_CYENT &x86rdtsc_ops,
+# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
#else
# define X86RDTSC_CYENT
#endif
#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
-static void gettime_now(struct bench_time *t_out, struct timer *t)
+static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
struct timespec now;
if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
- { debug("error reading POSIX clock: %s", strerror(errno)); return; }
- ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec;
- t_out->f |= BTF_TIMEOK;
+ { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
+ ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
+ t_out->f |= BTF_TIMEOK; return (0);
}
static int gettime_init(struct timer *t)
{
struct bench_time tm;
- tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
+  tm.f = 0; gettime_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
return (0);
}
static const struct timer_ops gettime_ops =
- { "posix-thread-cputime", 0, gettime_init, gettime_now, null_teardown };
+ { "posix-thread-cputime", 0,
+ gettime_init, gettime_now, diff_ts, null_teardown };
# define GETTIME_CLKENT &gettime_ops,
#else
* guaranteed to be available, though it's not likely to be very good.
*/
-static void clock_now(struct bench_time *t_out, struct timer *t)
+static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
- clock_t now, x;
- unsigned long s; uint32 ns;
+ clock_t now;
now = clock();
if (now == (clock_t)-1) {
debug("error reading standard clock: %s", strerror(errno));
- return;
+ return (0);
}
- x = now/CLOCKS_PER_SEC;
- if (x > ULONG_MAX) { debug("standard clock out of range"); return; }
-
- s = x; x = now - CLOCKS_PER_SEC*s;
- if (!(NS_PER_S%CLOCKS_PER_SEC))
- ns = x*(NS_PER_S/CLOCKS_PER_SEC);
- else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC)
- ns = (x*NS_PER_S)/CLOCKS_PER_SEC;
- else
- ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC);
- ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK;
+ t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
+}
+
+static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
+ const struct bench_time *t0,
+ const struct bench_time *t1)
+{
+ unsigned f = t0->f&t1->f;
+
+ if (f&BTF_TIMEOK) {
+ delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC;
+ delta_inout->f |= BTF_TIMEOK;
+ }
}
static int clock_init(struct timer *t)
{
struct bench_time tm;
- tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
+  tm.f = 0; clock_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
return (0);
}
static const struct timer_ops clock_ops =
- { "stdc-clock", 0, clock_init, clock_now, null_teardown };
+ { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };
#define CLOCK_CLKENT &clock_ops,
/* Tables of timing sources. */
static const struct timer_ops
- *const clktab[] = { GETTIME_CLKENT CLOCK_CLKENT BROKEN_ENT 0 },
- *const cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_ENT BROKEN_ENT 0 };
+ *const clktab[] = { PERFEVENT_CLKENT
+ GETTIME_CLKENT
+ CLOCK_CLKENT
+ BROKEN_ENT
+ 0 },
+ *const cytab[] = { PERFEVENT_CYENT
+ X86RDTSC_CYENT
+ NULL_ENT
+ BROKEN_ENT
+ 0 };
static const struct timertab {
const char *what;
}
}
-static void timer_now(struct bench_timer *tm, struct bench_time *t_out)
+static int timer_now(struct bench_timer *tm,
+ struct bench_time *t_out, unsigned f)
{
struct timer *t = (struct timer *)tm;
unsigned i;
- for (i = 0; i < NTIMER; i++) t->ops[i]->now(t_out, t);
+ t_out->f = 0;
+ for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
+ return (0);
+}
+
+static void timer_diff(struct bench_timer *tm,
+ struct bench_timing *t_out,
+ const struct bench_time *t0,
+ const struct bench_time *t1)
+{
+ struct timer *t = (struct timer *)tm;
+ unsigned i;
+
+ t_out->f = 0;
+ for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
}
static void timer_destroy(struct bench_timer *tm)
}
static const struct bench_timerops timer_ops =
- { timer_describe, timer_now, timer_destroy };
+ { timer_describe, timer_now, timer_diff, timer_destroy };
/* --- @bench_createtimer@ --- *
*
for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
/* Try to set up the subtimers. */
- for (i = 0; i < NTIMER; i++)
+ for (i = NTIMER; i--; )
if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
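+  /* (Counting downwards sets up the cycle subtimer before the clock
+   * subtimer, which the `rdpmc'-based clock source relies on: see
+   * @perfevrdpmc_clkinit@ above.)
+   */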
/* All is done. */
static void do_nothing(unsigned long n, void *ctx)
{ while (n--) RELAX; }
+/* --- @measure@ --- *
+ *
+ * Arguments: @struct bench_state *b@ = bench state
+ * @struct bench_timing *delta_out@ = where to leave the timing
+ * @bench_fn *fn@ = function to measure
+ * @void *ctx@ = context for the function
+ * @double n@ = number of iterations
+ *
+ * Returns: ---
+ *
+ * Use: Run the function @n@ times, and report how long it took.
+ *
+ * This function deals with retrying the measurements if the
+ * timer reports a temporary failure, and all of the
+ * difficulties if @n@ is too large to fit in a machine integer.
+ */
+
+static void measure(struct bench_state *b, struct bench_timing *delta_out,
+ bench_fn *fn, void *ctx, double n)
+{
+ struct bench_timer *tm = b->tm;
+ struct bench_time t0, t1;
+ unsigned long n0, n1;
+ double R = ULONG_MAX;
+
+ if (n <= R) {
+ n0 = n;
+ do {
+ while (tm->ops->now(tm, &t0, BTF_T0));
+ fn(n0, ctx);
+ } while (tm->ops->now(tm, &t1, BTF_T1));
+ } else {
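+    /* @n@ is too big for an @unsigned long@: run %$\lfloor n/R \rfloor$%
+     * batches of the maximum size and a final short batch of the remaining
+     * iterations.
+     */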
+    do {
+      n1 = n/R; n0 = n - n1*R;	/* recompute: the loop below consumes @n1@ */
+      while (tm->ops->now(tm, &t0, BTF_T0));
+      while (n1--) fn(ULONG_MAX, ctx);
+ fn(n0, ctx);
+ } while (tm->ops->now(tm, &t1, BTF_T1));
+ }
+ tm->ops->diff(tm, delta_out, &t0, &t1);
+}
+
/* --- @bench_calibrate@ --- *
*
* Arguments: @struct bench_state *b@ = bench state
int bench_calibrate(struct bench_state *b)
{
struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
- unsigned long n;
- unsigned i;
- struct bench_timer *tm = b->tm;
- struct bench_time t0, t1;
struct bench_timing delta;
- double r;
+ double n, r;
bench_fn *fn = LAUNDER(&do_nothing);
- unsigned f = BTF_ANY;
+ unsigned i, f = BTF_ANY;
int rc;
/* The model here is that a timing loop has a fixed overhead as we enter
if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
/* Exercise the inner loop a few times to educate the branch predictor. */
- for (i = 0; i < 10; i++)
- { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); }
+ for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000);
/* Now we measure idle loops until they take sufficiently long -- or we run
* out of counter.
*/
debug("calibrating...");
- n = 1;
+ n = 1.0;
for (;;) {
/* Measure @n@ iterations of the idle loop. */
- tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1);
- timer_diff(&delta, &t0, &t1); f &= delta.f;
+ measure(b, &delta, fn, 0, n); f &= delta.f;
if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
/* Register the timings with the regression machinery. */
linreg_update(&lr_clk, n, delta.t);
if (!(f&BTF_CYOK))
- debug(" n = %10lu; t = %12g s", n, delta.t);
+ debug(" n = %10.0f; t = %12g s", n, delta.t);
else {
linreg_update(&lr_cy, n, delta.cy);
- debug(" n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
+ debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
}
/* If we're done then stop. */
if (n >= ULONG_MAX - n/3) break;
/* Update the counter and continue. */
- n += n/3 + 1;
+ n += n/3.0 + 1.0;
}
/* Now run the linear regression to extract the constant and per-iteration
int bench_measure(struct bench_state *b, struct bench_timing *t_out,
double base, bench_fn *fn, void *ctx)
{
- struct bench_timer *tm = b->tm;
- struct bench_time t0, t1;
- unsigned long n, nn;
+ double n, nn;
/* Make sure the state is calibrated and usable. */
if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
* hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
* again with %$n' = n + 1$% iterations will very likely work.
*/
- debug("measuring..."); n = 1;
+ debug("measuring..."); n = 1.0;
for (;;) {
- tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
- timer_diff(t_out, &t0, &t1);
+ measure(b, t_out, fn, ctx, n); t_out->f &= b->f;
if (!(t_out->f&BTF_TIMEOK)) return (-1);
- if (!(t_out->f&BTF_CYOK)) debug(" n = %10lu; t = %12g", n, t_out->t);
- else debug(" n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
+ if (!(t_out->f&BTF_CYOK))
+ debug(" n = %10.0f; t = %12g", n, t_out->t);
+ else
+ debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
+
if (t_out->t >= 0.707*b->target_s) break;
nn = n*b->target_s/t_out->t;
- if (nn > n) n = nn;
+ if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn;
else n++;
}