Commit | Line | Data |
---|---|---|
b64eb60f MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Benchmarking support | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
13ee7406 | 32 | #include <ctype.h> |
b64eb60f | 33 | #include <errno.h> |
6e683a79 | 34 | #include <limits.h> |
b64eb60f MW |
35 | #include <stdarg.h> |
36 | #include <stdio.h> | |
37 | #include <stdlib.h> | |
38 | #include <string.h> | |
39 | #include <time.h> | |
40 | ||
41 | #include "alloc.h" | |
42 | #include "bench.h" | |
43 | #include "bits.h" | |
13ee7406 | 44 | #include "dstr.h" |
b64eb60f MW |
45 | #include "linreg.h" |
46 | #include "macros.h" | |
47 | ||
6e683a79 MW |
48 | #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) |
49 | # include <cpuid.h> | |
50 | # define CPUID_1D_TSC (1u << 4) | |
51 | # define CPUID_1xD_TSCP (1u << 27) | |
52 | #endif | |
53 | ||
54 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) | |
55 | # include <sys/types.h> | |
56 | # include <unistd.h> | |
57 | # include <linux/perf_event.h> | |
58 | # include <asm/unistd.h> | |
59 | # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) | |
60 | # include <sys/mman.h> | |
61 | # endif | |
62 | #endif | |
63 | ||
b64eb60f MW |
64 | /*----- Data structures ---------------------------------------------------*/ |
65 | ||
13ee7406 MW |
66 | enum { CLK, CY, NTIMER }; |
67 | ||
b64eb60f MW |
68 | struct timer { |
69 | struct bench_timer _t; | |
13ee7406 | 70 | const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */ |
6e683a79 MW |
71 | union { |
72 | unsigned tscaux; /* `ia32_tsc_aux' for `ldtscp' */ | |
73 | int fd; /* vanilla `perf_event_open' */ | |
74 | struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open' | |
75 | * with `rdpmc' */ | |
76 | } u_cy; /* state for cycle measurement */ | |
b64eb60f MW |
77 | }; |
78 | ||
79 | struct timer_ops { | |
13ee7406 MW |
80 | const char *name; /* timer name */ |
81 | unsigned f; /* flags */ | |
82 | #define TF_SECRET 1u /* don't try this automatically */ | |
83 | int (*init)(struct timer */*t*/); /* initialization function */ | |
6e683a79 MW |
84 | int (*now)(struct timer */*t*/, /* read current */ |
85 | struct bench_time */*t_out*/, unsigned /*f*/); | |
86 | void (*diff)(struct timer */*t*/, /* difference */ | |
87 | struct bench_timing */*t_inout*/, | |
88 | const struct bench_time */*t0*/, | |
89 | const struct bench_time */*t1*/); | |
90 | void (*teardown)(struct timer */*t*/); /* release held resources */ | |
b64eb60f MW |
91 | }; |
92 | ||
93 | /*----- Preliminaries -----------------------------------------------------*/ | |
94 | ||
95 | #define NS_PER_S 1000000000 | |
96 | ||
67b5031e MW |
97 | /* --- @debug@ --- * |
98 | * | |
99 | * Arguments: @const char *fmt@ = format control string | |
 * @...@ = format arguments
101 | * | |
102 | * Returns: --- | |
103 | * | |
104 | * Use: Maybe report a debugging message to standard error. | |
105 | */ | |
106 | ||
31d0247c | 107 | static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...) |
b64eb60f MW |
108 | { |
109 | const char *p; | |
110 | va_list ap; | |
111 | ||
112 | p = getenv("MLIB_BENCH_DEBUG"); | |
113 | if (p && *p != 'n' && *p != '0') { | |
114 | va_start(ap, fmt); | |
115 | fputs("mLib BENCH: ", stderr); | |
116 | vfprintf(stderr, fmt, ap); | |
117 | fputc('\n', stderr); | |
118 | va_end(ap); | |
119 | } | |
120 | } | |
121 | ||
6e683a79 MW |
122 | /*----- Difference utilities ----------------------------------------------*/ |
123 | ||
124 | #ifdef HAVE_UINT64 | |
125 | # define FLOATK64(k) ((double)(k).i) | |
126 | #else | |
127 | # define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi) | |
128 | #endif | |
129 | ||
130 | /* --- @diff_ts@ --- * | |
67b5031e | 131 | * |
6e683a79 MW |
132 | * Arguments: @struct timer *t@ = timer structure |
133 | * @struct bench_timing *delta_inout@ = where to put the result | |
134 | * @const struct time *t0, *t1@ = two input times | |
67b5031e MW |
135 | * |
136 | * Returns: --- | |
137 | * | |
6e683a79 MW |
138 | * Use: Calculates a time difference for timers using the |
139 | * @struct timespec@-like time format. | |
67b5031e MW |
140 | */ |
141 | ||
6e683a79 MW |
static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
                    const struct bench_time *t0, const struct bench_time *t1)
{
  /* Intersect the flags: a time difference is only meaningful when *both*
   * samples carry a valid reading.
   */
  unsigned f = t0->f&t1->f;
  kludge64 k;

  if (f&BTF_TIMEOK) {

    /* Calculate the integer difference in seconds. */
    SUB64(k, t1->t.ts.s, t0->t.ts.s);

    /* And apply the nanoseconds difference.  To prevent underflow,
     * pre-emptively borrow one from the integer difference: the `- 1.0'
     * below cancels the extra `NS_PER_S' added to the numerator.
     */
    delta_inout->t =
      FLOATK64(k) - 1.0 +
      (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;

    /* Done. */
    delta_inout->f |= BTF_TIMEOK;
  }
}
67b5031e | 164 | |
6e683a79 MW |
165 | /* --- @diff_cycles@ --- * |
166 | * | |
167 | * Arguments: @struct timer *t@ = timer structure | |
168 | * @struct bench_timing *delta_inout@ = where to put the result | |
169 | * @const struct time *t0, *t1@ = two input times | |
170 | * | |
171 | * Returns: --- | |
172 | * | |
173 | * Use: Calculates a time difference for cycle-counting timers. | |
174 | */ | |
67b5031e | 175 | |
6e683a79 MW |
176 | static void diff_cycles(struct timer *t, struct bench_timing *delta_inout, |
177 | const struct bench_time *t0, | |
178 | const struct bench_time *t1) | |
179 | { | |
180 | unsigned f = t0->f&t1->f; | |
181 | kludge64 k; | |
182 | ||
183 | if (f&BTF_CYOK) { | |
184 | SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k); | |
185 | delta_inout->f |= BTF_CYOK; | |
186 | } | |
67b5031e MW |
187 | } |
188 | ||
6e683a79 MW |
189 | #undef FLOATK64 |
190 | ||
13ee7406 | 191 | /*----- The null timer ----------------------------------------------------*/ |
b64eb60f | 192 | |
13ee7406 MW |
193 | /* This is a timer which does nothing, in case we don't have any better |
194 | * ideas. | |
67b5031e MW |
195 | */ |
196 | ||
/* Trivial subtimer operations: initialization always succeeds, reading the
 * clock reports nothing (and sets no flags in @t_out->f@), and differencing
 * and teardown do nothing at all.
 */
static int null_init(struct timer *t) { return (0); }
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
  { return (0); }
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
                      const struct bench_time *t0,
                      const struct bench_time *t1)
  { ; }
static void null_teardown(struct timer *t) { ; }

/* Operations table and timer-table entry for the null timer. */
static const struct timer_ops null_ops =
  { "null", 0, null_init, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,
209 | ||
210 | /*----- The broken clock --------------------------------------------------*/ | |
211 | ||
212 | /* This is a cycle counter which does nothing, in case we don't have any | |
213 | * better ideas. | |
214 | */ | |
b64eb60f | 215 | |
13ee7406 MW |
/* The broken timer's initialization always fails, so it can never actually
 * be selected; @TF_SECRET@ additionally hides it from automatic selection,
 * so it's only tried when explicitly named in the configuration.
 */
static int broken_init(struct timer *t) { return (-1); }

static const struct timer_ops broken_ops =
  { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,
b64eb60f MW |
221 | |
222 | /*----- Linux performance counters ----------------------------------------*/ | |
223 | ||
67b5031e MW |
224 | /* This is a cycle counter which uses the Linux performance event system, |
225 | * which is probably the best choice if it's available. | |
226 | */ | |
227 | ||
b64eb60f MW |
228 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) |
229 | ||
6e683a79 MW |
230 | /* --- @perfevent_open@ --- * |
231 | * | |
232 | * Arguments: --- | |
233 | * | |
234 | * Returns: File descriptor, or %$-1$%. | |
235 | * | |
236 | * Use: Open a performance measurement descriptor set up to count CPU | |
237 | * cycles. | |
238 | */ | |
b64eb60f | 239 | |
6e683a79 MW |
/* Open a `perf_event_open' descriptor configured to count user-space CPU
 * cycles on the calling thread.  Returns the descriptor, or %$-1$% on
 * failure (with a debug message).
 */
static int perfevent_open(void)
{
  struct perf_event_attr attr = { 0 };
  int rc;

  /* Describe the event we want: hardware CPU cycles, user space only. */
  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof(attr);
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.disabled = 0;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  /* There's no C-library wrapper for this, so make the system call
   * directly.
   */
  rc = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  if (rc < 0) {
    debug("couldn't open perf event: %s", strerror(errno));
    return (-1);
  }
  return (rc);
}
260 | ||
/* Read the current cycle count from the descriptor.  Note that a read
 * failure is soft: we return zero (success) without setting @BTF_CYOK@, so
 * the sample simply lacks a cycle count rather than aborting the
 * measurement.
 */
static int perfevent_now(struct timer *t,
                         struct bench_time *t_out, unsigned f)
{
  ssize_t n;

  n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
  if (n != sizeof(t_out->cy.i)) {
    debug("failed to read perf-event counter: %s", strerror(errno));
    return (0);
  }
  t_out->f |= BTF_CYOK; return (0);
}

/* Release the performance-counter descriptor held in @t->u_cy.fd@. */
static void perfevent_teardown(struct timer *t)
  { close(t->u_cy.fd); }
276 | ||
b64eb60f MW |
277 | static int perfevent_init(struct timer *t) |
278 | { | |
b64eb60f | 279 | struct bench_time tm; |
6e683a79 | 280 | int fd = -1, rc; |
b64eb60f | 281 | |
6e683a79 | 282 | fd = perfevent_open(); if (!fd) { rc = -1; goto end; } |
b64eb60f | 283 | |
6e683a79 MW |
284 | t->u_cy.fd = fd; tm.f = 0; perfevent_now(t, &tm, 0); |
285 | if (!(tm.f&BTF_CYOK)) { rc = -1; goto end; } | |
286 | fd = -1; rc = 0; | |
287 | end: | |
288 | if (fd != -1) close(fd); | |
289 | return (rc); | |
290 | } | |
291 | ||
292 | static const struct timer_ops perfevent_ops = | |
293 | { "linux-perf-read-hw-cycles", 0, | |
294 | perfevent_init, perfevent_now, diff_cycles, perfevent_teardown }; | |
295 | #define PERFEVENT_VANILLA_CYENT &perfevent_ops, | |
296 | ||
297 | # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) | |
298 | ||
299 | /* Special syscall-free version for x86 using `rdpmc' instruction. * | |
300 | * | |
301 | * This is a bit weird because it does both kinds of measurement in a single | |
302 | * operation. | |
303 | */ | |
304 | ||
static int perfevrdpmc_now(struct timer *t,
                           struct bench_time *t_out, unsigned f)
{
  const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
  /* The `x = x' self-initializations below quieten maybe-uninitialized
   * warnings: each variable is only consumed under the same @ff@ flag bit
   * that guards its assignment inside the loop.
   */
  unsigned long long tsc = tsc, toff = toff, tenb = tenb;
  unsigned long long cy = cy, cyoff = cyoff;
  unsigned long long m, hi, lo;
  unsigned tshift = tshift, tmult = tmult, q0, q1, ff;

  /* Repeat until we can complete this job without the buffer changing in the
   * middle: @map->lock@ is a sequence counter bumped by the kernel around
   * updates, so equal before/after readings mean we saw a consistent
   * snapshot.
   */
  q0 = map->lock;
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
  for (;;) {
    ff = 0;

    /* Read the passage-of-time information. */
    if (map->cap_user_time) {
      tenb = map->time_enabled;
      tsc = __builtin_ia32_rdtsc();
      tshift = map->time_shift;
      tmult = map->time_mult;
      toff = map->time_offset;
      ff |= BTF_TIMEOK;
    }

    /* Read the performance-counter information. */
    if (map->cap_user_rdpmc) {
      cy = __builtin_ia32_rdpmc(map->index - 1);
      cyoff = map->offset;
      ff |= BTF_CYOK;
    }

    /* Check the sequence number again. */
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    q1 = map->lock;
    if (q0 == q1) break;
    q0 = q1;
  }

  if (ff&BTF_TIMEOK) {
    /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
     * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
     * nanoseconds.  Split the multiply into high and low halves so the
     * intermediate product can't overflow 64 bits.
     */

    m = (1ull << tshift) - 1;
    hi = tsc >> tshift; lo = tsc&m;
    t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
    t_out->f |= BTF_TIMEOK;
  }

  if (ff&BTF_CYOK) {
    /* We have the cycle count. */

    t_out->cy.i = cy + cyoff;
    t_out->f |= BTF_CYOK;
  }
  return (0);
}
13ee7406 | 366 | |
6e683a79 MW |
367 | static void perfevrdpmc_diff(struct timer *t, |
368 | struct bench_timing *delta_inout, | |
369 | const struct bench_time *t0, | |
370 | const struct bench_time *t1) | |
371 | { | |
372 | unsigned f = t0->f&t1->f; | |
13ee7406 | 373 | |
6e683a79 MW |
374 | if (f&BTF_TIMEOK) { |
375 | delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S; | |
376 | delta_inout->f |= BTF_TIMEOK; | |
377 | } | |
378 | ||
379 | if (f&BTF_CYOK) { | |
380 | delta_inout->cy = t1->cy.i - t0->cy.i; | |
381 | delta_inout->f |= BTF_CYOK; | |
382 | } | |
383 | } | |
384 | ||
/* Unmap the kernel's counter page.  This also releases the perf event:
 * the file descriptor was already closed during initialization, and the
 * mapping was what kept the event alive.
 */
static void perfevrdpmc_teardown(struct timer *t)
  { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }
387 | ||
388 | static int perfevrdpmc_cyinit(struct timer *t) | |
389 | { | |
390 | const volatile struct perf_event_mmap_page *map = 0; | |
391 | unsigned a, b, c, d, q0, q1, f; | |
392 | int pgsz, mapsz, fd = -1, rc; | |
393 | ||
394 | /* We need `rdtsc' to do the passage-of-time measurement. */ | |
395 | if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC)) | |
396 | { debug("no `rdtsc' instrunction"); return (-1); } | |
397 | ||
398 | /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how | |
399 | * big a page is. | |
400 | */ | |
401 | pgsz = sysconf(_SC_PAGESIZE); | |
402 | if (pgsz < 0) { | |
403 | debug("failed to discover page size!: %s", strerror(errno)); | |
404 | rc = -1; goto end; | |
405 | } | |
406 | ||
407 | /* Open the measurement descriptor and map it. */ | |
408 | fd = perfevent_open(); if (!fd) return (-1); | |
409 | mapsz = 2*pgsz; | |
410 | map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0); | |
411 | if (map == MAP_FAILED) { | |
412 | debug("failed to map perf event: %s", strerror(errno)); | |
413 | return (-1); | |
414 | } | |
415 | ||
416 | /* Check that it's revealed the necessary information. */ | |
417 | q0 = map->lock; | |
418 | __atomic_thread_fence(__ATOMIC_ACQ_REL); | |
419 | for (;;) { | |
420 | f = 0; | |
421 | if (map->cap_user_time) f |= BTF_TIMEOK; | |
422 | if (map->cap_user_rdpmc) f |= BTF_CYOK; | |
423 | __atomic_thread_fence(__ATOMIC_ACQ_REL); | |
424 | q1 = map->lock; | |
425 | if (q0 == q1) break; | |
426 | q0 = q1; | |
427 | } | |
428 | if (!(f&BTF_TIMEOK)) | |
429 | { debug("kernel refused user time measurement"); rc = -1; goto end; } | |
430 | if (!(f&BTF_TIMEOK)) | |
431 | { debug("kernel refused user cycle measurement"); rc = -1; goto end; } | |
432 | ||
433 | /* All done. We can close the descriptor here: the mapping will keep the | |
434 | * performance-measurement machinery alive. | |
435 | */ | |
436 | t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0; | |
437 | end: | |
438 | if (fd != -1) close(fd); | |
439 | if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz); | |
440 | return (rc); | |
441 | } | |
442 | ||
443 | static const struct timer_ops perfevrdpmc_cyops = | |
444 | { "linux-x86-perf-rdpmc-hw-cycles", 0, | |
445 | perfevrdpmc_cyinit, perfevrdpmc_now, | |
446 | perfevrdpmc_diff, perfevrdpmc_teardown }; | |
447 | ||
448 | static int perfevrdpmc_clkinit(struct timer *t) | |
449 | { | |
450 | if (t->ops[CLK] != &perfevrdpmc_cyops) { | |
451 | debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer"); | |
452 | return(-1); | |
453 | } | |
454 | return (0); | |
455 | } | |
456 | ||
457 | static const struct timer_ops perfevrdpmc_clkops = | |
458 | { "linux-x86-perf-rdpmc-hw-cycles", 0, | |
459 | perfevrdpmc_clkinit, null_now, | |
460 | null_diff, null_teardown }; | |
461 | ||
462 | # define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops, | |
463 | # define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops, | |
464 | ||
465 | # else | |
466 | # define PERFEVENT_RDPMC_CLKENT | |
467 | # define PERFEVENT_RDPMC_CYENT | |
468 | # endif | |
469 | ||
470 | # define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT | |
471 | # define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT | |
b64eb60f | 472 | #else |
6e683a79 | 473 | # define PERFEVENT_CLKENT |
b64eb60f MW |
474 | # define PERFEVENT_CYENT |
475 | #endif | |
476 | ||
477 | /*----- Intel time-stamp counter ------------------------------------------*/ | |
478 | ||
67b5031e MW |
479 | /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not |
480 | * really suitable for performance measurement because it gets confused by | |
481 | * CPU frequency adjustments. | |
482 | */ | |
483 | ||
6e683a79 | 484 | #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) |
b64eb60f | 485 | |
6e683a79 MW |
486 | static int x86rdtsc_now(struct timer *t, |
487 | struct bench_time *t_out, unsigned f) | |
488 | { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); } | |
b64eb60f MW |
489 | |
490 | static int x86rdtsc_init(struct timer *t) | |
491 | { | |
13ee7406 MW |
492 | unsigned a, b, c, d; |
493 | ||
494 | if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC)) | |
b64eb60f | 495 | { debug("no `rdtsc' instrunction"); return (-1); } |
6e683a79 MW |
496 | t->u_cy.tscaux = ~0u; |
497 | return (0); | |
498 | } | |
499 | ||
/* Read the time-stamp counter with `rdtscp', which also delivers the
 * `ia32_tsc_aux' value for the current CPU.
 */
static int x86rdtscp_now(struct timer *t,
                         struct bench_time *t_out, unsigned f)
{
  unsigned tscaux;
  unsigned long long n;

  n = __builtin_ia32_rdtscp(&tscaux);
  /* On the start-of-interval reading, remember `ia32_tsc_aux'; on the
   * end-of-interval reading (flagged by @BTF_T1@), insist that it hasn't
   * changed -- if it has, we were presumably migrated to a different CPU
   * and the two counter readings aren't comparable, so fail hard.
   */
  if (!(f&BTF_T1))
    t->u_cy.tscaux = tscaux;
  else if (t->u_cy.tscaux != tscaux) {
    debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
          tscaux, t->u_cy.tscaux);
    return (-1);
  }
  t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
}

/* Probe the CPUID extended-feature bits for the `rdtscp' instruction. */
static int x86rdtscp_init(struct timer *t)
{
  unsigned a, b, c, d;

  if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
    { debug("no `rdtscp' instrunction"); return (-1); }
  return (0);
}

/* Operations tables.  The table entry below lists `rdtscp' first because
 * it can detect CPU migration (see above), which plain `rdtsc' can't.
 */
static const struct timer_ops x86rdtsc_ops =
  { "x86-rdtsc", 0,
    x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
static const struct timer_ops x86rdtscp_ops =
  { "x86-rdtscp", 0,
    x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };

# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
b64eb60f | 534 | #else |
13ee7406 | 535 | # define X86RDTSC_CYENT |
b64eb60f MW |
536 | #endif |
537 | ||
538 | /*----- POSIX `clock_gettime' ---------------------------------------------*/ | |
539 | ||
67b5031e MW |
540 | /* This is a real-time clock based on the POSIX time interface, with up to |
541 | * nanosecond precision. | |
542 | */ | |
543 | ||
b64eb60f MW |
544 | #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID) |
545 | ||
6e683a79 | 546 | static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f) |
b64eb60f MW |
547 | { |
548 | struct timespec now; | |
549 | ||
550 | if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) | |
6e683a79 MW |
551 | { debug("error reading POSIX clock: %s", strerror(errno)); return (0); } |
552 | ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec; | |
553 | t_out->f |= BTF_TIMEOK; return (0); | |
b64eb60f MW |
554 | } |
555 | ||
b64eb60f MW |
556 | static int gettime_init(struct timer *t) |
557 | { | |
558 | struct bench_time tm; | |
559 | ||
6e683a79 | 560 | tm.f = 0; gettime_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1); |
13ee7406 | 561 | return (0); |
b64eb60f MW |
562 | } |
563 | ||
13ee7406 | 564 | static const struct timer_ops gettime_ops = |
6e683a79 MW |
565 | { "posix-thread-cputime", 0, |
566 | gettime_init, gettime_now, diff_ts, null_teardown }; | |
13ee7406 MW |
567 | |
568 | # define GETTIME_CLKENT &gettime_ops, | |
b64eb60f MW |
569 | #else |
570 | # define GETTIME_CLKENT | |
571 | #endif | |
572 | ||
573 | /*----- Standard C `clock' ------------------------------------------------*/ | |
574 | ||
67b5031e MW |
575 | /* This is a real-time clock based on the C `clock' function which is |
576 | * guaranteed to be available, though it's not likely to be very good. | |
577 | */ | |
578 | ||
6e683a79 | 579 | static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f) |
b64eb60f | 580 | { |
6e683a79 | 581 | clock_t now; |
b64eb60f MW |
582 | |
583 | now = clock(); | |
584 | if (now == (clock_t)-1) { | |
585 | debug("error reading standard clock: %s", strerror(errno)); | |
6e683a79 | 586 | return (0); |
b64eb60f | 587 | } |
6e683a79 MW |
588 | t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0); |
589 | } | |
590 | ||
591 | static void clock_diff(struct timer *t, struct bench_timing *delta_inout, | |
592 | const struct bench_time *t0, | |
593 | const struct bench_time *t1) | |
594 | { | |
595 | unsigned f = t0->f&t1->f; | |
596 | ||
597 | if (f&BTF_TIMEOK) { | |
598 | delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC; | |
599 | delta_inout->f |= BTF_TIMEOK; | |
600 | } | |
b64eb60f MW |
601 | } |
602 | ||
b64eb60f MW |
603 | static int clock_init(struct timer *t) |
604 | { | |
605 | struct bench_time tm; | |
606 | ||
6e683a79 | 607 | tm.f = 0; clock_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1); |
13ee7406 | 608 | return (0); |
b64eb60f MW |
609 | } |
610 | ||
13ee7406 | 611 | static const struct timer_ops clock_ops = |
6e683a79 | 612 | { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown }; |
13ee7406 MW |
613 | |
614 | #define CLOCK_CLKENT &clock_ops, | |
b64eb60f MW |
615 | |
616 | /*----- Timing setup ------------------------------------------------------*/ | |
617 | ||
/* Tables of timing sources, in decreasing order of preference: the entries
 * are tried in order at setup time and the first whose `init' succeeds
 * wins.  Each macro expands to nothing when the corresponding
 * implementation isn't available on this platform.
 */
static const struct timer_ops
  *const clktab[] = { PERFEVENT_CLKENT
                      GETTIME_CLKENT
                      CLOCK_CLKENT
                      BROKEN_ENT
                      0 },
  *const cytab[] = { PERFEVENT_CYENT
                     X86RDTSC_CYENT
                     NULL_ENT
                     BROKEN_ENT
                     0 };

/* Subtimer descriptions, indexed by `CLK'/`CY'. */
static const struct timertab {
  const char *what;                     /* human-readable kind */
  const char *env;                      /* environment override variable */
  const struct timer_ops *const *opstab; /* preference-ordered ops table */
} timertab[] = {
  { "clock", "MLIB_BENCH_CLKTIMER", clktab },
  { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
};
b64eb60f | 639 | |
67b5031e MW |
640 | /* --- @find_timer@ --- * |
641 | * | |
642 | * Arguments: @const char *name@ = timer name | |
643 | * @size_t sz@ = length of name | |
13ee7406 | 644 | * @unsigned tm@ = which subtimer we're looking for |
67b5031e MW |
645 | * |
646 | * Returns: The table entry matching the given name, or null if there | |
647 | * isn't one. | |
648 | */ | |
649 | ||
13ee7406 MW |
650 | static const struct timer_ops *find_timer(const char *name, size_t sz, |
651 | unsigned tm) | |
b64eb60f | 652 | { |
13ee7406 MW |
653 | const struct timer_ops *const *tt; |
654 | ||
655 | for (tt = timertab[tm].opstab; *tt; tt++) { | |
656 | if (strlen((*tt)->name) == sz && | |
657 | MEMCMP(name, ==, (*tt)->name, sz)) | |
658 | return (*tt); | |
b64eb60f | 659 | } |
13ee7406 MW |
660 | debug("%s timer `%.*s' not found", |
661 | timertab[tm].what, (int)sz, name); return (0); | |
b64eb60f MW |
662 | } |
663 | ||
67b5031e MW |
664 | /* --- @try_timer@ --- * |
665 | * | |
666 | * Arguments: @struct timer *t@ = timer structure | |
13ee7406 MW |
667 | * @const struct timer_ops *ops@ = timer ops |
668 | * @unsigned tm@ = which subtimer we're setting | |
67b5031e | 669 | * |
13ee7406 | 670 | * Returns: Zero on success, %$-1$% if timer failed. |
67b5031e MW |
671 | * |
672 | * Use: Tries to initialize the timer @t@, reporting a debug message | |
673 | * if it worked. | |
674 | */ | |
675 | ||
b64eb60f | 676 | static int try_timer(struct timer *t, |
13ee7406 | 677 | const struct timer_ops *ops, unsigned tm) |
b64eb60f | 678 | { |
13ee7406 MW |
679 | if (ops->init(t)) return (-1); |
680 | debug("selected %s timer `%s'", timertab[tm].what, ops->name); | |
681 | t->ops[tm] = ops; return (0); | |
b64eb60f MW |
682 | } |
683 | ||
67b5031e MW |
684 | /* --- @select_timer@ --- * |
685 | * | |
686 | * Arguments: @struct timer *t@ = timer structure | |
13ee7406 MW |
687 | * @unsigned tm@ = which subtimer we're setting |
688 | * @const char *config@, @size_t sz@ = config string | |
67b5031e | 689 | * |
13ee7406 | 690 | * Returns: Zero on success, %$-1$% if timer failed. |
67b5031e MW |
691 | * |
692 | * Use: Select a timer from the table. If the environment variable | |
693 | * is set, then parse a comma-separated list of timer names and | |
694 | * use the first one listed that seems to work; otherwise, try | |
695 | * the timers in the table in order. | |
696 | */ | |
697 | ||
13ee7406 MW |
698 | static int select_timer(struct timer *t, unsigned tm, |
699 | const char *config, size_t sz) | |
b64eb60f | 700 | { |
13ee7406 MW |
701 | const char *p, *l; |
702 | const struct timer_ops *ops, *const *tt; | |
b64eb60f | 703 | |
13ee7406 MW |
704 | if (!config) { |
705 | for (tt = timertab[tm].opstab; *tt; tt++) | |
706 | if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0); | |
b64eb60f | 707 | } else { |
13ee7406 | 708 | l = config + sz; |
b64eb60f | 709 | for (;;) { |
13ee7406 MW |
710 | p = memchr(config, ',', l - config); if (!p) p = l; |
711 | ops = find_timer(config, p - config, tm); | |
712 | if (ops && !try_timer(t, ops, tm)) return (0); | |
713 | if (p >= l) break; | |
714 | config = p + 1; | |
b64eb60f MW |
715 | } |
716 | } | |
13ee7406 | 717 | debug("no suitable %s timer found", timertab[tm].what); return (-1); |
b64eb60f MW |
718 | } |
719 | ||
67b5031e | 720 | /* Bench timer operations. */ |
13ee7406 MW |
721 | static void timer_describe(struct bench_timer *tm, dstr *d) |
722 | { | |
723 | struct timer *t = (struct timer *)tm; | |
724 | unsigned i; | |
725 | ||
726 | dstr_puts(d, "builtin: "); | |
727 | for (i = 0; i < NTIMER; i++) { | |
728 | if (i) dstr_puts(d, ", "); | |
729 | dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name); | |
730 | } | |
731 | } | |
732 | ||
6e683a79 MW |
733 | static int timer_now(struct bench_timer *tm, |
734 | struct bench_time *t_out, unsigned f) | |
b64eb60f MW |
735 | { |
736 | struct timer *t = (struct timer *)tm; | |
13ee7406 | 737 | unsigned i; |
b64eb60f | 738 | |
6e683a79 MW |
739 | t_out->f = 0; |
740 | for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1); | |
741 | return (0); | |
742 | } | |
743 | ||
744 | static void timer_diff(struct bench_timer *tm, | |
745 | struct bench_timing *t_out, | |
746 | const struct bench_time *t0, | |
747 | const struct bench_time *t1) | |
748 | { | |
749 | struct timer *t = (struct timer *)tm; | |
750 | unsigned i; | |
751 | ||
752 | t_out->f = 0; | |
753 | for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1); | |
b64eb60f | 754 | } |
13ee7406 | 755 | |
b64eb60f MW |
756 | static void timer_destroy(struct bench_timer *tm) |
757 | { | |
758 | struct timer *t = (struct timer *)tm; | |
13ee7406 | 759 | unsigned i; |
b64eb60f MW |
760 | |
761 | if (!t) return; | |
13ee7406 MW |
762 | for (i = 0; i < NTIMER; i++) |
763 | if (t->ops[i]) t->ops[i]->teardown(t); | |
b64eb60f MW |
764 | xfree(t); |
765 | } | |
766 | ||
13ee7406 | 767 | static const struct bench_timerops timer_ops = |
6e683a79 | 768 | { timer_describe, timer_now, timer_diff, timer_destroy }; |
b64eb60f | 769 | |
67b5031e MW |
770 | /* --- @bench_createtimer@ --- * |
771 | * | |
13ee7406 | 772 | * Arguments: @const char *config@ = timer configuration string |
67b5031e MW |
773 | * |
774 | * Returns: A freshly constructed standard timer object. | |
775 | * | |
776 | * Use: Allocate a timer. Dispose of it by calling | |
777 | * @tm->ops->destroy(tm)@ when you're done. | |
13ee7406 MW |
778 | * |
779 | * Applications should not set configuration strings except as | |
780 | * established by user action, e.g., from a command-line option, | |
781 | * environment variable, or configuration file. | |
67b5031e MW |
782 | */ |
783 | ||
13ee7406 | 784 | struct bench_timer *bench_createtimer(const char *config) |
b64eb60f MW |
785 | { |
786 | struct timer *t = 0; | |
787 | struct bench_timer *ret = 0; | |
13ee7406 MW |
788 | struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 }; |
789 | const struct timer_ops *const *tt; | |
790 | const char *p, *l; size_t n, nn; | |
791 | unsigned i; | |
b64eb60f | 792 | |
13ee7406 MW |
793 | /* Parse the configuration string. */ |
794 | if (config) { | |
795 | ||
796 | /* The first thing to do is find the end of the string. */ | |
797 | l = config + strlen(config); | |
798 | ||
799 | for (;;) { | |
800 | /* Process the whitespace-sparated words of the string one by one. */ | |
801 | ||
802 | /* Skip over any initial whitespace. If we hit the end of the string | |
803 | * then we're done. | |
804 | */ | |
805 | for (;;) | |
806 | if (config >= l) goto done_config; | |
807 | else if (!ISSPACE(*config)) break; | |
808 | else config++; | |
809 | ||
810 | /* There's definitely a word here. Find the end of it. */ | |
811 | for (p = config; p < l && !ISSPACE(*p); p++); | |
812 | nn = p - config; | |
813 | ||
814 | /* Try various simple keywords. */ | |
815 | #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn)) | |
816 | ||
817 | if (MATCHP("list")) { | |
818 | /* The `list' keyword requests lists of the available timer | |
819 | * implementations. | |
820 | */ | |
821 | ||
822 | for (i = 0; i < NTIMER; i++) { | |
823 | printf("%s timers:", timertab[i].what); | |
824 | for (tt = timertab[i].opstab; *tt; tt++) | |
825 | if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name); | |
826 | putchar('\n'); | |
827 | } | |
828 | goto next_config; | |
829 | } | |
830 | ||
831 | #undef MATCHP | |
832 | ||
833 | /* Otherwise it's an assignment, setting a subtimer list. */ | |
834 | p = memchr(config, '=', nn); | |
835 | if (!p) | |
836 | n = nn; | |
837 | else { | |
838 | n = p - config; | |
839 | for (i = 0; i < NTIMER; i++) | |
840 | if (STRNCMP(config, ==, timertab[i].what, n) && | |
841 | !timertab[i].what[n]) { | |
842 | if (tmconf[i].p) | |
843 | debug("duplicate %s timer list", timertab[i].what); | |
844 | tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1; | |
845 | goto next_config; | |
846 | } | |
847 | } | |
848 | debug("unrecognized config keyword `%.*s'", (int)n, config); | |
849 | ||
850 | /* Move on to the next word. */ | |
851 | next_config: | |
852 | config += nn; | |
853 | } | |
854 | done_config:; | |
855 | } | |
856 | ||
857 | /* Override these settings from the environment. */ | |
858 | for (i = 0; i < NTIMER; i++) { | |
859 | p = getenv(timertab[i].env); | |
860 | if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); } | |
861 | } | |
862 | ||
863 | /* All seems well. Allocate the timer object. */ | |
864 | t = xmalloc(sizeof(*t)); | |
865 | for (i = 0; i < NTIMER; i++) t->ops[i] = 0; | |
866 | ||
867 | /* Try to set up the subtimers. */ | |
6e683a79 | 868 | for (i = NTIMER; i--; ) |
13ee7406 MW |
869 | if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end; |
870 | ||
871 | /* All is done. */ | |
b64eb60f MW |
872 | t->_t.ops = &timer_ops; ret = &t->_t; t = 0; |
873 | end: | |
874 | if (t) timer_destroy(&t->_t); | |
875 | return (ret); | |
876 | } | |
877 | ||
67b5031e | 878 | /*----- Benchmarking ------------------------------------------------------*/ |
b64eb60f | 879 | |
67b5031e MW |
880 | /* --- @bench_init@ --- * |
881 | * | |
882 | * Arguments: @struct bench_state *b@ = bench state to initialize | |
13ee7406 | 883 | * @struct bench_timer *tm@ = timer to attach, or null |
67b5031e | 884 | * |
13ee7406 | 885 | * Returns: Zero on success, %$-1$% on failure. |
67b5031e | 886 | * |
13ee7406 MW |
887 | * Use: Initialize the benchmark state. On success, the timer state |
888 | * still needs to be calibrated (use @bench_calibrate@) before | |
889 | * it can be used, but this will be done automatically by | |
890 | * @bench_measure@ if it's not done by hand earlier. The timer | |
891 | * is now owned by the benchmark state and will be destroyed by | |
892 | * @bench_destroy@. | |
893 | * | |
894 | * The only reason for failure is if @tm@ was null on entry, | |
895 | * and automatic construction of a timer failed. The state is | |
896 | * safe to discard, but calling @bench_destroy@ is safe too. | |
67b5031e | 897 | */ |
b64eb60f | 898 | |
13ee7406 MW |
899 | int bench_init(struct bench_state *b, struct bench_timer *tm) |
900 | { | |
901 | int rc; | |
902 | ||
903 | b->tm = 0; | |
904 | ||
905 | if (!tm) { | |
906 | tm = bench_createtimer(0); | |
907 | if (!tm) { rc = -1; goto end; } | |
908 | } | |
909 | ||
910 | b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0; | |
911 | end: | |
912 | return (rc); | |
913 | } | |
b64eb60f | 914 | |
67b5031e MW |
915 | /* --- @bench_destroy@ --- * |
916 | * | |
917 | * Arguments: @struct bench_state *b@ = bench state | |
918 | * | |
919 | * Returns: --- | |
920 | * | |
921 | * Use: Destroy the benchmark state, releasing the resources that it | |
922 | * holds. | |
923 | */ | |
924 | ||
b64eb60f | 925 | void bench_destroy(struct bench_state *b) |
13ee7406 | 926 | { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } } |
b64eb60f | 927 | |
67b5031e MW |
928 | /* --- @do_nothing@ --- * |
929 | * | |
930 | * Arguments: @unsigned long n@ = iteration count | |
931 | * @void *ctx@ = context pointer (ignored) | |
932 | * | |
933 | * Returns: --- | |
934 | * | |
935 | * Use: Does nothing at all for @n@ iterations. Used to calibrate | |
936 | * the benchmarking state. | |
937 | */ | |
938 | ||
939 | static void do_nothing(unsigned long n, void *ctx) | |
b64eb60f MW |
940 | { while (n--) RELAX; } |
941 | ||
6e683a79 MW |
942 | /* --- @measure@ --- * |
943 | * | |
944 | * Arguments: @struct bench_state *b@ = bench state | |
945 | * @struct bench_timing *delta_out@ = where to leave the timing | |
946 | * @bench_fn *fn@ = function to measure | |
947 | * @void *ctx@ = context for the function | |
948 | * @double n@ = number of iterations | |
949 | * | |
950 | * Returns: --- | |
951 | * | |
952 | * Use: Run the function @n@ times, and report how long it took. | |
953 | * | |
954 | * This function deals with retrying the measurements if the | |
955 | * timer reports a temporary failure, and all of the | |
956 | * difficulties if @n@ is too large to fit in a machine integer. | |
957 | */ | |
958 | ||
959 | static void measure(struct bench_state *b, struct bench_timing *delta_out, | |
960 | bench_fn *fn, void *ctx, double n) | |
961 | { | |
962 | struct bench_timer *tm = b->tm; | |
963 | struct bench_time t0, t1; | |
964 | unsigned long n0, n1; | |
965 | double R = ULONG_MAX; | |
966 | ||
967 | if (n <= R) { | |
968 | n0 = n; | |
969 | do { | |
970 | while (tm->ops->now(tm, &t0, BTF_T0)); | |
971 | fn(n0, ctx); | |
972 | } while (tm->ops->now(tm, &t1, BTF_T1)); | |
973 | } else { | |
974 | n1 = n/R; n0 = n - n1*R; | |
975 | do { | |
976 | while (tm->ops->now(tm, &t0, BTF_T0)); | |
977 | while (n1--) fn(ULONG_MAX, ctx); | |
978 | fn(n0, ctx); | |
979 | } while (tm->ops->now(tm, &t1, BTF_T1)); | |
980 | } | |
981 | tm->ops->diff(tm, delta_out, &t0, &t1); | |
982 | } | |
983 | ||
67b5031e MW |
984 | /* --- @bench_calibrate@ --- * |
985 | * | |
986 | * Arguments: @struct bench_state *b@ = bench state | |
987 | * | |
13ee7406 | 988 | * Returns: Zero on success, %$-1$% if calibration failed. |
67b5031e MW |
989 | * |
990 | * Use: Calibrate the benchmark state, so that it can be used to | |
991 | * measure performance reasonably accurately. | |
992 | */ | |
993 | ||
13ee7406 MW |
994 | #define T_CLB 0.0625 /* calibration time limit */ |
995 | ||
b64eb60f MW |
996 | int bench_calibrate(struct bench_state *b) |
997 | { | |
998 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; | |
b64eb60f | 999 | struct bench_timing delta; |
6e683a79 | 1000 | double n, r; |
b64eb60f | 1001 | bench_fn *fn = LAUNDER(&do_nothing); |
6e683a79 | 1002 | unsigned i, f = BTF_ANY; |
b64eb60f MW |
1003 | int rc; |
1004 | ||
67b5031e MW |
1005 | /* The model here is that a timing loop has a fixed overhead as we enter |
1006 | * and leave (e.g., to do with the indirect branch into the code), and | |
1007 | * per-iteration overheads as we check the counter and loop back. We aim | |
1008 | * to split these apart using linear regression. | |
1009 | */ | |
1010 | ||
1011 | /* If we've already calibrated then there's nothing to do. */ | |
13ee7406 | 1012 | if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1); |
b64eb60f | 1013 | |
67b5031e | 1014 | /* Exercise the inner loop a few times to educate the branch predictor. */ |
6e683a79 | 1015 | for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000); |
b64eb60f | 1016 | |
67b5031e MW |
1017 | /* Now we measure idle loops until they take sufficiently long -- or we run |
1018 | * out of counter. | |
1019 | */ | |
b64eb60f | 1020 | debug("calibrating..."); |
6e683a79 | 1021 | n = 1.0; |
b64eb60f | 1022 | for (;;) { |
67b5031e MW |
1023 | |
1024 | /* Measure @n@ iterations of the idle loop. */ | |
6e683a79 | 1025 | measure(b, &delta, fn, 0, n); f &= delta.f; |
b64eb60f | 1026 | if (!(f&BTF_TIMEOK)) { rc = -1; goto end; } |
67b5031e MW |
1027 | |
1028 | /* Register the timings with the regression machinery. */ | |
b64eb60f MW |
1029 | linreg_update(&lr_clk, n, delta.t); |
1030 | if (!(f&BTF_CYOK)) | |
6e683a79 | 1031 | debug(" n = %10.0f; t = %12g s", n, delta.t); |
b64eb60f MW |
1032 | else { |
1033 | linreg_update(&lr_cy, n, delta.cy); | |
6e683a79 | 1034 | debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); |
b64eb60f | 1035 | } |
67b5031e MW |
1036 | |
1037 | /* If we're done then stop. */ | |
13ee7406 | 1038 | if (delta.t >= T_CLB) break; |
b64eb60f | 1039 | if (n >= ULONG_MAX - n/3) break; |
67b5031e MW |
1040 | |
1041 | /* Update the counter and continue. */ | |
6e683a79 | 1042 | n += n/3.0 + 1.0; |
b64eb60f MW |
1043 | } |
1044 | ||
67b5031e MW |
1045 | /* Now run the linear regression to extract the constant and per-iteration |
1046 | * overheads. | |
1047 | */ | |
13ee7406 MW |
1048 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r); |
1049 | debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r); | |
b64eb60f | 1050 | if (f&BTF_CYOK) { |
13ee7406 MW |
1051 | linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r); |
1052 | debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r); | |
b64eb60f | 1053 | } |
67b5031e MW |
1054 | |
1055 | /* We're done. */ | |
13ee7406 | 1056 | rc = 0; |
b64eb60f | 1057 | end: |
13ee7406 | 1058 | b->f |= f | BTF_CLB; /* no point trying again */ |
b64eb60f MW |
1059 | return (rc); |
1060 | } | |
1061 | ||
67b5031e MW |
1062 | /* --- @bench_measure@ --- * |
1063 | * | |
8d372122 MW |
1064 | * Arguments: @struct bench_state *b@ = benchmark state |
1065 | * @struct bench_timing *t_out@ = where to leave the timing | |
67b5031e MW |
1066 | * @double base@ = number of internal units per call |
1067 | * @bench_fn *fn@, @void *ctx@ = benchmark function to run | |
1068 | * | |
13ee7406 | 1069 | * Returns: Zero on success, %$-1$% if timing failed. |
67b5031e MW |
1070 | * |
1071 | * Use: Measure a function. The function @fn@ is called adaptively | |
1072 | * with an iteration count @n@ set so as to run for | |
1073 | * approximately @b->target_s@ seconds. | |
1074 | * | |
1075 | * The result is left in @*t_out@, with @t_out->n@ counting the | |
1076 | * final product of the iteration count and @base@ (which might, | |
1077 | * e.g., reflect the number of inner iterations the function | |
1078 | * performs, or the number of bytes it processes per iteration). | |
1079 | */ | |
1080 | ||
8d372122 | 1081 | int bench_measure(struct bench_state *b, struct bench_timing *t_out, |
67b5031e | 1082 | double base, bench_fn *fn, void *ctx) |
b64eb60f | 1083 | { |
6e683a79 | 1084 | double n, nn; |
b64eb60f | 1085 | |
8d372122 MW |
1086 | /* Make sure the state is calibrated and usable. */ |
1087 | if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1); | |
1088 | if (!(b->f&BTF_TIMEOK)) return (-1); | |
67b5031e | 1089 | |
c81c35df MW |
1090 | /* Main adaptive measurement loop. |
1091 | * | |
1092 | * Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal | |
1093 | * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy. | |
1094 | * Otherwise, we need to scale up the iteration count. The obvious next | |
1095 | * choice is %$n' = n T/t$%. Alas, rounding is a problem: if | |
1096 | * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no | |
1097 | * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when | |
1098 | * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other | |
1099 | * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying | |
1100 | * again with %$n' = n + 1$% iterations will very likely work. | |
1101 | */ | |
6e683a79 | 1102 | debug("measuring..."); n = 1.0; |
b64eb60f | 1103 | for (;;) { |
6e683a79 | 1104 | measure(b, t_out, fn, ctx, n); t_out->f &= b->f; |
b64eb60f | 1105 | if (!(t_out->f&BTF_TIMEOK)) return (-1); |
6e683a79 MW |
1106 | if (!(t_out->f&BTF_CYOK)) |
1107 | debug(" n = %10.0f; t = %12g", n, t_out->t); | |
1108 | else | |
1109 | debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy); | |
1110 | ||
c81c35df MW |
1111 | if (t_out->t >= 0.707*b->target_s) break; |
1112 | nn = n*b->target_s/t_out->t; | |
6e683a79 | 1113 | if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn; |
c81c35df | 1114 | else n++; |
b64eb60f | 1115 | } |
67b5031e MW |
1116 | |
1117 | /* Adjust according to the calibration. */ | |
c7da785d MW |
1118 | t_out->t -= n*b->clk.m + b->clk.c; |
1119 | if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c; | |
67b5031e MW |
1120 | |
1121 | /* Report the results, if debugging. */ | |
c7da785d MW |
1122 | if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t); |
1123 | else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy); | |
b64eb60f MW |
1124 | if (!(t_out->f&BTF_CYOK)) |
1125 | debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t); | |
1126 | else | |
1127 | debug(" %g s (%g cy) per op; %g ops/s", | |
1128 | t_out->t/n, t_out->cy/n, n/t_out->t); | |
67b5031e MW |
1129 | |
1130 | /* All done. */ | |
e63124bc | 1131 | t_out->n = n*base; return (0); |
b64eb60f MW |
1132 | } |
1133 | ||
1134 | /*----- That's all, folks -------------------------------------------------*/ |