3 .\" Manual for benchmarking core
5 .\" (c) 2024 Straylight/Edgeware
8 .\"----- Licensing notice ---------------------------------------------------
10 .\" This file is part of the mLib utilities library.
12 .\" mLib is free software: you can redistribute it and/or modify it under
13 .\" the terms of the GNU Library General Public License as published by
14 .\" the Free Software Foundation; either version 2 of the License, or (at
15 .\" your option) any later version.
17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 .\" License for more details.
22 .\" You should have received a copy of the GNU Library General Public
23 .\" License along with mLib. If not, write to the Free Software
24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 .\"--------------------------------------------------------------------------
28 .so ../defs.man \" @@@PRE@@@
30 .\"--------------------------------------------------------------------------
31 .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
32 .\" @bench_createtimer
33 .\" @BENCH_TIMELOOP_DECLS
34 .\" @BENCH_TIMELOOP_TAG
42 .\" @BENCH_MEASURE_DECLS
43 .\" @BENCH_MEASURE_TAG
49 .\"--------------------------------------------------------------------------
51 bench \- low-level benchmarking tools
53 .\"--------------------------------------------------------------------------
57 .B "#include <mLib/bench.h>"
59 .B "#define BTF_TIMEOK ..."
60 .B "#define BTF_CYOK ..."
61 .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
64 .B "struct bench_time {"
67 .B " struct { kludge64 s; uint32 ns; } ts;"
74 .B "struct bench_timing {"
81 .B "#define BTF_T0 0u"
82 .B "#define BTF_T1 ..."
83 .B "struct bench_timerops {"
84 .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d );
85 .BI " int (*preflight)(struct bench_timer *" bt );
86 .ta 2n +\w'\fBint (*now)('u
87 .BI " int (*now)(struct bench_timer *" bt ,
88 .BI " struct bench_time *" t_out ", unsigned " f );
89 .ta 2n +\w'\fBvoid (*diff)('u
90 .BI " void (*diff)(struct bench_timer *" bt ,
91 .BI " struct bench_timing *" delta_out ,
92 .BI " const struct bench_time *" t0 ,
93 .BI " const struct bench_time *" t1 );
94 .BI " void (*destroy)(struct bench_timer *" bt );
96 .B "struct bench_timer {"
97 .B " const struct bench_timerops *ops;"
101 .B "struct bench_timer *bench_createtimer(void);"
102 .B "BENCH_TIMELOOP_DECLS;"
103 .ta 2n \w'\fBBENCH_TIMELOOP_TAG('u
104 .BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm ,
105 .BI " struct bench_timing *" delta_out ", double " n ,
109 .B "#define BTF_CLB ..."
110 .B "#define BTF_INDIRECT ..."
113 .B "struct bench_state {"
115 .B " double target_s;"
119 .BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
121 .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
122 .BI "void bench_destroy(struct bench_state *" b );
123 .BI "int bench_calibrate(struct bench_state *" b ", unsigned " f );
124 .BI "int bench_preflight(struct bench_state *" b );
125 .ta \w'\fBint bench_adapt('u
126 .BI "int bench_adapt(struct bench_state *" b ", double *" n_inout ,
127 .BI " const struct bench_timing *" t );
128 .ta \w'\fBint bench_adjust('u
129 .BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout ,
130 .BI " double " n ", double " base );
131 .B "BENCH_MEASURE_DECLS;"
132 .ta 2n \w'\fBBENCH_MEASURE_TAG('u
133 .BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b ,
134 .BI " int &" rc ", struct bench_timing *" t_out ", double " bsae )
136 .ta 2n \w'\fBBENCH_MEASURE('u
137 .BI "BENCH_MEASURE(struct bench_state *" b ,
138 .BI " int &" rc ", struct bench_timing *" t_out ", double " bsae )
140 .ta \w'\fBint bench_measure('u
141 .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
142 .BI " double " base ", bench_fn *" fn ", void *" ctx );
149 .BI " BTU_LIMIT = " n
151 .ta \w'\fBvoid bench_report('u
152 .BI "void bench_report(const struct gprintf_ops *" gops ", void *" go ,
153 .BI " unsigned " unit ", const struct bench_timing *" t );
157 .\"--------------------------------------------------------------------------
162 provides declarations and defintions
163 for performing low-level benchmarks.
165 The `main event' are the
170 These will be described in detail later,
172 they execute a caller-provided piece of code
173 instructing it to run adaptively chosen numbers of iterations,
174 in order to get a reasonably reliable measurement of its running time,
175 and then report the results by filling in a structure.
177 With understanding these as our objective,
178 we must examine all of the pieces involved in making them work.
180 .SS Timers in general
183 is a gadget which is capable of reporting the current time,
184 in seconds (ideally precise to tiny fractions of a second),
185 and/or in CPU cycles.
186 A timer is represented by a pointer to an object of type
187 .BR "struct bench_timer" .
188 This structure has two members:
191 .BR "struct bench_timerops" ,
192 which is a table of function pointers,
195 which is a simple reference count;
196 typically, a timer has more data following this,
197 but this fact is not exposed to applications.
199 The function pointers in
200 .B "struct bench_timerops"
205 must always point to the timer object itself.
207 .IB tm ->ops->describe( tm ", " d)
208 Write a description of the timer to the dynamic string
211 .IB tm ->ops->preflight( tm )
212 Ensure that the timer is in working order,
213 and perform any necessary per-thread or per-process setup.
216 function is likely to work properly
217 when called from the same thread
219 otherwise return \-1.
221 .IB tm ->ops->now( tm ", " t_out ", " f )
222 Store the current time in
228 to indicate that this is the second call in a pair;
229 leave it clear for the first call.
232 flag is defined to be zero for symmetry.)
233 Return zero on success
236 return \-1 if timing failed but
237 trying again immediately has a reasonable chance of success.
239 .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
242 the difference between the two times
247 .IB tm ->ops->destroy( tm )
249 releasing all of the resources that it holds.
251 In a freshly-created timer, the
254 Applications are expected to handle the reference count themselves;
257 function does not check or decrement the count.
258 Code for destroying the timer when it's no longer needed
259 might look like this.
261 if (!--tm->ref) tm->ops->destroy(tm);
265 structure reports the difference between two times,
266 as determined by a timer's
274 is set if the passage-of-time measurement in
278 is set if the cycle count in
291 if the timer returned any valid timing information.
294 The number of units processed the benchmark computation
295 on its satisfactory run,
296 multiplied by a given
305 The time taken for the satisfactory run of the benchmark function,
313 The number of CPU cycles used
314 in the satisfactory run of the benchmark function,
322 .B "struct bench_time"
323 represents a single instant in time,
324 as captured by a timer's
327 The use of this structure is a private matter for the timer:
328 the only hard requirement is that the
330 function should be able to compute the difference between two times.
331 However, the intent is that
332 a passage-of-time measurement is stored in the
335 a cycle count is stored in the
344 if the passage-of-time or cycle count respectively are valid.
347 .B BENCH_TIMELOOP_TAG
348 macro uses a timer to measure a number of iterations of a computation.
349 It requires the declarations made by
350 .B BENCH_TIMELOOP_DECLS
352 ideally within an enclosing block
353 (rather than at top-level,
354 where they'll have static storage duration,
355 and take longer to access).
356 The macro's expansion is syntactically a statement head;
359 for details about the underlying machinery.
360 In more detail, the macro is invoked as
364 .BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak )
370 argument is used to distinguish
371 the labels used internally by the macro:
374 for details about tags.
375 The macro calls on the timer
377 to determine the initial time and cycle counts,
380 iterations of some computation,
381 and calls on the timer a second time
382 to determine the final time and cycle counts,
383 and to store the difference in
387 may be any C statement:
392 .BR "unsigned long" ,
394 The statement should perform
396 iterations of the computation to be measured
397 \(en and do as little else as possible.
403 the macro will execute
405 multiple times if necessary.
406 The statement is allowed to clobber
409 .B BENCH_TIMELOOP_TAG
427 currently does not have a useful behaviour.
444 .SS The built-in timer
447 constructs and returns a timer.
448 It takes a single argument,
451 from which it reads configuration information.
454 fails, it returns a null pointer.
458 pointer may safely be null,
459 in which case a default configuration will be used.
462 set this pointer to a value supplied by a user,
463 e.g., through a command-line argument,
464 environment variable, or
467 The built-in timer makes use of one or two
469 a `clock' subtimer to measure the passage of time,
470 and possibly a `cycle' subtimer to count CPU cycles.
472 The configuration string consists of a sequence of words
473 separated by whitespace.
474 There may be additional whitespace at the start and end of the string.
475 The words recognized are as follows.
478 Prints a list of the available clock and cycle subtimers
482 Use the first of the listed clock subtimers
483 to initialize successfully
484 as the clock subtimer.
485 If none of the subtimers can be initialized,
486 then construction of the timer as a whole fails.
489 Use the first of the listed subtimers
490 to initialize successfully
491 as the cycle subtimer.
492 If none of the subtimers can be initialized,
493 then construction of the timer as a whole fails.
495 The clock subtimers are as follows.
496 Not all of them will be available on every platform.
498 .B linux-x86-perf-rdpmc-hw-cycles
499 This is a dummy companion to the similarly named cycle subtimer;
500 see its description below.
502 .B posix-thread-cputime
503 Measures the passage of time using
504 .BR clock_gettime (2),
506 .B CLOCK_\%THREAD_\%CPUTIME_\%ID
510 Measures the passage of time using
514 is part of the original ANSI\ C standard,
515 this subtimer should always be available.
516 However, it may produce unhelpful results
517 if other threads are running.
519 The cycle subtimers are as follows.
520 Not all of them will be available on every platform.
522 .B linux-perf-read-hw-cycles
523 Counts CPU cycles using the Linux-specific
524 .BR perf_event_open (2)
526 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
528 Only available on Linux.
529 It will fail to initialize
530 if access to performance counters is restricted,
532 .B /proc/sys/kernel/perf_event_paranoid
535 .B linux-perf-rdpmc-hw-cycles
536 Counts CPU cycles using the Linux-specific
537 .BR perf_event_open (2)
540 .B linux-x86-perf-read-hw-cycles
542 except that it additionally uses the i386/AMD64
547 together with information provided by the kernel
548 through a memory-mapped page
549 to do its measurements without any system call overheads.
550 It does passage-of-time and cycle counting in a single operation,
551 so no separate clock subtimer is required:
552 the similarly-named clock subtimer does nothing
553 except check that the
554 .B linux-x86-perf-rdpmc-hw-cycles
555 cycle subtimer has been selected.
556 This is almost certainly the best choice if it's available;
557 It is, however, not compatible with (at least some versions of)
559 it will detect that it is running under
561 and fail to initialize.
564 Counts CPU cycles using the x86
567 This instruction is not really suitable for performance measurement:
568 it gives misleading results on CPUs with variable clock frequency.
571 Counts CPU cycles using the x86
574 This has the downsides of
577 but also fails to detect when the thread has been suspended
578 or transferred to a different CPU core
579 and gives misleading answers in this case.
580 Not really recommended.
583 A dummy cycle counter,
584 which will initialize successfully
585 and then fail to report cycle counts.
586 This is a reasonable fallback in many situations.
588 The built-in preference order for clock subtimers,
589 from most to least preferred, is
590 .BR linux-x86-perf-rdpmc-hw-cycles ,
592 .BR posix-thread-cputime ,
595 The built-in preference order for cycle subtimers,
596 from most to least preferred, is
597 .BR linux-x86-perf-rdpmc-hw-cycles
599 .BR linux-x86-perf-read-hw-cycles ,
607 .SS The benchmark state
610 tracks the information needed to measure performance of functions.
611 It is represented by a
612 .B struct bench_state
615 The benchmark state is initialized by calling
617 passing the address of the state structure to be initialized,
618 and a pointer to a timer.
621 is called with a non-null timer pointer,
622 then it will not fail;
623 the benchmark state will be initialized,
624 and the function returns zero;
625 the timer's reference count is
628 If the timer pointer is null,
631 attempts to construct a timer for itself
633 .BR bench_createtimer .
635 then the benchmark state will be initialized,
636 and the function returns zero.
638 the timer reference becomes owned by the benchmark state:
641 on the benchmark state will decrement the timer's reference count,
642 and destroy it unless it has additional outstanding references.
645 is called with a null timer pointer,
646 and its attempt to create a timer for itself fails,
650 the benchmark state is not initialized
651 and can safely be discarded.
656 releases any resources it holds,
657 most notably its timer, if any.
660 on an unsuccessfully initialized benchmark state
661 is safe but has no effect.
664 .B struct bench_state
665 is defined in the header file,
666 only two members are available for use by applications.
669 A word containing flags.
672 The target time for which to try run a benchmark, in seconds.
673 After initialization, this is set to 1.0,
674 though applications can override it.
676 Before the benchmark state can be used in measurements,
679 This is performed by calling
681 on the benchmark state.
682 Calibration takes a noticeable amount of time
683 (currently about 0.25\*,s),
684 so it makes sense to defer it until it's known to be necessary.
686 Calibration is carried out separately, but in parallel,
687 for the timer's passage-of-time measurement and cycle counter.
688 Either or both of these calibrations can succeed or fail;
689 if passage-of-time calibration fails,
690 then cycle count calibration is impossible.
692 The benchmarking state must be calibrated differently
693 for different kinds of timing loop;
694 this is controlled by the flags passed as the
697 .BR bench_calibrate .
698 The main difference lies in whether the code to be measured
701 i.e., via a function pointer.
704 if the code is to be called indirectly;
705 leave this flag clear if the code is called directly.
708 function always makes indirect calls;
711 macro does not itself make indirect calls.
712 Usually, a program needs only one or the other;
713 if both are necessary for some reason,
714 the best approach is just to set up two benchmarking states
715 sharing the same timer,
716 and calibrate them separately.
720 sets flags in the benchmark state's
723 if passage-of-time calibration succeeded,
726 if cycle-count calibration succeeded,
731 is set unconditionally,
732 as a persistent indication that calibration has been attempted.
736 function returns zero if it successfully calibrated
737 at least the passage-of-time measurement;
738 otherwise, it returns \-1.
741 is called for a second or subsequent time on the same benchmark state,
742 it returns immediately,
743 either returning 0 or \-1
744 according to whether passage-of-time had previously been calibrated.
748 macro measures the performance of a computation.
749 It requires the declarations made by
750 .B BENCH_MEASURE_DECLS
752 ideally within an enclosing block
753 (rather than at top-level,
754 where they'll have static storage duration,
755 and take longer to access).
756 The macro's expansion is syntactically a statement head;
759 for details about the underlying machinery.
760 In more detail, the macro is invoked as
764 .BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base )
770 can be any C statement;
773 iterations of the computation to be measured.
776 is declarared as part of
777 .B BENCH_MEASURE_DECLS
779 .BR "unsigned long" .
780 Before commencing measurement proper,
782 .BR bench_preflight ,
784 to check that everything is set up properly
785 for measurements on the current thread;
786 if this fails, then the macro sets
789 Otherwise, the macro executes
792 with the objective of finding an iteration count
796 iterations of the computation take more than
797 .IB b ->target_s "" \fR/\(sr2
799 If measurement fails,
807 is filled in with the measurement;
810 .IR n "\ \(mu\ " base .
814 macro works just like
816 except that it takes an additional
818 argument used to distinguish the internal labels
819 used by the macro's implementation;
820 this makes it possible to use
822 as a component in more complex macros.
825 for details about control-structure macros
826 and the meaning and format of tags.
831 except that it calls a
832 .I benchmark function
833 to perform the computation.
834 A benchmark function has the signature
836 .BI "void " fn "(unsigned long " n ", void *" ctx );
838 When called, it should perform the operation to be measured
843 argument is a pointer passed into
845 for the benchmark function's own purposes.
848 function returns zero on success,
852 invokes the benchmark indirectly,
853 so the benchmark state should have been calibrated with
856 .SS Measurement utilities
857 The following functions are primarily exported for the benefit of the
860 but are documented here in case they are useful.
864 function prepares a benchmarking state for use.
865 It checks that the timer is calibrated
866 and suitable for measuring passage-of-time;
867 it also calls the timer's
869 function to prepare it for measurements on the current thread.
870 If these checks succeed, then
873 otherwise it returns \-1
874 and the caller should not proceed with measurements.
878 function is used to determine iteration counts.
879 It is used in a loop such as the following.
883 .B "BENCH_TIMELOOP_DECLS;"
884 .B "struct bench_timer *tm;"
885 .B "struct bench_timing t;"
886 .B "double n = 1.0, target_s = 1.0;"
889 .B " BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })"
890 .BI " " "(do " _bench_n " iterations of some computation)" ;
891 .B "} while (!bench_adapt(&n, target_s, &t));"
896 should be the number of iterations performed by the previous step,
904 If the timing is sufficient \(en if
905 .IR t\fB->t "\ \*(>=\ " target_s /\(sr2
908 returns a nonzero value to indicate that measurement is complete.
911 to a new, larger iteration count
912 and returns zero to indicate that a further pass is necessary.
916 function adjusts a raw timing,
918 .BR BENCH_TIMELOOP_TAG ,
919 according to the calibration data captured in
921 On exit, the timing data is updated,
924 is set to the product
925 .IR n "\ \(mu\ " base .
927 .SS Reporting results
930 function formats a measurement result
931 into a human-readable string.
932 The function writes its output using the
933 generalized output formatting operations
939 for details on generalized output formatting.
942 argument describes the unit of activity being measured:
945 counts operations of some unspecified nature, while
948 counts a number of bytes processed.
950 These are presented differently
952 quantities bytes are expressed using binary scaling rather than decimal.
953 The timing to report is given by the
957 gives the number of units processed.
959 .\"--------------------------------------------------------------------------
962 The following macros offer a fairly simple example of
963 how the benchmarking functions and macros can be used.
965 .ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n
966 #define BENCHMARK_DECLS \e
967 struct bench_timing _bmark_t; \e
971 #define BENCHMARK_TAG(tag, b, unit, base) \e
972 MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \e
973 MC_AFTER(tag##__benchmark_after, { \e
975 printf(": FAILED\en"); \e
977 fputs(": ", stdout); \e
978 bench_report(&file_printops, stdout, (unit), &_bmark_tm);\ \e
982 BENCH_MEASURE_TAG(tag##__bmarkmark_measure, \e
983 (b), _bmark_rc, &_bmark_t, (base))
984 #define BENCHMARK(b, unit, base) \e
985 BENCHMARK_TAG(bench, b, unit, base)
988 .\"--------------------------------------------------------------------------
996 .\"--------------------------------------------------------------------------
999 Mark Wooding, <mdw@distorted.org.uk>
1001 .\"----- That's all, folks --------------------------------------------------