.\" -*-nroff-*- .\" .\" Manual for benchmarking core .\" .\" (c) 2024 Straylight/Edgeware .\" . .\"----- Licensing notice --------------------------------------------------- .\" .\" This file is part of the mLib utilities library. .\" .\" mLib is free software: you can redistribute it and/or modify it under .\" the terms of the GNU Library General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or (at .\" your option) any later version. .\" .\" mLib is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Library General Public .\" License along with mLib. If not, write to the Free Software .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, .\" USA. . .\"-------------------------------------------------------------------------- .so ../defs.man \" @@@PRE@@@ . .\"-------------------------------------------------------------------------- .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library" .\" @bench_createtimer .\" @BENCH_TIMELOOP_DECLS .\" @BENCH_TIMELOOP_TAG . .\" @bench_init .\" @bench_destroy .\" @bench_calibrate .\" @bench_preflight .\" @bench_adapt .\" @bench_adjust .\" @BENCH_MEASURE_DECLS .\" @BENCH_MEASURE_TAG .\" @BENCH_MEASURE .\" @bench_measure . .\" @bench_report . .\"-------------------------------------------------------------------------- .SH NAME bench \- low-level benchmarking tools . .\"-------------------------------------------------------------------------- .SH SYNOPSIS . .nf .B "#include " .PP .B "#define BTF_TIMEOK ..." .B "#define BTF_CYOK ..." .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)" .PP .ta 2n +2n +2n .B "struct bench_time {" .B " unsigned f;" .B " union {" .B " struct { kludge64 s; uint32 ns; } ts;" .B " clock_t clk;" .B " kludge64 rawns;" .B " } t;" .B " kludge64 cy;" .B "};" .PP .B "struct bench_timing {" .B " unsigned f;" .B " double n;" .B " double t;" .B " double cy;" .B "};" .PP .B "#define BTF_T0 0u" .B "#define BTF_T1 ..." .B "struct bench_timerops {" .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d ); .BI " int (*preflight)(struct bench_timer *" bt ); .ta 2n +\w'\fBint (*now)('u .BI " int (*now)(struct bench_timer *" bt , .BI " struct bench_time *" t_out ", unsigned " f ); .ta 2n +\w'\fBvoid (*diff)('u .BI " void (*diff)(struct bench_timer *" bt , .BI " struct bench_timing *" delta_out , .BI " const struct bench_time *" t0 , .BI " const struct bench_time *" t1 ); .BI " void (*destroy)(struct bench_timer *" bt ); .B "};" .B "struct bench_timer {" .B " const struct bench_timerops *ops;" .B " unsigned ref;" .B "};" .PP .B "struct bench_timer *bench_createtimer(void);" .B "BENCH_TIMELOOP_DECLS;" .ta 2n \w'\fBBENCH_TIMELOOP_TAG('u .BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm , .BI " struct bench_timing *" delta_out ", double " n , .BI " " onbreak ) .BI " " stmt .PP .B "#define BTF_CLB ..." .B "#define BTF_INDIRECT ..." .PP .ta 2n .B "struct bench_state {" .B " unsigned f;" .B " double target_s;" .B " ..." .B "}"; .PP .BI "typedef void bench_fn(unsigned long " n ", void *" ctx ); .PP .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm ); .BI "void bench_destroy(struct bench_state *" b ); .BI "int bench_calibrate(struct bench_state *" b ", unsigned " f ); .BI "int bench_preflight(struct bench_state *" b ); .ta \w'\fBint bench_adapt('u .BI "int bench_adapt(struct bench_state *" b ", double *" n_inout , .BI " const struct bench_timing *" t ); .ta \w'\fBint bench_adjust('u .BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout , .BI " double " n ", double " base ); .B "BENCH_MEASURE_DECLS;" .ta 2n \w'\fBBENCH_MEASURE_TAG('u .BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b , .BI " int &" rc ", struct bench_timing *" t_out ", double " bsae ) .BI " " stmt .ta 2n \w'\fBBENCH_MEASURE('u .BI "BENCH_MEASURE(struct bench_state *" b , .BI " int &" rc ", struct bench_timing *" t_out ", double " bsae ) .BI " " stmt .ta \w'\fBint bench_measure('u .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out , .BI " double " base ", bench_fn *" fn ", void *" ctx ); .PP .ta 2n .B "enum {" .B " BTU_OP = 0," .B " BTU_BYTE = 1," .B " ..." .BI " BTU_LIMIT = " n .B "};" .ta \w'\fBvoid bench_report('u .BI "void bench_report(const struct gprintf_ops *" gops ", void *" go , .BI " unsigned " unit ", const struct bench_timing *" t ); .PP .fi . .\"-------------------------------------------------------------------------- .SH DESCRIPTION . The header file .B "" provides declarations and defintions for performing low-level benchmarks. .PP The `main event' are the .B BENCH_MEASURE macro and .B bench_measure function. These will be described in detail later, but, in brief, they execute a caller-provided piece of code instructing it to run adaptively chosen numbers of iterations, in order to get a reasonably reliable measurement of its running time, and then report the results by filling in a structure. .PP With understanding these as our objective, we must examine all of the pieces involved in making them work. . .SS Timers in general A .I timer is a gadget which is capable of reporting the current time, in seconds (ideally precise to tiny fractions of a second), and/or in CPU cycles. A timer is represented by a pointer to an object of type .BR "struct bench_timer" . This structure has two members: .BR ops , pointing to a .BR "struct bench_timerops" , which is a table of function pointers, and .BR ref , which is a simple reference count; typically, a timer has more data following this, but this fact is not exposed to applications. .PP The function pointers in .B "struct bench_timerops" are as follows. The first argument, named .I tm must always point to the timer object itself. .TP .IB tm ->ops->describe( tm ", " d) Write a description of the timer to the dynamic string .IR d . .TP .IB tm ->ops->preflight( tm ) Ensure that the timer is in working order, and perform any necessary per-thread or per-process setup. Return zero if the .B now function is likely to work properly when called from the same thread in the near future; otherwise return \-1. .TP .IB tm ->ops->now( tm ", " t_out ", " f ) Store the current time in .BI * t_out \fR. The .B BTF_T1 flag in .I f to indicate that this is the second call in a pair; leave it clear for the first call. (A fake .B BTF_T0 flag is defined to be zero for symmetry.) Return zero on success .I or permanent failure; return \-1 if timing failed but trying again immediately has a reasonable chance of success. .TP .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 ) Store in .BI * delta_out the difference between the two times .I t0 and .IR t1 . .TP .IB tm ->ops->destroy( tm ) Destroy the timer, releasing all of the resources that it holds. .PP In a freshly-created timer, the .B ref member is 1. Applications are expected to handle the reference count themselves; the .B destroy function does not check or decrement the count. Code for destroying the timer when it's no longer needed might look like this. .VS if (!--tm->ref) tm->ops->destroy(tm); .VE A .B bench_timing structure reports the difference between two times, as determined by a timer's .B diff function. It has four members. .TP .B f A flags word. .B BTF_TIMEOK is set if the passage-of-time measurement in .B t is valid; .B BTF_CYOK is set if the cycle count in .B cy is valid. The mask .B BTF_ANY covers the .B BTF_TIMEOK and .B BTF_CYOK bits: hence, .B f&BTF_ANY is nonzero (true) if the timer returned any valid timing information. .TP .B n The number of units processed the benchmark computation on its satisfactory run, multiplied by a given .IR base \(en see .BR BENCH_MEASURE , .BR bench_measure , and .BR bench_adjust . .TP .B t The time taken for the satisfactory run of the benchmark function, in seconds. Only valid if .B BTF_TIMEOK is set in .BR f . .TP .B cy The number of CPU cycles used in the satisfactory run of the benchmark function, in seconds. Only valid if .B BTF_CYOK is set in .BR f . .PP A .B "struct bench_time" represents a single instant in time, as captured by a timer's .B now function. The use of this structure is a private matter for the timer: the only hard requirement is that the .B diff function should be able to compute the difference between two times. However, the intent is that a passage-of-time measurement is stored in the .B t union, a cycle count is stored in the .B cy member, and the .B f member stores flags .B BTF_TIMEOK and or .B BTF_CYOK if the passage-of-time or cycle count respectively are valid. .PP The .B BENCH_TIMELOOP_TAG macro uses a timer to measure a number of iterations of a computation. It requires the declarations made by .B BENCH_TIMELOOP_DECLS to be in scope, ideally within an enclosing block (rather than at top-level, where they'll have static storage duration, and take longer to access). The macro's expansion is syntactically a statement head; see .BR control (3) for details about the underlying machinery. In more detail, the macro is invoked as .IP .nf .ta 2n .BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak ) .BI " " stmt .fi .PP The .I tag argument is used to distinguish the labels used internally by the macro: see .BR control (3) for details about tags. The macro calls on the timer .I tm to determine the initial time and cycle counts, performs .I n iterations of some computation, and calls on the timer a second time to determine the final time and cycle counts, and to store the difference in .BI * delta_out \fR. The .I stmt may be any C statement: when it is executed, the variable .BR _bench_n , of type .BR "unsigned long" , is in scope. The statement should perform .B _bench_n iterations of the computation to be measured \(en and do as little else as possible. The argument .I n to the macro may be larger than .BR ULONG_MAX : the macro will execute .I stmt multiple times if necessary. The statement is allowed to clobber .BR _bench_n . Note that .B BENCH_TIMELOOP_TAG does .I not call the timer's .B preflight function. If the .I stmt executes a free .B break statement then the statement .I onbreak is executed; a free .B continue statement within .I stmt currently does not have a useful behaviour. Free .B break and .B continue statements within .I onbreak behave normally. (See .BR control (3) for a definition of `free' .B break and .B continue statements.) . .SS The built-in timer The function .B bench_createtimer constructs and returns a timer. It takes a single argument, a string .IR config , from which it reads configuration information. If .B bench_createtimer fails, it returns a null pointer. .PP The .I config pointer may safely be null, in which case a default configuration will be used. Applications .I should only set this pointer to a value supplied by a user, e.g., through a command-line argument, environment variable, or configuration file. .PP The built-in timer makes use of one or two .IR subtimers : a `clock' subtimer to measure the passage of time, and possibly a `cycle' subtimer to count CPU cycles. .PP The configuration string consists of a sequence of words separated by whitespace. There may be additional whitespace at the start and end of the string. The words recognized are as follows. .TP .B list Prints a list of the available clock and cycle subtimers to standard output. .TP .BI clock= t , ... Use the first of the listed clock subtimers to initialize successfully as the clock subtimer. If none of the subtimers can be initialized, then construction of the timer as a whole fails. .TP .BI cycle= t , ... Use the first of the listed subtimers to initialize successfully as the cycle subtimer. If none of the subtimers can be initialized, then construction of the timer as a whole fails. .PP The clock subtimers are as follows. Not all of them will be available on every platform. .TP .B linux-x86-perf-rdpmc-hw-cycles This is a dummy companion to the similarly named cycle subtimer; see its description below. .TP .B posix-thread-cputime Measures the passage of time using .BR clock_gettime (2), specifying the .B CLOCK_\%THREAD_\%CPUTIME_\%ID clock. .TP .B stdc-clock Measures the passage of time using .BR clock (3). Since .BR clock (3) is part of the original ANSI\ C standard, this subtimer should always be available. However, it may produce unhelpful results if other threads are running. .PP The cycle subtimers are as follows. Not all of them will be available on every platform. .TP .B linux-perf-read-hw-cycles Counts CPU cycles using the Linux-specific .BR perf_event_open (2) function to read the .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES counter. Only available on Linux. It will fail to initialize if access to performance counters is restricted, e.g., because the .B /proc/sys/kernel/perf_event_paranoid level is too high. .TP .B linux-perf-rdpmc-hw-cycles Counts CPU cycles using the Linux-specific .BR perf_event_open (2) function, as for .B linux-x86-perf-read-hw-cycles above, except that it additionally uses the i386/AMD64 .B rdtsc and .B rdpmc instructions, together with information provided by the kernel through a memory-mapped page to do its measurements without any system call overheads. It does passage-of-time and cycle counting in a single operation, so no separate clock subtimer is required: the similarly-named clock subtimer does nothing except check that the .B linux-x86-perf-rdpmc-hw-cycles cycle subtimer has been selected. This is almost certainly the best choice if it's available; It is, however, not compatible with (at least some versions of) .BR valgrind (1); it will detect that it is running under .B valgrind and fail to initialize. .TP .B x86-rdtscp Counts CPU cycles using the x86 .B rdtscp instruction. This instruction is not really suitable for performance measurement: it gives misleading results on CPUs with variable clock frequency. .TP .B x86-rdtsc Counts CPU cycles using the x86 .B rdtsc instruction. This has the downsides of .B rdtscp above, but also fails to detect when the thread has been suspended or transferred to a different CPU core and gives misleading answers in this case. Not really recommended. .TP .B null A dummy cycle counter, which will initialize successfully and then fail to report cycle counts. This is a reasonable fallback in many situations. .PP The built-in preference order for clock subtimers, from most to least preferred, is .BR linux-x86-perf-rdpmc-hw-cycles , followed by .BR posix-thread-cputime , and finally .BR stdc-clock . The built-in preference order for cycle subtimers, from most to least preferred, is .BR linux-x86-perf-rdpmc-hw-cycles then .BR linux-x86-perf-read-hw-cycles , followed by .BR x86-rdtscp , and .BR x86-rdtsc , and finally .BR null . . .SS The benchmark state A .I benchmark state tracks the information needed to measure performance of functions. It is represented by a .B struct bench_state structure. .PP The benchmark state is initialized by calling .BR bench_init , passing the address of the state structure to be initialized, and a pointer to a timer. If .B bench_init is called with a non-null timer pointer, then it will not fail; the benchmark state will be initialized, and the function returns zero; the timer's reference count is .I not incremented. If the timer pointer is null, then .B bench_init attempts to construct a timer for itself by calling .BR bench_createtimer . If this succeeds, then the benchmark state will be initialized, and the function returns zero. In both cases, the timer reference becomes owned by the benchmark state: calling .B bench_destroy on the benchmark state will decrement the timer's reference count, and destroy it unless it has additional outstanding references. If .B bench_init is called with a null timer pointer, and its attempt to create a timer for itself fails, then .B bench_init returns \-1: the benchmark state is not initialized and can safely be discarded. .PP Calling .B bench_destroy on a benchmark state releases any resources it holds, most notably its timer, if any. Calling .B bench_destroy on an unsuccessfully initialized benchmark state is safe but has no effect. .PP Although .B struct bench_state is defined in the header file, only two members are available for use by applications. .TP .B f A word containing flags. .TP .B target_s The target time for which to try run a benchmark, in seconds. After initialization, this is set to 1.0, though applications can override it. .PP Before the benchmark state can be used in measurements, it must be .IR calibrated . This is performed by calling .B bench_calibrate on the benchmark state. Calibration takes a noticeable amount of time (currently about 0.25\*,s), so it makes sense to defer it until it's known to be necessary. .PP Calibration is carried out separately, but in parallel, for the timer's passage-of-time measurement and cycle counter. Either or both of these calibrations can succeed or fail; if passage-of-time calibration fails, then cycle count calibration is impossible. .PP The benchmarking state must be calibrated differently for different kinds of timing loop; this is controlled by the flags passed as the .I f argument to .BR bench_calibrate . The main difference lies in whether the code to be measured is called .IR indirectly , i.e., via a function pointer. Set .B BTF_INDIRECT if the code is to be called indirectly; leave this flag clear if the code is called directly. The .B bench_measure function always makes indirect calls; the .B BENCH_MEASURE macro does not itself make indirect calls. Usually, a program needs only one or the other; if both are necessary for some reason, the best approach is just to set up two benchmarking states sharing the same timer, and calibrate them separately. .PP When it completes, .B bench_calibrate sets flags in the benchmark state's .B f member: if passage-of-time calibration succeeded, .B BTF_TIMEOK is set; if cycle-count calibration succeeded, .B BTF_CYOK is set; and the flag .B BTF_CLB is set unconditionally, as a persistent indication that calibration has been attempted. .PP The .B bench_calibrate function returns zero if it successfully calibrated at least the passage-of-time measurement; otherwise, it returns \-1. If .B bench_calibrate is called for a second or subsequent time on the same benchmark state, it returns immediately, either returning 0 or \-1 according to whether passage-of-time had previously been calibrated. .PP The .B BENCH_MEASURE macro measures the performance of a computation. It requires the declarations made by .B BENCH_MEASURE_DECLS to be in scope, ideally within an enclosing block (rather than at top-level, where they'll have static storage duration, and take longer to access). The macro's expansion is syntactically a statement head; see .BR control (3) for details about the underlying machinery. In more detail, the macro is invoked as .IP .nf .ta 2n .BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base ) .BI " " stmt .fi .PP The .I stmt can be any C statement; it should perform .B _bench_n iterations of the computation to be measured. (The variable .B _bench_n is declarared as part of .B BENCH_MEASURE_DECLS and has type .BR "unsigned long" . Before commencing measurement proper, the macro calls .BR bench_preflight , described below, to check that everything is set up properly for measurements on the current thread; if this fails, then the macro sets .I rc to \-1. Otherwise, the macro executes .I stmt one or more times, with the objective of finding an iteration count .I n such that .I n iterations of the computation take more than .IB b ->target_s "" \fR/\*(sr2 seconds. If measurement fails, then .I rc is set to \-1; otherwise, .I rc is set to zero, and .BI * t_out is filled in with the measurement; .IB t_out ->n is set to .IR n "\ \*(mu\ " base . .PP The .B BENCH_MEASURE_TAG macro works just like .B BENCH_MEASURE except that it takes an additional .I tag argument used to distinguish the internal labels used by the macro's implementation; this makes it possible to use .B BENCH_MEASURE_TAG as a component in more complex macros. See .BR control (3) for details about control-structure macros and the meaning and format of tags. .PP The function .B bench_measure is similar, except that it calls a .I benchmark function to perform the computation. A benchmark function has the signature .IP .BI "void " fn "(unsigned long " n ", void *" ctx ); .PP When called, it should perform the operation to be measured .I n times. The .I ctx argument is a pointer passed into .B bench_measure for the benchmark function's own purposes. The .B bench_measure function returns zero on success, or \-1 on failure. Note that .B bench_measure invokes the benchmark indirectly, so the benchmark state should have been calibrated with .BR BTF_INDIRECT . . .SS Measurement utilities The following functions are primarily exported for the benefit of the .B BENCH_MEASURE macro, but are documented here in case they are useful. .PP The .B bench_preflight function prepares a benchmarking state for use. It checks that the timer is calibrated and suitable for measuring passage-of-time; it also calls the timer's .B preflight function to prepare it for measurements on the current thread. If these checks succeed, then .B bench_preflight returns zero; otherwise it returns \-1 and the caller should not proceed with measurements. .PP The .B bench_adapt function is used to determine iteration counts. It is used in a loop such as the following. .IP .nf .ta 2n +2n .B "BENCH_TIMELOOP_DECLS;" .B "struct bench_timer *tm;" .B "struct bench_timing t;" .B "double n = 1.0, target_s = 1.0;" .IP .B "do {" .B " BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })" .BI " " "(do " _bench_n " iterations of some computation)" ; .B "} while (!bench_adapt(&n, target_s, &t));" .fi .PP On entry, .BI *n_inout should be the number of iterations performed by the previous step, and .BI * t the resulting time; the .B BTF_TIMEOK flag must be set in .IB t ->f \fR. If the timing is sufficient \(en if .IR t\fB->t "\ \*(>=\ " target_s /\*(sr2 \(en then .B bench_adapt returns a nonzero value to indicate that measurement is complete. Otherwise, it sets .BI * n_inout to a new, larger iteration count and returns zero to indicate that a further pass is necessary. .PP The .B bench_adjust function adjusts a raw timing, as captured by .BR BENCH_TIMELOOP_TAG , according to the calibration data captured in .IR b . On exit, the timing data is updated, and .IB t ->n is set to the product .IR n "\ \*(mu\ " base . . .SS Reporting results The .B bench_report function formats a measurement result into a human-readable string. The function writes its output using the generalized output formatting operations .I gops and output pointer .IR go ; see .BR gprintf (3) for details on generalized output formatting. The .I unit argument describes the unit of activity being measured: .TP .B BTU_OP counts operations of some unspecified nature, while .TP .B BTU_BYTE counts a number of bytes processed. .PP These are presented differently \(em in particular, quantities bytes are expressed using binary scaling rather than decimal. The timing to report is given by the .I t argument; .IB t ->n gives the number of units processed. . .\"-------------------------------------------------------------------------- .SH EXAMPLE . The following macros offer a fairly simple example of how the benchmarking functions and macros can be used. .VS .ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n #define BENCHMARK_DECLS \e struct bench_timing _bmark_t; \e int _bmark_rc; \e BENCH_MEASURE_DECLS .VP #define BENCHMARK_TAG(tag, b, unit, base) \e MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \e MC_AFTER(tag##__benchmark_after, { \e if (_bmark_rc) \e puts(": FAILED"); \e else { \e fputs(": ", stdout); \e bench_report(&file_printops, stdout, (unit), &_bmark_tm);\ \e putchar('\en'); \e } \e }) \e BENCH_MEASURE_TAG(tag##__bmarkmark_measure, \e (b), _bmark_rc, &_bmark_t, (base)) #define BENCHMARK(b, unit, base) \e BENCHMARK_TAG(bench, b, unit, base) .VE .\"-------------------------------------------------------------------------- .SH "SEE ALSO" . .BR control (3), .BR macros (3), .BR tvec-bench (3), .BR mLib (3). . .\"-------------------------------------------------------------------------- .SH AUTHOR . Mark Wooding, . .\"----- That's all, folks --------------------------------------------------