.\" -*-nroff-*-
.\"
.\" Manual for benchmarking core
.\"
.\" (c) 2024 Straylight/Edgeware
.\"
.
.\"----- Licensing notice ---------------------------------------------------
.\"
.\" This file is part of the mLib utilities library.
.\"
.\" mLib is free software: you can redistribute it and/or modify it under
.\" the terms of the GNU Library General Public License as published by
.\" the Free Software Foundation; either version 2 of the License, or (at
.\" your option) any later version.
.\"
.\" mLib is distributed in the hope that it will be useful, but WITHOUT
.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
.\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
.\" License for more details.
.\"
.\" You should have received a copy of the GNU Library General Public
.\" License along with mLib.  If not, write to the Free Software
.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
.\" USA.
.
.\"--------------------------------------------------------------------------
.so ../defs.man \" @@@PRE@@@
.
.\"--------------------------------------------------------------------------
.TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
.\" @bench_createtimer
.\" @BENCH_TIMELOOP_DECLS
.\" @BENCH_TIMELOOP_TAG
.
.\" @bench_init
.\" @bench_destroy
.\" @bench_calibrate
.\" @bench_preflight
.\" @bench_adapt
.\" @bench_adjust
.\" @BENCH_MEASURE_DECLS
.\" @BENCH_MEASURE_TAG
.\" @BENCH_MEASURE
.\" @bench_measure
.
.\" @bench_report
.
.\"--------------------------------------------------------------------------
.SH NAME
bench \- low-level benchmarking tools
.
.\"--------------------------------------------------------------------------
.SH SYNOPSIS
.
.nf
.B "#include <mLib/bench.h>"
.PP
.B "#define BTF_TIMEOK ..."
.B "#define BTF_CYOK ..."
.B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
.PP
.ta 2n +2n +2n
.B "struct bench_time {"
.B "	unsigned f;"
.B "	union {"
.B "		struct { kludge64 s; uint32 ns; } ts;"
.B "		clock_t clk;"
.B "		kludge64 rawns;"
.B "	} t;"
.B "	kludge64 cy;"
.B "};"
.PP
.B "struct bench_timing {"
.B "	unsigned f;"
.B "	double n;"
.B "	double t;"
.B "	double cy;"
.B "};"
.PP
.B "#define BTF_T0 0u"
.B "#define BTF_T1 ..."
.B "struct bench_timerops {"
.BI "	void (*describe)(struct bench_timer *" bt ", dstr *" d );
.BI "	int (*preflight)(struct bench_timer *" bt );
.ta 2n +\w'\fBint (*now)('u
.BI "	int (*now)(struct bench_timer *" bt ,
.BI "		struct bench_time *" t_out ", unsigned " f );
.ta 2n +\w'\fBvoid (*diff)('u
.BI "	void (*diff)(struct bench_timer *" bt ,
.BI "		struct bench_timing *" delta_out ,
.BI "		const struct bench_time *" t0 ,
.BI "		const struct bench_time *" t1 );
.BI "	void (*destroy)(struct bench_timer *" bt );
.B "};"
.B "struct bench_timer {"
.B "	const struct bench_timerops *ops;"
.B "	unsigned ref;"
.B "};"
.PP
.B "struct bench_timer *bench_createtimer(void);"
.B "BENCH_TIMELOOP_DECLS;"
.ta 2n \w'\fBBENCH_TIMELOOP_TAG('u
.BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm ,
.BI "		struct bench_timing *" delta_out ", double " n ,
.BI "		" onbreak )
.BI "	" stmt
.PP
.B "#define BTF_CLB ..."
.B "#define BTF_INDIRECT ..."
.PP
.ta 2n
.B "struct bench_state {"
.B "	unsigned f;"
.B "	double target_s;"
.B "	..."
.B "}";
.PP
.BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
.PP
.BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
.BI "void bench_destroy(struct bench_state *" b );
.BI "int bench_calibrate(struct bench_state *" b ", unsigned " f );
.BI "int bench_preflight(struct bench_state *" b );
.ta \w'\fBint bench_adapt('u
.BI "int bench_adapt(struct bench_state *" b ", double *" n_inout ,
.BI "	const struct bench_timing *" t );
.ta \w'\fBint bench_adjust('u
.BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout ,
.BI "	double " n ", double " base );
.B "BENCH_MEASURE_DECLS;"
.ta 2n \w'\fBBENCH_MEASURE_TAG('u
.BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b ,
.BI "		int &" rc ", struct bench_timing *" t_out ", double " bsae )
.BI "	" stmt
.ta 2n \w'\fBBENCH_MEASURE('u
.BI "BENCH_MEASURE(struct bench_state *" b ,
.BI "		int &" rc ", struct bench_timing *" t_out ", double " bsae )
.BI "	" stmt
.ta \w'\fBint bench_measure('u
.BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
.BI "	double " base ", bench_fn *" fn ", void *" ctx );
.PP
.ta 2n
.B "enum {"
.B "	BTU_OP = 0,"
.B "	BTU_BYTE = 1,"
.B "	..."
.BI "	BTU_LIMIT = " n
.B "};"
.ta \w'\fBvoid bench_report('u
.BI "void bench_report(const struct gprintf_ops *" gops ", void *" go ,
.BI "	unsigned " unit ", const struct bench_timing *" t );
.PP
.fi
.
.\"--------------------------------------------------------------------------
.SH DESCRIPTION
.
The header file
.B "<mLib/bench.h>"
provides declarations and defintions
for performing low-level benchmarks.
.PP
The `main event' are the
.B BENCH_MEASURE
macro and
.B bench_measure
function.
These will be described in detail later,
but, in brief,
they execute a caller-provided piece of code
instructing it to run adaptively chosen numbers of iterations,
in order to get a reasonably reliable measurement of its running time,
and then report the results by filling in a structure.
.PP
With understanding these as our objective,
we must examine all of the pieces involved in making them work.
.
.SS Timers in general
A
.I timer
is a gadget which is capable of reporting the current time,
in seconds (ideally precise to tiny fractions of a second),
and/or in CPU cycles.
A timer is represented by a pointer to an object of type
.BR "struct bench_timer" .
This structure has two members:
.BR ops ,
pointing to a
.BR "struct bench_timerops" ,
which is a table of function pointers,
and
.BR ref ,
which is a simple reference count;
typically, a timer has more data following this,
but this fact is not exposed to applications.
.PP
The function pointers in
.B "struct bench_timerops"
are as follows.
The first argument,
named
.I tm
must always point to the timer object itself.
.TP
.IB tm ->ops->describe( tm ", " d)
Write a description of the timer to the dynamic string
.IR d .
.TP
.IB tm ->ops->preflight( tm )
Ensure that the timer is in working order,
and perform any necessary per-thread or per-process setup.
Return zero if the
.B now
function is likely to work properly
when called from the same thread
in the near future;
otherwise return \-1.
.TP
.IB tm ->ops->now( tm ", " t_out ", " f )
Store the current time in
.BI * t_out \fR.
The
.B BTF_T1
flag in
.I f
to indicate that this is the second call in a pair;
leave it clear for the first call.
(A fake
.B BTF_T0
flag is defined to be zero for symmetry.)
Return zero on success
.I or
permanent failure;
return \-1 if timing failed but
trying again immediately has a reasonable chance of success.
.TP
.IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
Store in
.BI * delta_out
the difference between the two times
.I t0
and
.IR t1 .
.TP
.IB tm ->ops->destroy( tm )
Destroy the timer,
releasing all of the resources that it holds.
.PP
In a freshly-created timer, the
.B ref
member is 1.
Applications are expected to handle the reference count themselves;
the
.B destroy
function does not check or decrement the count.
Code for destroying the timer when it's no longer needed
might look like this.
.VS
if (!--tm->ref) tm->ops->destroy(tm);
.VE
A
.B bench_timing
structure reports the difference between two times,
as determined by a timer's
.B diff
function.
It has four members.
.TP
.B f
A flags word.
.B BTF_TIMEOK
is set if the passage-of-time measurement in
.B t
is valid;
.B BTF_CYOK
is set if the cycle count in
.B cy
is valid.
The mask
.B BTF_ANY
covers the
.B BTF_TIMEOK
and
.B BTF_CYOK
bits:
hence,
.B f&BTF_ANY
is nonzero (true)
if the timer returned any valid timing information.
.TP
.B n
The number of units processed the benchmark computation
on its satisfactory run,
multiplied by a given
.IR base
\(en see
.BR BENCH_MEASURE ,
.BR bench_measure ,
and
.BR bench_adjust .
.TP
.B t
The time taken for the satisfactory run of the benchmark function,
in seconds.
Only valid if
.B BTF_TIMEOK
is set in
.BR f .
.TP
.B cy
The number of CPU cycles used
in the satisfactory run of the benchmark function,
in seconds.
Only valid if
.B BTF_CYOK
is set in
.BR f .
.PP
A
.B "struct bench_time"
represents a single instant in time,
as captured by a timer's
.B now
function.
The use of this structure is a private matter for the timer:
the only hard requirement is that the
.B diff
function should be able to compute the difference between two times.
However, the intent is that
a passage-of-time measurement is stored in the
.B t
union,
a cycle count is stored in the
.B cy
member, and
the
.B f
member stores flags
.B BTF_TIMEOK
and or
.B BTF_CYOK
if the passage-of-time or cycle count respectively are valid.
.PP
The
.B BENCH_TIMELOOP_TAG
macro uses a timer to measure a number of iterations of a computation.
It requires the declarations made by
.B BENCH_TIMELOOP_DECLS
to be in scope,
ideally within an enclosing block
(rather than at top-level,
where they'll have static storage duration,
and take longer to access).
The macro's expansion is syntactically a statement head;
see
.BR control (3)
for details about the underlying machinery.
In more detail, the macro is invoked as
.IP
.nf
.ta 2n
.BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak )
.BI "	" stmt
.fi
.PP
The
.I tag
argument is used to distinguish
the labels used internally by the macro:
see
.BR control (3)
for details about tags.
The macro calls on the timer
.I tm
to determine the initial time and cycle counts,
performs
.I n
iterations of some computation,
and calls on the timer a second time
to determine the final time and cycle counts,
and to store the difference in
.BI * delta_out \fR.
The
.I stmt
may be any C statement:
when it is executed,
the variable
.BR _bench_n ,
of type
.BR "unsigned long" ,
is in scope.
The statement should perform
.B _bench_n
iterations of the computation to be measured
\(en and do as little else as possible.
The argument
.I n
to the macro
may be larger than
.BR ULONG_MAX :
the macro will execute
.I stmt
multiple times if necessary.
The statement is allowed to clobber
.BR _bench_n .
Note that
.B BENCH_TIMELOOP_TAG
does
.I not
call the timer's
.B preflight
function.
If the
.I stmt
executes a free
.B break
statement
then the statement
.I onbreak
is executed;
a free
.B continue
statement within
.I stmt
currently does not have a useful behaviour.
Free
.B break
and
.B continue
statements within
.I onbreak
behave normally.
(See
.BR control (3)
for a definition of
`free'
.B break
and
.B continue
statements.)
.
.SS The built-in timer
The function
.B bench_createtimer
constructs and returns a timer.
It takes a single argument,
a string
.IR config ,
from which it reads configuration information.
If
.B bench_createtimer
fails, it returns a null pointer.
.PP
The
.I config
pointer may safely be null,
in which case a default configuration will be used.
Applications
.I should only
set this pointer to a value supplied by a user,
e.g., through a command-line argument,
environment variable, or
configuration file.
.PP
The built-in timer makes use of one or two
.IR subtimers :
a `clock' subtimer to measure the passage of time,
and possibly a `cycle' subtimer to count CPU cycles.
.PP
The configuration string consists of a sequence of words
separated by whitespace.
There may be additional whitespace at the start and end of the string.
The words recognized are as follows.
.TP
.B list
Prints a list of the available clock and cycle subtimers
to standard output.
.TP
.BI clock= t , ...
Use the first of the listed clock subtimers
to initialize successfully
as the clock subtimer.
If none of the subtimers can be initialized,
then construction of the timer as a whole fails.
.TP
.BI cycle= t , ...
Use the first of the listed subtimers
to initialize successfully
as the cycle subtimer.
If none of the subtimers can be initialized,
then construction of the timer as a whole fails.
.PP
The clock subtimers are as follows.
Not all of them will be available on every platform.
.TP
.B linux-x86-perf-rdpmc-hw-cycles
This is a dummy companion to the similarly named cycle subtimer;
see its description below.
.TP
.B posix-thread-cputime
Measures the passage of time using
.BR clock_gettime (2),
specifying the
.B CLOCK_\%THREAD_\%CPUTIME_\%ID
clock.
.TP
.B stdc-clock
Measures the passage of time using
.BR clock (3).
Since
.BR clock (3)
is part of the original ANSI\ C standard,
this subtimer should always be available.
However, it may produce unhelpful results
if other threads are running.
.PP
The cycle subtimers are as follows.
Not all of them will be available on every platform.
.TP
.B linux-perf-read-hw-cycles
Counts CPU cycles using the Linux-specific
.BR perf_event_open (2)
function to read the
.BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
counter.
Only available on Linux.
It will fail to initialize
if access to performance counters is restricted,
e.g., because the
.B /proc/sys/kernel/perf_event_paranoid
level is too high.
.TP
.B linux-perf-rdpmc-hw-cycles
Counts CPU cycles using the Linux-specific
.BR perf_event_open (2)
function,
as for
.B linux-x86-perf-read-hw-cycles
above,
except that it additionally uses the i386/AMD64
.B rdtsc
and
.B rdpmc
instructions,
together with information provided by the kernel
through a memory-mapped page
to do its measurements without any system call overheads.
It does passage-of-time and cycle counting in a single operation,
so no separate clock subtimer is required:
the similarly-named clock subtimer does nothing
except check that the
.B linux-x86-perf-rdpmc-hw-cycles
cycle subtimer has been selected.
This is almost certainly the best choice if it's available;
It is, however, not compatible with (at least some versions of)
.BR valgrind (1);
it will detect that it is running under
.B valgrind
and fail to initialize.
.TP
.B x86-rdtscp
Counts CPU cycles using the x86
.B rdtscp
instruction.
This instruction is not really suitable for performance measurement:
it gives misleading results on CPUs with variable clock frequency.
.TP
.B x86-rdtsc
Counts CPU cycles using the x86
.B rdtsc
instruction.
This has the downsides of
.B rdtscp
above,
but also fails to detect when the thread has been suspended
or transferred to a different CPU core
and gives misleading answers in this case.
Not really recommended.
.TP
.B null
A dummy cycle counter,
which will initialize successfully
and then fail to report cycle counts.
This is a reasonable fallback in many situations.
.PP
The built-in preference order for clock subtimers,
from most to least preferred, is
.BR linux-x86-perf-rdpmc-hw-cycles ,
followed by
.BR posix-thread-cputime ,
and finally
.BR stdc-clock .
The built-in preference order for cycle subtimers,
from most to least preferred, is
.BR linux-x86-perf-rdpmc-hw-cycles
then
.BR linux-x86-perf-read-hw-cycles ,
followed by
.BR x86-rdtscp ,
and
.BR x86-rdtsc ,
and finally
.BR null .
.
.SS The benchmark state
A
.I benchmark state
tracks the information needed to measure performance of functions.
It is represented by a
.B struct bench_state
structure.
.PP
The benchmark state is initialized by calling
.BR bench_init ,
passing the address of the state structure to be initialized,
and a pointer to a timer.
If
.B bench_init
is called with a non-null timer pointer,
then it will not fail;
the benchmark state will be initialized,
and the function returns zero;
the timer's reference count is
.I not
incremented.
If the timer pointer is null,
then
.B bench_init
attempts to construct a timer for itself
by calling
.BR bench_createtimer .
If this succeeds,
then the benchmark state will be initialized,
and the function returns zero.
In both cases,
the timer reference becomes owned by the benchmark state:
calling
.B bench_destroy
on the benchmark state will decrement the timer's reference count,
and destroy it unless it has additional outstanding references.
If
.B bench_init
is called with a null timer pointer,
and its attempt to create a timer for itself fails,
then
.B bench_init
returns \-1:
the benchmark state is not initialized
and can safely be discarded.
.PP
Calling
.B bench_destroy
on a benchmark state
releases any resources it holds,
most notably its timer, if any.
Calling
.B bench_destroy
on an unsuccessfully initialized benchmark state
is safe but has no effect.
.PP
Although
.B struct bench_state
is defined in the header file,
only two members are available for use by applications.
.TP
.B f
A word containing flags.
.TP
.B target_s
The target time for which to try run a benchmark, in seconds.
After initialization, this is set to 1.0,
though applications can override it.
.PP
Before the benchmark state can be used in measurements,
it must be
.IR calibrated .
This is performed by calling
.B bench_calibrate
on the benchmark state.
Calibration takes a noticeable amount of time
(currently about 0.25\*,s),
so it makes sense to defer it until it's known to be necessary.
.PP
Calibration is carried out separately, but in parallel,
for the timer's passage-of-time measurement and cycle counter.
Either or both of these calibrations can succeed or fail;
if passage-of-time calibration fails,
then cycle count calibration is impossible.
.PP
The benchmarking state must be calibrated differently
for different kinds of timing loop;
this is controlled by the flags passed as the
.I f
argument to
.BR bench_calibrate .
The main difference lies in whether the code to be measured
is called
.IR indirectly ,
i.e., via a function pointer.
Set
.B BTF_INDIRECT
if the code is to be called indirectly;
leave this flag clear if the code is called directly.
The
.B bench_measure
function always makes indirect calls;
the
.B BENCH_MEASURE
macro does not itself make indirect calls.
Usually, a program needs only one or the other;
if both are necessary for some reason,
the best approach is just to set up two benchmarking states
sharing the same timer,
and calibrate them separately.
.PP
When it completes,
.B bench_calibrate
sets flags in the benchmark state's
.B f
member:
if passage-of-time calibration succeeded,
.B BTF_TIMEOK
is set;
if cycle-count calibration succeeded,
.B BTF_CYOK
is set;
and the flag
.B BTF_CLB
is set unconditionally,
as a persistent indication that calibration has been attempted.
.PP
The
.B bench_calibrate
function returns zero if it successfully calibrated
at least the passage-of-time measurement;
otherwise, it returns \-1.
If
.B bench_calibrate
is called for a second or subsequent time on the same benchmark state,
it returns immediately,
either returning 0 or \-1
according to whether passage-of-time had previously been calibrated.
.PP
The
.B BENCH_MEASURE
macro measures the performance of a computation.
It requires the declarations made by
.B BENCH_MEASURE_DECLS
to be in scope,
ideally within an enclosing block
(rather than at top-level,
where they'll have static storage duration,
and take longer to access).
The macro's expansion is syntactically a statement head;
see
.BR control (3)
for details about the underlying machinery.
In more detail, the macro is invoked as
.IP
.nf
.ta 2n
.BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base )
.BI "	" stmt
.fi
.PP
The
.I stmt
can be any C statement;
it should perform
.B _bench_n
iterations of the computation to be measured.
(The variable
.B _bench_n
is declarared as part of
.B BENCH_MEASURE_DECLS
and has type
.BR "unsigned long" .
Before commencing measurement proper,
the macro calls
.BR bench_preflight ,
described below,
to check that everything is set up properly
for measurements on the current thread;
if this fails, then the macro sets
.I rc
to \-1.
Otherwise, the macro executes
.I stmt
one or more times,
with the objective of finding an iteration count
.I n
such that
.I n
iterations of the computation take more than
.IB b ->target_s "" \fR/\*(sr2
seconds.
If measurement fails,
then
.I rc
is set to \-1;
otherwise,
.I rc
is set to zero, and
.BI * t_out
is filled in with the measurement;
.IB t_out ->n
is set to
.IR n "\ \*(mu\ " base .
.PP
The
.B BENCH_MEASURE_TAG
macro works just like
.B BENCH_MEASURE
except that it takes an additional
.I tag
argument used to distinguish the internal labels
used by the macro's implementation;
this makes it possible to use
.B BENCH_MEASURE_TAG
as a component in more complex macros.
See
.BR control (3)
for details about control-structure macros
and the meaning and format of tags.
.PP
The function
.B bench_measure
is similar,
except that it calls a
.I benchmark function
to perform the computation.
A benchmark function has the signature
.IP
.BI "void " fn "(unsigned long " n ", void *" ctx );
.PP
When called, it should perform the operation to be measured
.I n
times.
The
.I ctx
argument is a pointer passed into
.B bench_measure
for the benchmark function's own purposes.
The
.B bench_measure
function returns zero on success,
or \-1 on failure.
Note that
.B bench_measure
invokes the benchmark indirectly,
so the benchmark state should have been calibrated with
.BR BTF_INDIRECT .
.
.SS Measurement utilities
The following functions are primarily exported for the benefit of the
.B BENCH_MEASURE
macro,
but are documented here in case they are useful.
.PP
The
.B bench_preflight
function prepares a benchmarking state for use.
It checks that the timer is calibrated
and suitable for measuring passage-of-time;
it also calls the timer's
.B preflight
function to prepare it for measurements on the current thread.
If these checks succeed, then
.B bench_preflight
returns zero;
otherwise it returns \-1
and the caller should not proceed with measurements.
.PP
The
.B bench_adapt
function is used to determine iteration counts.
It is used in a loop such as the following.
.IP
.nf
.ta 2n +2n
.B "BENCH_TIMELOOP_DECLS;"
.B "struct bench_timer *tm;"
.B "struct bench_timing t;"
.B "double n = 1.0, target_s = 1.0;"
.IP
.B "do {"
.B "	BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })"
.BI "		" "(do " _bench_n " iterations of some computation)" ;
.B "} while (!bench_adapt(&n, target_s, &t));"
.fi
.PP
On entry,
.BI *n_inout
should be the number of iterations performed by the previous step,
and
.BI * t
the resulting time;
the
.B BTF_TIMEOK
flag must be set in
.IB t ->f \fR.
If the timing is sufficient \(en if
.IR t\fB->t "\ \*(>=\ " target_s /\*(sr2
\(en then
.B bench_adapt
returns a nonzero value to indicate that measurement is complete.
Otherwise, it sets
.BI * n_inout
to a new, larger iteration count
and returns zero to indicate that a further pass is necessary.
.PP
The
.B bench_adjust
function adjusts a raw timing,
as captured by
.BR BENCH_TIMELOOP_TAG ,
according to the calibration data captured in
.IR b .
On exit, the timing data is updated,
and
.IB t ->n
is set to the product
.IR n "\ \*(mu\ " base .
.
.SS Reporting results
The
.B bench_report
function formats a measurement result
into a human-readable string.
The function writes its output using the
generalized output formatting operations
.I gops
and output pointer
.IR go ;
see
.BR gprintf (3)
for details on generalized output formatting.
The
.I unit
argument describes the unit of activity being measured:
.TP
.B BTU_OP
counts operations of some unspecified nature, while
.TP
.B BTU_BYTE
counts a number of bytes processed.
.PP
These are presented differently
\(em in particular,
quantities bytes are expressed using binary scaling rather than decimal.
The timing to report is given by the
.I t
argument;
.IB t ->n
gives the number of units processed.
.
.\"--------------------------------------------------------------------------
.SH EXAMPLE
.
The following macros offer a fairly simple example of
how the benchmarking functions and macros can be used.
.VS
.ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n
#define BENCHMARK_DECLS					\e
	struct bench_timing _bmark_t;				\e
	int _bmark_rc;				\e
	BENCH_MEASURE_DECLS
.VP
#define BENCHMARK_TAG(tag, b, unit, base)					\e
	MC_BEFORE(tag##__benchmark_before, { fflush(stdout); })				\e
	MC_AFTER(tag##__benchmark_after, {				\e
		if (_bmark_rc)			\e
			puts(": FAILED");		\e
		else {			\e
			fputs(": ", stdout);		\e
			bench_report(&file_printops, stdout, (unit), &_bmark_tm);\ 	\e
			putchar('\en');		\e
		}			\e
	})				\e
	BENCH_MEASURE_TAG(tag##__bmarkmark_measure,				\e
				(b), _bmark_rc, &_bmark_t, (base))
#define BENCHMARK(b, unit, base)					\e
	BENCHMARK_TAG(bench, b, unit, base)
.VE

.\"--------------------------------------------------------------------------
.SH "SEE ALSO"
.
.BR control (3),
.BR macros (3),
.BR tvec-bench (3),
.BR mLib (3).
.
.\"--------------------------------------------------------------------------
.SH AUTHOR
.
Mark Wooding, <mdw@distorted.org.uk>
.
.\"----- That's all, folks --------------------------------------------------