chiark - git - mdw - mLib/blob - test/bench.3.in

   1 .\" -*-nroff-*-
   2 .\"
   3 .\" Manual for benchmarking core
   4 .\"
   5 .\" (c) 2024 Straylight/Edgeware
   6 .\"
   7 .
   8 .\"----- Licensing notice ---------------------------------------------------
   9 .\"
  10 .\" This file is part of the mLib utilities library.
  11 .\"
  12 .\" mLib is free software: you can redistribute it and/or modify it under
  13 .\" the terms of the GNU Library General Public License as published by
  14 .\" the Free Software Foundation; either version 2 of the License, or (at
  15 .\" your option) any later version.
  16 .\"
  17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
  18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19 .\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20 .\" License for more details.
  21 .\"
  22 .\" You should have received a copy of the GNU Library General Public
  23 .\" License along with mLib.  If not, write to the Free Software
  24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 .\" USA.
  26 .
  27 .\"--------------------------------------------------------------------------
  28 .so ../defs.man \" @@@PRE@@@
  29 .
  30 .\"--------------------------------------------------------------------------
  31 .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
  32 .\" @bench_createtimer
  33 .\" @BENCH_TIMELOOP_DECLS
  34 .\" @BENCH_TIMELOOP_TAG
  35 .
  36 .\" @bench_init
  37 .\" @bench_destroy
  38 .\" @bench_calibrate
  39 .\" @bench_preflight
  40 .\" @bench_adapt
  41 .\" @bench_adjust
  42 .\" @BENCH_MEASURE_DECLS
  43 .\" @BENCH_MEASURE_TAG
  44 .\" @BENCH_MEASURE
  45 .\" @bench_measure
  46 .
  47 .\" @bench_report
  48 .
  49 .\"--------------------------------------------------------------------------
  50 .SH NAME
  51 bench \- low-level benchmarking tools
  52 .
  53 .\"--------------------------------------------------------------------------
  54 .SH SYNOPSIS
  55 .
  56 .nf
  57 .B "#include <mLib/bench.h>"
  58 .PP
  59 .B "#define BTF_TIMEOK ..."
  60 .B "#define BTF_CYOK ..."
  61 .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
  62 .PP
  63 .ta 2n +2n +2n
  64 .B "struct bench_time {"
  65 .B "    unsigned f;"
  66 .B "    union {"
  67 .B "            struct { kludge64 s; uint32 ns; } ts;"
  68 .B "            clock_t clk;"
  69 .B "            kludge64 rawns;"
  70 .B "    } t;"
  71 .B "    kludge64 cy;"
  72 .B "};"
  73 .PP
  74 .B "struct bench_timing {"
  75 .B "    unsigned f;"
  76 .B "    double n;"
  77 .B "    double t;"
  78 .B "    double cy;"
  79 .B "};"
  80 .PP
  81 .B "#define BTF_T0 0u"
  82 .B "#define BTF_T1 ..."
  83 .B "struct bench_timerops {"
  84 .BI "   void (*describe)(struct bench_timer *" bt ", dstr *" d );
  85 .BI "   int (*preflight)(struct bench_timer *" bt );
  86 .ta 2n +\w'\fBint (*now)('u
  87 .BI "   int (*now)(struct bench_timer *" bt ,
  88 .BI "           struct bench_time *" t_out ", unsigned " f );
  89 .ta 2n +\w'\fBvoid (*diff)('u
  90 .BI "   void (*diff)(struct bench_timer *" bt ,
  91 .BI "           struct bench_timing *" delta_out ,
  92 .BI "           const struct bench_time *" t0 ,
  93 .BI "           const struct bench_time *" t1 );
  94 .BI "   void (*destroy)(struct bench_timer *" bt );
  95 .B "};"
  96 .B "struct bench_timer {"
  97 .B "    const struct bench_timerops *ops;"
  98 .B "    unsigned ref;"
  99 .B "};"
 100 .PP
 101 .BI "struct bench_timer *bench_createtimer(const char *" config );
 102 .B "BENCH_TIMELOOP_DECLS;"
 103 .ta 2n \w'\fBBENCH_TIMELOOP_TAG('u
 104 .BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm ,
 105 .BI "           struct bench_timing *" delta_out ", double " n ,
 106 .BI "           " onbreak )
 107 .BI "   " stmt
 108 .PP
 109 .B "#define BTF_CLB ..."
 110 .B "#define BTF_INDIRECT ..."
 111 .PP
 112 .ta 2n
 113 .B "struct bench_state {"
 114 .B "    unsigned f;"
 115 .B "    double target_s;"
 116 .B "    ..."
 117 .B "};"
 118 .PP
 119 .BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
 120 .PP
 121 .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
 122 .BI "void bench_destroy(struct bench_state *" b );
 123 .BI "int bench_calibrate(struct bench_state *" b ", unsigned " f );
 124 .BI "int bench_preflight(struct bench_state *" b );
 125 .ta \w'\fBint bench_adapt('u
 126 .BI "int bench_adapt(struct bench_state *" b ", double *" n_inout ,
 127 .BI "   const struct bench_timing *" t );
 128 .ta \w'\fBint bench_adjust('u
 129 .BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout ,
 130 .BI "   double " n ", double " base );
 131 .B "BENCH_MEASURE_DECLS;"
 132 .ta 2n \w'\fBBENCH_MEASURE_TAG('u
 133 .BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b ,
 134 .BI "           int &" rc ", struct bench_timing *" t_out ", double " base )
 135 .BI "   " stmt
 136 .ta 2n \w'\fBBENCH_MEASURE('u
 137 .BI "BENCH_MEASURE(struct bench_state *" b ,
 138 .BI "           int &" rc ", struct bench_timing *" t_out ", double " base )
 139 .BI "   " stmt
 140 .ta \w'\fBint bench_measure('u
 141 .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
 142 .BI "   double " base ", bench_fn *" fn ", void *" ctx );
 143 .PP
 144 .ta 2n
 145 .B "enum {"
 146 .B "    BTU_OP = 0,"
 147 .B "    BTU_BYTE = 1,"
 148 .B "    ..."
 149 .BI "   BTU_LIMIT = " n
 150 .B "};"
 151 .ta \w'\fBvoid bench_report('u
 152 .BI "void bench_report(const struct gprintf_ops *" gops ", void *" go ,
 153 .BI "   unsigned " unit ", const struct bench_timing *" t );
 154 .PP
 155 .fi
 156 .
 157 .\"--------------------------------------------------------------------------
 158 .SH DESCRIPTION
 159 .
 160 The header file
 161 .B "<mLib/bench.h>"
 162 provides declarations and definitions
 163 for performing low-level benchmarks.
 164 .PP
 165 The `main event' are the
 166 .B BENCH_MEASURE
 167 macro and
 168 .B bench_measure
 169 function.
 170 These will be described in detail later,
 171 but, in brief,
 172 they execute a caller-provided piece of code
 173 instructing it to run adaptively chosen numbers of iterations,
 174 in order to get a reasonably reliable measurement of its running time,
 175 and then report the results by filling in a structure.
 176 .PP
 177 With understanding these as our objective,
 178 we must examine all of the pieces involved in making them work.
 179 .
 180 .SS Timers in general
 181 A
 182 .I timer
 183 is a gadget which is capable of reporting the current time,
 184 in seconds (ideally precise to tiny fractions of a second),
 185 and/or in CPU cycles.
 186 A timer is represented by a pointer to an object of type
 187 .BR "struct bench_timer" .
 188 This structure has two members:
 189 .BR ops ,
 190 pointing to a
 191 .BR "struct bench_timerops" ,
 192 which is a table of function pointers,
 193 and
 194 .BR ref ,
 195 which is a simple reference count;
 196 typically, a timer has more data following this,
 197 but this fact is not exposed to applications.
 198 .PP
 199 The function pointers in
 200 .B "struct bench_timerops"
 201 are as follows.
 202 The first argument,
 203 named
 204 .IR tm ,
 205 must always point to the timer object itself.
 206 .TP
 207 .IB tm ->ops->describe( tm ", " d )
 208 Write a description of the timer to the dynamic string
 209 .IR d .
 210 .TP
 211 .IB tm ->ops->preflight( tm )
 212 Ensure that the timer is in working order,
 213 and perform any necessary per-thread or per-process setup.
 214 Return zero if the
 215 .B now
 216 function is likely to work properly
 217 when called from the same thread
 218 in the near future;
 219 otherwise return \-1.
 220 .TP
 221 .IB tm ->ops->now( tm ", " t_out ", " f )
 222 Store the current time in
 223 .BI * t_out \fR.
 224 The
 225 .B BTF_T1
 226 flag is set in
 227 .I f
 228 to indicate that this is the second call in a pair;
 229 leave it clear for the first call.
 230 (A fake
 231 .B BTF_T0
 232 flag is defined to be zero for symmetry.)
 233 Return zero on success
 234 .I or
 235 permanent failure;
 236 return \-1 if timing failed but
 237 trying again immediately has a reasonable chance of success.
 238 .TP
 239 .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
 240 Store in
 241 .BI * delta_out
 242 the difference between the two times
 243 .I t0
 244 and
 245 .IR t1 .
 246 .TP
 247 .IB tm ->ops->destroy( tm )
 248 Destroy the timer,
 249 releasing all of the resources that it holds.
 250 .PP
 251 In a freshly-created timer, the
 252 .B ref
 253 member is 1.
 254 Applications are expected to handle the reference count themselves;
 255 the
 256 .B destroy
 257 function does not check or decrement the count.
 258 Code for destroying the timer when it's no longer needed
 259 might look like this.
 260 .VS
 261 if (!--tm->ref) tm->ops->destroy(tm);
 262 .VE
 263 A
 264 .B bench_timing
 265 structure reports the difference between two times,
 266 as determined by a timer's
 267 .B diff
 268 function.
 269 It has four members.
 270 .TP
 271 .B f
 272 A flags word.
 273 .B BTF_TIMEOK
 274 is set if the passage-of-time measurement in
 275 .B t
 276 is valid;
 277 .B BTF_CYOK
 278 is set if the cycle count in
 279 .B cy
 280 is valid.
 281 The mask
 282 .B BTF_ANY
 283 covers the
 284 .B BTF_TIMEOK
 285 and
 286 .B BTF_CYOK
 287 bits:
 288 hence,
 289 .B f&BTF_ANY
 290 is nonzero (true)
 291 if the timer returned any valid timing information.
 292 .TP
 293 .B n
 294 The number of units processed by the benchmark computation
 295 on its satisfactory run,
 296 multiplied by a given
 297 .IR base
 298 \(en see
 299 .BR BENCH_MEASURE ,
 300 .BR bench_measure ,
 301 and
 302 .BR bench_adjust .
 303 .TP
 304 .B t
 305 The time taken for the satisfactory run of the benchmark function,
 306 in seconds.
 307 Only valid if
 308 .B BTF_TIMEOK
 309 is set in
 310 .BR f .
 311 .TP
 312 .B cy
 313 The number of CPU cycles used
 314 in the satisfactory run
 315 of the benchmark function.
 316 Only valid if
 317 .B BTF_CYOK
 318 is set in
 319 .BR f .
 320 .PP
 321 A
 322 .B "struct bench_time"
 323 represents a single instant in time,
 324 as captured by a timer's
 325 .B now
 326 function.
 327 The use of this structure is a private matter for the timer:
 328 the only hard requirement is that the
 329 .B diff
 330 function should be able to compute the difference between two times.
 331 However, the intent is that
 332 a passage-of-time measurement is stored in the
 333 .B t
 334 union,
 335 a cycle count is stored in the
 336 .B cy
 337 member, and
 338 the
 339 .B f
 340 member stores flags
 341 .B BTF_TIMEOK
 342 and/or
 343 .B BTF_CYOK
 344 if the passage-of-time or cycle count respectively are valid.
 345 .PP
 346 The
 347 .B BENCH_TIMELOOP_TAG
 348 macro uses a timer to measure a number of iterations of a computation.
 349 It requires the declarations made by
 350 .B BENCH_TIMELOOP_DECLS
 351 to be in scope,
 352 ideally within an enclosing block
 353 (rather than at top-level,
 354 where they'll have static storage duration,
 355 and take longer to access).
 356 The macro's expansion is syntactically a statement head;
 357 see
 358 .BR control (3)
 359 for details about the underlying machinery.
 360 In more detail, the macro is invoked as
 361 .IP
 362 .nf
 363 .ta 2n
 364 .BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak )
 365 .BI "   " stmt
 366 .fi
 367 .PP
 368 The
 369 .I tag
 370 argument is used to distinguish
 371 the labels used internally by the macro:
 372 see
 373 .BR control (3)
 374 for details about tags.
 375 The macro calls on the timer
 376 .I tm
 377 to determine the initial time and cycle counts,
 378 performs
 379 .I n
 380 iterations of some computation,
 381 and calls on the timer a second time
 382 to determine the final time and cycle counts,
 383 and to store the difference in
 384 .BI * delta_out \fR.
 385 The
 386 .I stmt
 387 may be any C statement:
 388 when it is executed,
 389 the variable
 390 .BR _bench_n ,
 391 of type
 392 .BR "unsigned long" ,
 393 is in scope.
 394 The statement should perform
 395 .B _bench_n
 396 iterations of the computation to be measured
 397 \(en and do as little else as possible.
 398 The argument
 399 .I n
 400 to the macro
 401 may be larger than
 402 .BR ULONG_MAX :
 403 the macro will execute
 404 .I stmt
 405 multiple times if necessary.
 406 The statement is allowed to clobber
 407 .BR _bench_n .
 408 Note that
 409 .B BENCH_TIMELOOP_TAG
 410 does
 411 .I not
 412 call the timer's
 413 .B preflight
 414 function.
 415 If the
 416 .I stmt
 417 executes a free
 418 .B break
 419 statement
 420 then the statement
 421 .I onbreak
 422 is executed;
 423 a free
 424 .B continue
 425 statement within
 426 .I stmt
 427 currently does not have a useful behaviour.
 428 Free
 429 .B break
 430 and
 431 .B continue
 432 statements within
 433 .I onbreak
 434 behave normally.
 435 (See
 436 .BR control (3)
 437 for a definition of
 438 `free'
 439 .B break
 440 and
 441 .B continue
 442 statements.)
 443 .
 444 .SS The built-in timer
 445 The function
 446 .B bench_createtimer
 447 constructs and returns a timer.
 448 It takes a single argument,
 449 a string
 450 .IR config ,
 451 from which it reads configuration information.
 452 If
 453 .B bench_createtimer
 454 fails, it returns a null pointer.
 455 .PP
 456 The
 457 .I config
 458 pointer may safely be null,
 459 in which case a default configuration will be used.
 460 Applications
 461 .I should only
 462 set this pointer to a value supplied by a user,
 463 e.g., through a command-line argument,
 464 environment variable, or
 465 configuration file.
 466 .PP
 467 The built-in timer makes use of one or two
 468 .IR subtimers :
 469 a `clock' subtimer to measure the passage of time,
 470 and possibly a `cycle' subtimer to count CPU cycles.
 471 .PP
 472 The configuration string consists of a sequence of words
 473 separated by whitespace.
 474 There may be additional whitespace at the start and end of the string.
 475 The words recognized are as follows.
 476 .TP
 477 .B list
 478 Prints a list of the available clock and cycle subtimers
 479 to standard output.
 480 .TP
 481 .BI clock= t , ...
 482 Use the first of the listed clock subtimers
 483 to initialize successfully
 484 as the clock subtimer.
 485 If none of the subtimers can be initialized,
 486 then construction of the timer as a whole fails.
 487 .TP
 488 .BI cycle= t , ...
 489 Use the first of the listed subtimers
 490 to initialize successfully
 491 as the cycle subtimer.
 492 If none of the subtimers can be initialized,
 493 then construction of the timer as a whole fails.
 494 .PP
 495 The clock subtimers are as follows.
 496 Not all of them will be available on every platform.
 497 .TP
 498 .B linux-x86-perf-rdpmc-hw-cycles
 499 This is a dummy companion to the similarly named cycle subtimer;
 500 see its description below.
 501 .TP
 502 .B posix-thread-cputime
 503 Measures the passage of time using
 504 .BR clock_gettime (2),
 505 specifying the
 506 .B CLOCK_\%THREAD_\%CPUTIME_\%ID
 507 clock.
 508 .TP
 509 .B stdc-clock
 510 Measures the passage of time using
 511 .BR clock (3).
 512 Since
 513 .BR clock (3)
 514 is part of the original ANSI\ C standard,
 515 this subtimer should always be available.
 516 However, it may produce unhelpful results
 517 if other threads are running.
 518 .PP
 519 The cycle subtimers are as follows.
 520 Not all of them will be available on every platform.
 521 .TP
 522 .B linux-perf-read-hw-cycles
 523 Counts CPU cycles using the Linux-specific
 524 .BR perf_event_open (2)
 525 function to read the
 526 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
 527 counter.
 528 Only available on Linux.
 529 It will fail to initialize
 530 if access to performance counters is restricted,
 531 e.g., because the
 532 .B /proc/sys/kernel/perf_event_paranoid
 533 level is too high.
 534 .TP
 535 .B linux-x86-perf-rdpmc-hw-cycles
 536 Counts CPU cycles using the Linux-specific
 537 .BR perf_event_open (2)
 538 function,
 539 as for
 540 .B linux-perf-read-hw-cycles
 541 above,
 542 except that it additionally uses the i386/AMD64
 543 .B rdtsc
 544 and
 545 .B rdpmc
 546 instructions,
 547 together with information provided by the kernel
 548 through a memory-mapped page
 549 to do its measurements without any system call overheads.
 550 It does passage-of-time and cycle counting in a single operation,
 551 so no separate clock subtimer is required:
 552 the similarly-named clock subtimer does nothing
 553 except check that the
 554 .B linux-x86-perf-rdpmc-hw-cycles
 555 cycle subtimer has been selected.
 556 This is almost certainly the best choice if it's available.
 557 It is, however, not compatible with (at least some versions of)
 558 .BR valgrind (1);
 559 it will detect that it is running under
 560 .B valgrind
 561 and fail to initialize.
 562 .TP
 563 .B x86-rdtscp
 564 Counts CPU cycles using the x86
 565 .B rdtscp
 566 instruction.
 567 This instruction is not really suitable for performance measurement:
 568 it gives misleading results on CPUs with variable clock frequency.
 569 .TP
 570 .B x86-rdtsc
 571 Counts CPU cycles using the x86
 572 .B rdtsc
 573 instruction.
 574 This has the downsides of
 575 .B rdtscp
 576 above,
 577 but also fails to detect when the thread has been suspended
 578 or transferred to a different CPU core
 579 and gives misleading answers in this case.
 580 Not really recommended.
 581 .TP
 582 .B null
 583 A dummy cycle counter,
 584 which will initialize successfully
 585 and then fail to report cycle counts.
 586 This is a reasonable fallback in many situations.
 587 .PP
 588 The built-in preference order for clock subtimers,
 589 from most to least preferred, is
 590 .BR linux-x86-perf-rdpmc-hw-cycles ,
 591 followed by
 592 .BR posix-thread-cputime ,
 593 and finally
 594 .BR stdc-clock .
 595 The built-in preference order for cycle subtimers,
 596 from most to least preferred, is
 597 .BR linux-x86-perf-rdpmc-hw-cycles
 598 then
 599 .BR linux-perf-read-hw-cycles ,
 600 followed by
 601 .BR x86-rdtscp ,
 602 and
 603 .BR x86-rdtsc ,
 604 and finally
 605 .BR null .
 606 .
 607 .SS The benchmark state
 608 A
 609 .I benchmark state
 610 tracks the information needed to measure performance of functions.
 611 It is represented by a
 612 .B struct bench_state
 613 structure.
 614 .PP
 615 The benchmark state is initialized by calling
 616 .BR bench_init ,
 617 passing the address of the state structure to be initialized,
 618 and a pointer to a timer.
 619 If
 620 .B bench_init
 621 is called with a non-null timer pointer,
 622 then it will not fail;
 623 the benchmark state will be initialized,
 624 and the function returns zero;
 625 the timer's reference count is
 626 .I not
 627 incremented.
 628 If the timer pointer is null,
 629 then
 630 .B bench_init
 631 attempts to construct a timer for itself
 632 by calling
 633 .BR bench_createtimer .
 634 If this succeeds,
 635 then the benchmark state will be initialized,
 636 and the function returns zero.
 637 In both cases,
 638 the timer reference becomes owned by the benchmark state:
 639 calling
 640 .B bench_destroy
 641 on the benchmark state will decrement the timer's reference count,
 642 and destroy it unless it has additional outstanding references.
 643 If
 644 .B bench_init
 645 is called with a null timer pointer,
 646 and its attempt to create a timer for itself fails,
 647 then
 648 .B bench_init
 649 returns \-1:
 650 the benchmark state is not initialized
 651 and can safely be discarded.
 652 .PP
 653 Calling
 654 .B bench_destroy
 655 on a benchmark state
 656 releases any resources it holds,
 657 most notably its timer, if any.
 658 Calling
 659 .B bench_destroy
 660 on an unsuccessfully initialized benchmark state
 661 is safe but has no effect.
 662 .PP
 663 Although
 664 .B struct bench_state
 665 is defined in the header file,
 666 only two members are available for use by applications.
 667 .TP
 668 .B f
 669 A word containing flags.
 670 .TP
 671 .B target_s
 672 The target time for which to try to run a benchmark, in seconds.
 673 After initialization, this is set to 1.0,
 674 though applications can override it.
 675 .PP
 676 Before the benchmark state can be used in measurements,
 677 it must be
 678 .IR calibrated .
 679 This is performed by calling
 680 .B bench_calibrate
 681 on the benchmark state.
 682 Calibration takes a noticeable amount of time
 683 (currently about 0.25\*,s),
 684 so it makes sense to defer it until it's known to be necessary.
 685 .PP
 686 Calibration is carried out separately, but in parallel,
 687 for the timer's passage-of-time measurement and cycle counter.
 688 Either or both of these calibrations can succeed or fail;
 689 if passage-of-time calibration fails,
 690 then cycle count calibration is impossible.
 691 .PP
 692 The benchmarking state must be calibrated differently
 693 for different kinds of timing loop;
 694 this is controlled by the flags passed as the
 695 .I f
 696 argument to
 697 .BR bench_calibrate .
 698 The main difference lies in whether the code to be measured
 699 is called
 700 .IR indirectly ,
 701 i.e., via a function pointer.
 702 Set
 703 .B BTF_INDIRECT
 704 if the code is to be called indirectly;
 705 leave this flag clear if the code is called directly.
 706 The
 707 .B bench_measure
 708 function always makes indirect calls;
 709 the
 710 .B BENCH_MEASURE
 711 macro does not itself make indirect calls.
 712 Usually, a program needs only one or the other;
 713 if both are necessary for some reason,
 714 the best approach is just to set up two benchmarking states
 715 sharing the same timer,
 716 and calibrate them separately.
 717 .PP
 718 When it completes,
 719 .B bench_calibrate
 720 sets flags in the benchmark state's
 721 .B f
 722 member:
 723 if passage-of-time calibration succeeded,
 724 .B BTF_TIMEOK
 725 is set;
 726 if cycle-count calibration succeeded,
 727 .B BTF_CYOK
 728 is set;
 729 and the flag
 730 .B BTF_CLB
 731 is set unconditionally,
 732 as a persistent indication that calibration has been attempted.
 733 .PP
 734 The
 735 .B bench_calibrate
 736 function returns zero if it successfully calibrated
 737 at least the passage-of-time measurement;
 738 otherwise, it returns \-1.
 739 If
 740 .B bench_calibrate
 741 is called for a second or subsequent time on the same benchmark state,
 742 it returns immediately,
 743 either returning 0 or \-1
 744 according to whether passage-of-time had previously been calibrated.
 745 .PP
 746 The
 747 .B BENCH_MEASURE
 748 macro measures the performance of a computation.
 749 It requires the declarations made by
 750 .B BENCH_MEASURE_DECLS
 751 to be in scope,
 752 ideally within an enclosing block
 753 (rather than at top-level,
 754 where they'll have static storage duration,
 755 and take longer to access).
 756 The macro's expansion is syntactically a statement head;
 757 see
 758 .BR control (3)
 759 for details about the underlying machinery.
 760 In more detail, the macro is invoked as
 761 .IP
 762 .nf
 763 .ta 2n
 764 .BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base )
 765 .BI "   " stmt
 766 .fi
 767 .PP
 768 The
 769 .I stmt
 770 can be any C statement;
 771 it should perform
 772 .B _bench_n
 773 iterations of the computation to be measured.
 774 (The variable
 775 .B _bench_n
 776 is declared as part of
 777 .B BENCH_MEASURE_DECLS
 778 and has type
 779 .BR "unsigned long" .)
 780 Before commencing measurement proper,
 781 the macro calls
 782 .BR bench_preflight ,
 783 described below,
 784 to check that everything is set up properly
 785 for measurements on the current thread;
 786 if this fails, then the macro sets
 787 .I rc
 788 to \-1.
 789 Otherwise, the macro executes
 790 .I stmt
 791 one or more times,
 792 with the objective of finding an iteration count
 793 .I n
 794 such that
 795 .I n
 796 iterations of the computation take more than
 797 .IB b ->target_s "" \fR/\(sr2
 798 seconds.
 799 If measurement fails,
 800 then
 801 .I rc
 802 is set to \-1;
 803 otherwise,
 804 .I rc
 805 is set to zero, and
 806 .BI * t_out
 807 is filled in with the measurement;
 808 .IB t_out ->n
 809 is set to
 810 .IR n "\ \(mu\ " base .
 811 .PP
 812 The
 813 .B BENCH_MEASURE_TAG
 814 macro works just like
 815 .B BENCH_MEASURE
 816 except that it takes an additional
 817 .I tag
 818 argument used to distinguish the internal labels
 819 used by the macro's implementation;
 820 this makes it possible to use
 821 .B BENCH_MEASURE_TAG
 822 as a component in more complex macros.
 823 See
 824 .BR control (3)
 825 for details about control-structure macros
 826 and the meaning and format of tags.
 827 .PP
 828 The function
 829 .B bench_measure
 830 is similar,
 831 except that it calls a
 832 .I benchmark function
 833 to perform the computation.
 834 A benchmark function has the signature
 835 .IP
 836 .BI "void " fn "(unsigned long " n ", void *" ctx );
 837 .PP
 838 When called, it should perform the operation to be measured
 839 .I n
 840 times.
 841 The
 842 .I ctx
 843 argument is a pointer passed into
 844 .B bench_measure
 845 for the benchmark function's own purposes.
 846 The
 847 .B bench_measure
 848 function returns zero on success,
 849 or \-1 on failure.
 850 Note that
 851 .B bench_measure
 852 invokes the benchmark indirectly,
 853 so the benchmark state should have been calibrated with
 854 .BR BTF_INDIRECT .
 855 .
 856 .SS Measurement utilities
 857 The following functions are primarily exported for the benefit of the
 858 .B BENCH_MEASURE
 859 macro,
 860 but are documented here in case they are useful.
 861 .PP
 862 The
 863 .B bench_preflight
 864 function prepares a benchmarking state for use.
 865 It checks that the timer is calibrated
 866 and suitable for measuring passage-of-time;
 867 it also calls the timer's
 868 .B preflight
 869 function to prepare it for measurements on the current thread.
 870 If these checks succeed, then
 871 .B bench_preflight
 872 returns zero;
 873 otherwise it returns \-1
 874 and the caller should not proceed with measurements.
 875 .PP
 876 The
 877 .B bench_adapt
 878 function is used to determine iteration counts.
 879 It is used in a loop such as the following.
 880 .IP
 881 .nf
 882 .ta 2n +2n
 883 .B "BENCH_TIMELOOP_DECLS;"
 884 .B "struct bench_timer *tm;"
 885 .B "struct bench_timing t;"
 886 .B "struct bench_state *b; double n = 1.0;"
 887 .IP
 888 .B "do {"
 889 .B "    BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })"
 890 .BI "           " "(do " _bench_n " iterations of some computation)" ;
 891 .B "} while (!bench_adapt(b, &n, &t));"
 892 .fi
 893 .PP
 894 On entry,
 895 .BI * n_inout
 896 should be the number of iterations performed by the previous step,
 897 and
 898 .BI * t
 899 the resulting time;
 900 the
 901 .B BTF_TIMEOK
 902 flag must be set in
 903 .IB t ->f \fR.
 904 If the timing is sufficient \(en if
 905 .IR t\fB->t "\ \*(>=\ " b \fB->target_s\fR/\(sr2
 906 \(en then
 907 .B bench_adapt
 908 returns a nonzero value to indicate that measurement is complete.
 909 Otherwise, it sets
 910 .BI * n_inout
 911 to a new, larger iteration count
 912 and returns zero to indicate that a further pass is necessary.
 913 .PP
 914 The
 915 .B bench_adjust
 916 function adjusts a raw timing,
 917 as captured by
 918 .BR BENCH_TIMELOOP_TAG ,
 919 according to the calibration data captured in
 920 .IR b .
 921 On exit, the timing data is updated,
 922 and
 923 .IB t ->n
 924 is set to the product
 925 .IR n "\ \(mu\ " base .
 926 .
 927 .SS Reporting results
 928 The
 929 .B bench_report
 930 function formats a measurement result
 931 into a human-readable string.
 932 The function writes its output using the
 933 generalized output formatting operations
 934 .I gops
 935 and output pointer
 936 .IR go ;
 937 see
 938 .BR gprintf (3)
 939 for details on generalized output formatting.
 940 The
 941 .I unit
 942 argument describes the unit of activity being measured:
 943 .TP
 944 .B BTU_OP
 945 counts operations of some unspecified nature, while
 946 .TP
 947 .B BTU_BYTE
 948 counts a number of bytes processed.
 949 .PP
 950 These are presented differently
 951 \(em in particular,
 952 quantities of bytes are expressed using binary scaling rather than decimal.
 953 The timing to report is given by the
 954 .I t
 955 argument;
 956 .IB t ->n
 957 gives the number of units processed.
 958 .
 959 .\"--------------------------------------------------------------------------
 960 .SH EXAMPLE
 961 .
 962 The following macros offer a fairly simple example of
 963 how the benchmarking functions and macros can be used.
 964 .VS
 965 .ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n
 966 #define BENCHMARK_DECLS                                 \e
 967         struct bench_timing _bmark_t;                           \e
 968         int _bmark_rc;                          \e
 969         BENCH_MEASURE_DECLS
 970 .VP
 971 #define BENCHMARK_TAG(tag, b, unit, base)                                       \e
 972         MC_BEFORE(tag##__benchmark_before, { fflush(stdout); })                         \e
 973         MC_AFTER(tag##__benchmark_after, {                              \e
 974                 if (_bmark_rc)                  \e
 975                         printf(": FAILED\en");          \e
 976                 else {                  \e
 977                         fputs(": ", stdout);            \e
 978                         bench_report(&file_printops, stdout, (unit), &_bmark_t);        \e
 979                         putchar('\en');         \e
 980                 }                       \e
 981         })                              \e
 982         BENCH_MEASURE_TAG(tag##__benchmark_measure,                             \e
 983                                 (b), _bmark_rc, &_bmark_t, (base))
 984 #define BENCHMARK(b, unit, base)                                        \e
 985         BENCHMARK_TAG(bench, b, unit, base)
 986 .VE
 987
 988 .\"--------------------------------------------------------------------------
 989 .SH "SEE ALSO"
 990 .
 991 .BR control (3),
 992 .BR macros (3),
 993 .BR tvec-bench (3),
 994 .BR mLib (3).
 995 .
 996 .\"--------------------------------------------------------------------------
 997 .SH AUTHOR
 998 .
 999 Mark Wooding, <mdw@distorted.org.uk>
1000 .
1001 .\"----- That's all, folks --------------------------------------------------