chiark - git - mdw - mLib/blob - test/bench.3

   1 .\" -*-nroff-*-
   2 .ie t .ds , \h'\w'\ 'u/2u'
   3 .el .ds , \ \"
   4 .TH bench 3 "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
   5 .\" @bench_createtimer
   6 .\" @bench_init
   7 .\" @bench_destroy
   8 .\" @bench_calibrate
   9 .\" @bench_measure
  10 .
  11 .SH SYNOPSIS
  12 .nf
  13 .B "#include <mLib/bench.h>"
  14 .PP
  15 .ta 2n
  16 .B "struct bench_time {"
  17 .B "    unsigned f;"
  18 .B "    kludge64 s;"
  19 .B "    uint32 ns;"
  20 .B "    kludge64 cy;"
  21 .B "};"
  22 .PP
  23 .B "struct bench_timing {"
  24 .B "    unsigned f;"
  25 .B "    double n;"
  26 .B "    double t;"
  27 .B "    double cy;"
  28 .B "};"
  29 .PP
  30 .B "struct bench_timerops {"
  31 .BI "   void (*describe)(struct bench_timer *" bt ", dstr *" d );
  32 .BI "   void (*now)(struct bench_timer *" bt ", struct bench_time *" t_out );
  33 .BI "   void (*destroy)(struct bench_timer *" bt );
  34 .B "};"
  35 .B "struct bench_timer {"
  36 .B "    const struct bench_timerops *ops;"
  37 .B "};"
  38 .PP
  39 .B "struct bench_state {"
  40 .B "    unsigned f;"
  41 .B "    double target_s;"
  42 .B "    ..."
  43 .B "};"
  44 .PP
  45 .BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
  46 .PP
  47 .B "#define BTF_TIMEOK ..."
  48 .B "#define BTF_CYOK ..."
  49 .B "#define BTF_CLB ..."
  50 .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
  51 .PP
  52 .BI "struct bench_timer *bench_createtimer(const char *" config );
  53 .PP
  54 .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
  55 .BI "void bench_destroy(struct bench_state *" b );
  56 .BI "int bench_calibrate(struct bench_state *" b );
  57 .ta \w'\fBint bench_measure('u
  58 .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
  59 .BI "   double " base ", bench_fn *" fn ", void *" ctx );
  60 .fi
  61 .
  62 .SH DESCRIPTION
  63 The header file
  64 .B "<mLib/bench.h>"
  65 provides declarations and definitions
  66 for performing low-level benchmarks.
  67 .PP
  68 The `main event' is
  69 .BR bench_measure .
  70 This function will be described in detail later,
  71 but, in brief,
  72 it calls a caller-provided function,
  73 instructing it to run adaptively chosen numbers of iterations,
  74 in order to get a reasonably reliable measurement of its running time,
  75 and then reports its results by filling in a structure.
  76 .PP
  77 With understanding this function as our objective,
  78 we must examine all of the pieces involved in making it work.
  79 .
  80 .SS Timers in general
  81 A
  82 .I timer
  83 is a gadget which is capable of reporting the current time,
  84 in seconds (ideally precise to tiny fractions of a second),
  85 and/or in CPU cycles.
  86 A timer is represented by a pointer to an object of type
  87 .BR "struct bench_timer" .
  88 This structure has a single member,
  89 .BR ops ,
  90 pointing to a
  91 .BR "struct bench_timerops" ,
  92 which is a table of function pointers;
  93 typically, a timer has more data following this,
  94 but this fact is not exposed to applications.
  95 .PP
  96 The function pointers in
  97 .B "struct bench_timerops"
  98 are as follows.
  99 The first argument,
 100 named
 101 .IR tm ,
 102 must always point to the timer object itself.
 103 .TP
 104 .IB tm ->ops->describe( tm ", " d )
 105 Write a description of the timer to the dynamic string
 106 .IR d .
 107 .TP
 108 .IB tm ->ops->now( tm ", " t_out )
 109 Store the current time in
 110 .IR t_out .
 111 The
 112 .B struct bench_time
 113 used to represent the time reported by a timer
 114 is described in detail below.
 115 .TP
 116 .IB tm ->ops->destroy( tm )
 117 Destroy the timer,
 118 releasing all of the resources that it holds.
 119 .PP
 120 A time, as reported by a timer, is represented by the
 121 .BR "struct bench_time" .
 122 A passage-of-time measurement is stored in the
 123 .B s
 124 and
 125 .B ns
 126 members, holding seconds and nanoseconds respectively.
 127 (A timer need not have nanosecond precision.
 128 The exact interpretation of the time \(en
 129 e.g., whether it measures wallclock time,
 130 user-mode CPU time,
 131 or total thread CPU time \(en
 132 is a matter for the specific timer implementation.)
 133 A cycle count is stored in the
 134 .B cy
 135 member.
 136 The
 137 .B f
 138 member stores flags:
 139 .B BTF_TIMEOK
 140 is set if the passage-of-time measurement
 141 .B s
 142 and
 143 .B ns
 144 are valid; and
 145 .B BTF_CYOK
 146 is set if the cycle count
 147 .B cy
 148 is valid.
 149 Neither the time nor the cycle count need be measured
 150 relative to any particular origin.
 151 The mask
 152 .B BTF_ANY
 153 covers the
 154 .B BTF_TIMEOK
 155 and
 156 .B BTF_CYOK
 157 bits:
 158 hence,
 159 .IB f &BTF_ANY
 160 is nonzero (true)
 161 if the timer returned any valid timing information.
 162 .
 163 .SS The built-in timer
 164 The function
 165 .B bench_createtimer
 166 constructs and returns a timer.
 167 It takes a single argument,
 168 a string
 169 .IR config ,
 170 from which it reads configuration information.
 171 If
 172 .B bench_createtimer
 173 fails, it returns a null pointer.
 174 .PP
 175 The
 176 .I config
 177 pointer may safely be null,
 178 in which case a default configuration will be used.
 179 Applications
 180 .I should only
 181 set this pointer to a value supplied by a user,
 182 e.g., through a command-line argument,
 183 environment variable, or
 184 configuration file.
 185 .PP
 186 The built-in timer makes use of one or two
 187 .IR subtimers :
 188 a `clock' subtimer to measure the passage of time,
 189 and possibly a `cycle' subtimer to count CPU cycles.
 190 .PP
 191 The configuration string consists of a sequence of words
 192 separated by whitespace.
 193 There may be additional whitespace at the start and end of the string.
 194 The words recognized are as follows.
 195 .TP
 196 .B list
 197 Prints a list of the available clock and cycle subtimers
 198 to standard output.
 199 .TP
 200 .BI clock= t , ...
 201 Use the first of the listed clock subtimers
 202 to initialize successfully
 203 as the clock subtimer.
 204 If none of the subtimers can be initialized,
 205 then construction of the timer as a whole fails.
 206 .TP
 207 .BI cycle= t , ...
 208 Use the first of the listed subtimers
 209 to initialize successfully
 210 as the cycle subtimer.
 211 If none of the subtimers can be initialized,
 212 then construction of the timer as a whole fails.
 213 .PP
 214 The clock subtimers are as follows.
 215 Not all of them will be available on every platform.
 216 .TP
 217 .B posix-thread-cputime
 218 Measures the passage of time using
 219 .BR clock_gettime (2),
 220 specifying the
 221 .B CLOCK_\%THREAD_\%CPUTIME_\%ID
 222 clock.
 223 .TP
 224 .B stdc-clock
 225 Measures the passage of time using
 226 .BR clock (3).
 227 Since
 228 .BR clock (3)
 229 is part of the original ANSI\ C standard,
 230 this subtimer should always be available.
 231 However, it may produce unhelpful results
 232 if other threads are running.
 233 .PP
 234 The cycle subtimers are as follows.
 235 Not all of them will be available on every platform.
 236 .TP
 237 .B linux-perf-event
 238 Counts CPU cycles using the Linux-specific
 239 .BR perf_event_open (2)
 240 function to read the
 241 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
 242 counter.
 243 Only available on Linux.
 244 It will fail to initialize
 245 if access to performance counters is restricted,
 246 e.g., because the
 247 .B /proc/sys/kernel/perf_event_paranoid
 248 level is too high.
 249 .TP
 250 .B x86-rdtsc
 251 Counts CPU cycles using the x86
 252 .B rdtsc
 253 instruction.
 254 This instruction is not really suitable for performance measurement:
 255 it gives misleading results on CPUs with variable clock frequency.
 256 .TP
 257 .B null
 258 A dummy cycle counter,
 259 which will initialize successfully
 260 and then fail to report cycle counts.
 261 This is a reasonable fallback in many situations.
 262 .PP
 263 The built-in preference order for clock subtimers,
 264 from most to least preferred, is
 265 .B posix-thread-cputime
 266 followed by
 267 .BR stdc-clock .
 268 The built-in preference order for cycle subtimers,
 269 from most to least preferred, is
 270 .B linux-perf-event
 271 followed by
 272 .BR x86-rdtsc ,
 273 and then
 274 .BR null .
 275 .
 276 .SS The benchmark state
 277 A
 278 .I benchmark state
 279 tracks the information needed to measure performance of functions.
 280 It is represented by a
 281 .B struct bench_state
 282 structure.
 283 .PP
 284 The benchmark state is initialized by calling
 285 .BR bench_init ,
 286 passing the address of the state structure to be initialized,
 287 and a pointer to a timer.
 288 If
 289 .B bench_init
 290 is called with a non-null timer pointer,
 291 then it will not fail;
 292 the benchmark state will be initialized,
 293 and the function returns zero.
 294 If the timer pointer is null,
 295 then
 296 .B bench_init
 297 attempts to construct a timer for itself
 298 by calling
 299 .BR bench_createtimer .
 300 If this succeeds,
 301 then the benchmark state will be initialized,
 302 and the function returns zero.
 303 In both cases,
 304 the timer becomes owned by the benchmark state:
 305 calling
 306 .B bench_destroy
 307 on the benchmark state will destroy the timer.
 308 If
 309 .B bench_init
 310 is called with a null timer pointer,
 311 and its attempt to create a timer for itself fails,
 312 then
 313 .B bench_init
 314 returns \-1;
 315 the benchmark state is not initialized
 316 and can safely be discarded;
 317 calling
 319 .B bench_destroy
 320 on the unsuccessfully initialized benchmark state is safe and has no effect.
 321 .PP
 322 Calling
 323 .B bench_destroy
 324 on a benchmark state
 325 releases any resources it holds,
 326 most notably its timer, if any.
 327 .PP
 328 Although
 329 .B struct bench_state
 330 is defined in the header file,
 331 only two members are available for use by applications.
 332 .TP
 333 .B f
 334 A word containing flags.
 335 .TP
 336 .B target_s
 337 The target time for which to try to run a benchmark, in seconds.
 338 After initialization, this is set to 1.0,
 339 though applications can override it.
 340 .PP
 341 Before the benchmark state can be used in measurements,
 342 it must be
 343 .IR calibrated .
 344 This is performed by calling
 345 .B bench_calibrate
 346 on the benchmark state.
 347 Calibration takes a noticeable amount of time
 348 (currently about 0.25\*,s),
 349 so it makes sense to defer it until it's known to be necessary.
 350 .PP
 351 Calibration is carried out separately, but in parallel,
 352 for the timer's passage-of-time measurement and cycle counter.
 353 Either or both of these calibrations can succeed or fail;
 354 if passage-of-time calibration fails,
 355 then cycle count calibration is impossible.
 356 .PP
 357 When it completes,
 358 .B bench_calibrate
 359 sets flags in the benchmark state's
 360 .B f
 361 member:
 362 if passage-of-time calibration succeeded,
 363 .B BTF_TIMEOK
 364 is set;
 365 if cycle-count calibration succeeded,
 366 .B BTF_CYOK
 367 is set;
 368 and the flag
 369 .B BTF_CLB
 370 is set unconditionally,
 371 as a persistent indication that calibration has been attempted.
 372 .PP
 373 The
 374 .B bench_calibrate
 375 function returns zero if it successfully calibrated
 376 at least the passage-of-time measurement;
 377 otherwise, it returns \-1.
 378 If
 379 .B bench_calibrate
 380 is called for a second or subsequent time on the same benchmark state,
 381 it returns immediately,
 382 either returning 0 or \-1
 383 according to whether passage-of-time had previously been calibrated.
 384 .
 385 .SS Timing functions
 386 A
 387 .I benchmark function
 388 has the signature
 389 .IP
 390 .BI "void " fn "(unsigned long " n ", void *" ctx );
 391 .PP
 392 When called, it should perform the operation to be measured
 393 .I n
 394 times.
 395 The
 396 .I ctx
 397 argument is a pointer passed into
 398 .B bench_measure
 399 for the benchmark function's own purposes.
 400 .PP
 401 The function
 402 .B bench_measure
 403 receives five arguments.
 404 .TP
 405 .I b
 406 points to the benchmark state to be used.
 407 .TP
 408 .I t_out
 409 is the address of a
 410 .B "struct bench_timing"
 411 in which the measurement should be left.
 412 This structure is described below.
 413 .TP
 414 .I base
 415 is a count of the number of operations performed
 416 by each iteration of the benchmark function.
 417 .TP
 418 .I fn
 419 is a benchmark function, described above.
 420 .TP
 421 .I ctx
 422 is a pointer to be passed to the benchmark function.
 423 .B bench_measure
 424 does not interpret this pointer in any way.
 425 .PP
 426 The
 427 .B bench_measure
 428 function calls its benchmark function repeatedly
 429 with different iteration counts
 430 .IR n ,
 431 with the objective that the call take approximately
 432 .B target_s
 433 seconds, as established in the benchmark state.
 434 (Currently, if
 435 .B target_s
 436 holds the value
 437 .IR t ,
 438 then
 439 .B bench_measure
 440 is satisfied when a call takes at least
 441 .IR t /\(sr2\*,s.)
 442 Once the function finds a satisfactory number of iterations,
 443 it stores the results in
 444 .BI * t_out \fR.
 445 If measurement succeeds, then
 446 .B bench_measure
 447 returns zero.
 448 If it fails \(en
 449 most likely because the timer failed \(en
 450 then it returns \-1.
 451 .PP
 452 A
 453 .B bench_timing
 454 structure reports the outcome of a successful measurement.
 455 It has four members.
 456 .TP
 457 .B f
 458 A flags word.
 459 .B BTF_TIMEOK
 460 is set if the passage-of-time measurement in
 461 .B t
 462 is valid;
 463 .B BTF_CYOK
 464 is set if the cycle count in
 465 .B cy
 466 is valid.
 467 .TP
 468 .B n
 469 The number of iterations performed by the benchmark function
 470 on its satisfactory run,
 471 multiplied by
 472 .IR base .
 473 .TP
 474 .B t
 475 The time taken for the satisfactory run of the benchmark function,
 476 in seconds.
 477 Only valid if
 478 .B BTF_TIMEOK
 479 is set in
 480 .BR f .
 481 .TP
 482 .B cy
 483 The number of CPU cycles used
 484 in the satisfactory run of the benchmark function.
 486 Only valid if
 487 .B BTF_CYOK
 488 is set in
 489 .BR f .
 490 .
 491 .SH "SEE ALSO"
 492 .BR mLib (3).
 493 .
 494 .SH AUTHOR
 495 Mark Wooding, <mdw@distorted.org.uk>