chiark - git - mdw - mLib/blob - utils/fltfmt.3.in

   1 .\" -*-nroff-*-
   2 .\"
   3 .\" Manual for floating-point format conversions
   4 .\"
   5 .\" (c) 2024 Straylight/Edgeware
   6 .\"
   7 .
   8 .\"----- Licensing notice ---------------------------------------------------
   9 .\"
  10 .\" This file is part of the mLib utilities library.
  11 .\"
  12 .\" mLib is free software: you can redistribute it and/or modify it under
  13 .\" the terms of the GNU Library General Public License as published by
  14 .\" the Free Software Foundation; either version 2 of the License, or (at
  15 .\" your option) any later version.
  16 .\"
  17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
  18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19 .\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20 .\" License for more details.
  21 .\"
  22 .\" You should have received a copy of the GNU Library General Public
  23 .\" License along with mLib.  If not, write to the Free Software
  24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 .\" USA.
  26 .
  27 .\"--------------------------------------------------------------------------
  28 .so ../defs.man \" @@@PRE@@@
  29 .
  30 .\"--------------------------------------------------------------------------
  31 .TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
  32 .\" @FLTERR_OK
  33 .\" @FLTERR_INVAL
  34 .\" @FLTERR_INEXACT
  35 .\" @FLTERR_UFLOW
  36 .\" @FLTERR_OFLOW
  37 .\" @FLTERR_REPR
  38 .\" @FLTERR_ALLERRS
  39 .
  40 .\" @FRPF_LOW
  41 .\" @FRPF_HALF
  42 .\" @FRPF_ODD
  43 .\" @FRPF_NEG
  44 .\" @FRPMASK_LOW
  45 .\" @FRPMASK_HALF
  46 .\" @FRPMASK_ODD
  47 .\" @FRPMASK_NEG
  48 .\" @FRPMASK_INEXACT
  49 .\" @FRPMASK_NEAR
  50 .\" @FLTRND_ZERO
  51 .\" @FLTRND_PROJINF
  52 .\" @FLTRND_NEGINF
  53 .\" @FLTRND_POSINF
  54 .\" @FLTRND_EVEN
  55 .\" @FLTRND_ODD
  56 .\" @FLTRND_NEAREVEN
  57 .\" @FLTRND_NEARODD
  58 .\" @FLTRND_NEARZERO
  59 .\" @FLTRND_NEARINF
  60 .\" @FLTRND_NEARNEG
  61 .\" @FLTRND_NEARPOS
  62 .
  63 .\" @FLTF_NEG
  64 .\" @FLTF_INF
  65 .\" @FLTF_QNAN
  66 .\" @FLTF_SNAN
  67 .\" @FLTF_ZERO
  68 .\" @FLTF_NANMASK
  69 .\" @FLOATBITS_INIT
  70 .\" @fltfmt_initbits
  71 .\" @fltfmt_freebits
  72 .\" @fltfmt_allocfrac
  73 .\" @fltfmt_copybits
  74 .\" @fltfmt_round
  75 .
  76 .\" @FLTIF_HIDDEN
  77 .\" @fltfmt_f16
  78 .\" @fltfmt_f32
  79 .\" @fltfmt_f64
  80 .\" @fltfmt_f128
  81 .\" @fltfmt_mini
  82 .\" @fltfmt_bf16
  83 .\" @fltfmt_idblext80
  84 .
  85 .\" @fltfmt_encieee
  86 .\" @fltfmt_encf16
  87 .\" @fltfmt_encf32
  88 .\" @fltfmt_encf64
  89 .\" @fltfmt_encf128
  90 .\" @fltfmt_encmini
  91 .\" @fltfmt_encbf16
  92 .\" @fltfmt_encidblext80
  93 .\" @fltfmt_decieee
  94 .\" @fltfmt_decf16
  95 .\" @fltfmt_decf32
  96 .\" @fltfmt_decf64
  97 .\" @fltfmt_decf128
  98 .\" @fltfmt_decmini
  99 .\" @fltfmt_decbf16
 100 .\" @fltfmt_decidblext80
 101 .
 102 .\" @fltfmt_encflt
 103 .\" @fltfmt_encdbl
 104 .\" @fltfmt_encldbl
 105 .\" @fltfmt_decflt
 106 .\" @fltfmt_decdbl
 107 .\" @fltfmt_decldbl
 108 .
 109 .\" @fltfmt_flttof32l
 110 .\" @fltfmt_flttof32b
 111 .\" @fltfmt_dbltof64l
 112 .\" @fltfmt_dbltof64b
 113 .\" @fltfmt_f32ltoflt
 114 .\" @fltfmt_f32btoflt
 115 .\" @fltfmt_f64ltodbl
 116 .\" @fltfmt_f64btodbl
 117 .
 118 .\"--------------------------------------------------------------------------
 119 .SH NAME
 120 fltfmt \- floating-point format conversions
 121 .
 122 .\"--------------------------------------------------------------------------
 123 .SH SYNOPSIS
 124 .
 125 .nf
 126 .B "#define FLTERR_OK 0"
 127 .B "#define FLTERR_INVAL ..."
 128 .B "#define FLTERR_INEXACT ..."
 129 .B "#define FLTERR_UFLOW ..."
 130 .B "#define FLTERR_OFLOW ..."
 131 .B "#define FLTERR_REPR ..."
 132 .B "#define FLTERR_ALLERRS ..."
 133 .PP
 134 .ta 40n
 135 .B "#define FRPF_LOW 1u"
 136 .B "#define FRPF_HALF 2u"
 137 .B "#define FRPF_ODD 4u"
 138 .B "#define FRPF_NEG 8u"
 139 .B "#define FRPMASK_LOW 0xaaaau"
 140 .B "#define FRPMASK_HALF 0xccccu"
 141 .B "#define FRPMASK_ODD 0xf0f0u"
 142 .B "#define FRPMASK_NEG 0xff00u"
 143 .B "#define FRPMASK_INEXACT ... /* LOW | HALF */"
 144 .BI "unsigned FRPMASK_NEAR(unsigned " dir ");   /* HALF&(LOW | " dir ") */"
 145 .B "#define FLTRND_ZERO ...     /* 0 */"
 146 .B "#define FLTRND_PROJINF ...  /* INEXACT */"
 147 .B "#define FLTRND_NEGINF ...   /* INEXACT&NEG */"
 148 .B "#define FLTRND_POSINF ...   /* INEXACT&~NEG */"
 149 .B "#define FLTRND_EVEN ...     /* INEXACT&ODD */"
 150 .B "#define FLTRND_ODD ...      /* INEXACT&~ODD */"
 151 .B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */"
 152 .B "#define FLTRND_NEARODD ...  /* HALF&(LOW | ~ODD) */"
 153 .B "#define FLTRND_NEARZERO ... /* HALF&LOW */"
 154 .B "#define FLTRND_NEARINF ...  /* HALF */"
 155 .B "#define FLTRND_NEARNEG ...  /* HALF&(LOW | NEG) */"
 156 .B "#define FLTRND_NEARPOS ...  /* HALF&(LOW | ~NEG) */"
 157 .PP
 158 .ta 2n
 159 .B "#define FLTF_NEG ..."
 160 .B "#define FLTF_INF ..."
 161 .B "#define FLTF_QNAN ..."
 162 .B "#define FLTF_SNAN ..."
 163 .B "#define FLTF_ZERO ..."
 164 .B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
 165 .B "struct floatbits {"
 166 .B "    unsigned f;"
 167 .B "    int exp;"
 168 .B "    arena *a;"
 169 .B "    uint32 *frac;"
 170 .B "    unsigned n, fracsz;"
 171 .B "};"
 172 .B "#define FLOATBITS_INIT { ...\& };"
 173 .PP
 174 .BI "void fltfmt_initbits(struct floatbits *" x );
 175 .BI "void fltfmt_freebits(struct floatbits *" x );
 176 .BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
 177 .ta \w'\fBvoid fltfmt_copybits('u
 178 .BI "void fltfmt_copybits(struct floatbits *" z_out ,
 179 .BI "   const struct floatbits *" x );
 180 .ta \w'\fBunsigned fltfmt_round('u
 181 .BI "unsigned fltfmt_round(struct floatbits *" z_out ,
 182 .BI "   const struct floatbits *" x ,
 183 .BI "   unsigned " r ", unsigned " n );
 184 .PP
 185 .
 186 .ta 2n
 187 .B "#define FLTIF_HIDDEN ..."
 188 .B "struct fltfmt_ieeefmt {"
 189 .B "    unsigned f;"
 190 .B "    unsigned expwd;"
 191 .B "    unsigned prec;"
 192 .B "};"
 193 .B "const struct fltfmt_ieeefmt fltfmt_f16;"
 194 .B "const struct fltfmt_ieeefmt fltfmt_f32;"
 195 .B "const struct fltfmt_ieeefmt fltfmt_f64;"
 196 .B "const struct fltfmt_ieeefmt fltfmt_f128;"
 197 .B "const struct fltfmt_ieeefmt fltfmt_mini;"
 198 .B "const struct fltfmt_ieeefmt fltfmt_bf16;"
 199 .B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
 200 .PP
 201 .ta \w'\fBunsigned fltfmt_encieee('u
 202 .BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
 203 .BI "   uint32 *" z ", const struct floatbits *" x ,
 204 .BI "   unsigned " r ", unsigned " errmask );
 205 .ta \w'\fBunsigned fltfmt_encf16('u
 206 .BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
 207 .BI "   unsigned " r ", unsigned " errmask );
 208 .ta \w'\fBunsigned fltfmt_encf32('u
 209 .BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
 210 .BI "   unsigned " r ", unsigned " errmask );
 211 .ta \w'\fBunsigned fltfmt_encf64('u
 212 .BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
 213 .BI "   unsigned " r ", unsigned " errmask );
 214 .ta \w'\fBunsigned fltfmt_encf128('u
 215 .BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
 216 .BI "   unsigned " r ", unsigned " errmask );
 217 .ta \w'\fBunsigned fltfmt_encmini('u
 218 .BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
 219 .BI "   unsigned " r ", unsigned " errmask );
 220 .ta \w'\fBunsigned fltfmt_encbf16('u
 221 .BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
 222 .BI "   unsigned " r ", unsigned " errmask );
 223 .ta \w'\fBunsigned fltfmt_encidblext80('u
 224 .BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
 225 .BI "   const struct floatbits *" x ,
 226 .BI "   unsigned " r ", unsigned " errmask );
 227 .PP
 228 .ta \w'\fBunsigned fltfmt_decieee('u
 229 .BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
 230 .BI "   struct floatbits *" z_out ", const uint32 *" x );
 231 .BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
 232 .BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
 233 .BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
 234 .BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
 235 .BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
 236 .BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
 237 .ta \w'\fBunsigned fltfmt_decidblext80('u
 238 .BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
 239 .BI "   uint16 " se ", kludge64 " m );
 240 .PP
 241 .ta \w'\fBunsigned fltfmt_encflt('u
 242 .BI "unsigned fltfmt_encflt(float *" z_out ,
 243 .BI "   const struct floatbits *" x ", unsigned " r );
 244 .ta \w'\fBunsigned fltfmt_encdbl('u
 245 .BI "unsigned fltfmt_encdbl(double *" z_out ,
 246 .BI "   const struct floatbits *" x ", unsigned " r );
 247 .ta \w'\fBunsigned fltfmt_encldbl('u
 248 .BI "unsigned fltfmt_encldbl(long double *" z_out ,
 249 .BI "   const struct floatbits *" x ", unsigned " r );
 250 .ta \w'\fBunsigned fltfmt_decflt('u
 251 .BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
 252 .BI "   float *" x ", unsigned " r );
 253 .ta \w'\fBunsigned fltfmt_decdbl('u
 254 .BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
 255 .BI "   double *" x ", unsigned " r );
 256 .ta \w'\fBunsigned fltfmt_decldbl('u
 257 .BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
 258 .BI "   long double *" x ", unsigned " r );
 259 .PP
 260 .BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
 261 .BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
 262 .BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
 263 .BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
 264 .BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
 265 .BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
 266 .BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
 267 .BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
 268 .
 269 .\"--------------------------------------------------------------------------
 270 .SH DESCRIPTION
 271 .
 272 The
 273 .B "<mLib/fltfmt.h>"
 274 header file defines structures, macros, and functions
 275 for converting floating-point values between various formats,
 276 including the native floating-point formats
 277 and IEEE\ 754 and related formats.
 278 .
 279 .SS Error conditions
 280 Most of the functions in this module return an unsigned integer.
 281 A return value of zero means that no error occurred;
 282 set bits indicate various error conditions.
 283 .TP
 284 .B FLTERR_INVAL
 285 A binary input to be decoded contained an invalid bit pattern,
 286 e.g., an unnormalized input value with a nonminimal exponent.
 287 The function will have produced a reasonable output anyway,
 288 but the original value will not be recoverable from the result.
 289 .TP
 290 .B FLTERR_INEXACT
 291 The conversion was inexact.
 292 Converting the output back into the format of the input
 293 may not reproduce the original input value.
 294 This error flag is sometimes set conservatively.
 295 .TP
 296 .B FLTERR_UFLOW
 297 The conversion underflowed:
 298 a nonzero input was too tiny (in absolute value) to represent,
 299 and a zero result was returned.
 300 .TP
 301 .B FLTERR_OFLOW
 302 The conversion overflowed:
 303 a finite input was too huge (in absolute value) to represent,
 304 and either the appropriately signed infinity
 305 or largest-magnitude finite value
 306 was returned, determined by the requested rounding mode.
 307 .TP
 308 .B FLTERR_REPR
 309 The output format failed entirely to represent the input value.
 310 The result is zero if the input was a NaN,
 311 or the appropriately signed largest-magnitude finite value
 312 if the input was an infinity.
 313 .
 314 .SS Rounding modes
 315 The rounding system works as follows.
 316 There are four
 317 .I rounding predicates
 318 considered when a rounding decision is taken.
 319 These are determined from the unrounded input value
 320 .IR x ,
 321 and the two nearest rounded values
 322 .RI | u "|\ \*(<=\ |" x |
 323 and
 324 .RI | v "|\ >\ |" x |.
 325 The predicates are as follows.
 326 .TP
 327 .B FRPF_LOW
 328 If
 329 .IR x "\ \*(/=\ " u
 330 and
 331 .IR x "\ \*(/=\ (" u "\ +\ " v )/2,
 332 i.e.,
 333 .I x
 334 is neither equal to a rounded value,
 335 nor exactly halfway between two rounded values.
 336 This predicate is sometimes referred to as a `sticky bit'.
 337 .TP
 338 .B FRPF_HALF
 339 If
 340 .RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
 341 i.e.,
 342 .I x
 343 is halfway or more towards its larger rounded neighbour.
 344 .TP
 345 .B FRPF_ODD
 346 If the least significant digit of
 347 .I u
 348 is odd.
 349 In binary floating-point formats,
 350 this is just the least significant bit of
 351 .IR u .
 352 .TP
 353 .B FRPF_NEG
 354 If
 355 .I x
 356 is negative.
 357 .PP
 358 These four predicates are packed into a four-bit mask value
 359 .I rf
 360 between 0 and 15.
 361 A
 362 .I rounding mode
 363 is simply a 16-bit mask:
 364 if bit
 365 .I rf
 366 of the rounding-mode mask is set,
 367 then
 368 .I x
 369 is rounded to
 370 .IR v ;
 371 otherwise it is rounded to
 372 .IR u .
 373 That is, the rounding-mode mask is essentially a truth table.
 374 Rounding modes with
 375 .I set
 376 bits corresponding to situations where both
 377 .B FRPF_LOW
 378 and
 379 .B FRPF_HALF
 380 are false,
 381 i.e., where
 382 .I x
 383 is already a rounded value,
 384 are forbidden.
 385 .PP
 386 Some useful machinery is provided
 387 for constructing rounding-mode masks.
 388 .BR FRPMASK_LOW ,
 389 .BR FRPMASK_HALF ,
 390 .BR FRPMASK_ODD ,
 391 and
 392 .B FRPMASK_NEG
 393 are masks with set bits corresponding to their respective predicates.
 394 Bitwise boolean logic can be applied to these masks
 395 in order to calculate the masks corresponding to
 396 the same logical expression applied to the individual predicates.
 397 .BR FRPMASK_INEXACT " holds if"
 398 .B LOW
 399 or
 400 .B HALF
 401 holds;
 402 i.e., if
 403 .IR x "\ \*(/=\ " u ;
 404 as mentioned above, only these bits may be set
 405 in a valid rounding-mode mask.
 406 .BI FRPMASK_NEAR( dir )
 407 is the mask for rounding to nearest with ties broken according to
 408 .IR dir ,
 409 which is another rounding-mode mask.
 410 The complete set of predefined masks is listed above in the synopsis,
 411 together with their description in terms of the basic predicates.
 412 The usual IEEE rounding mode is
 413 round-to-nearest/ties-to-even,
 414 denoted
 415 .BR FLTRND_NEAREVEN .
 416 This is likely a good option
 417 if there is no compelling argument for a different specific choice.
 418 .
 419 .SS Direct conversions
 420 The functions
 421 .B fltfmt_flttof32l
 422 and
 423 .B fltfmt_flttof32b
 424 convert a
 425 .B float
 426 argument to an IEEE\ 754 Binary32 value
 427 in little- or big-endian byte order, respectively;
 428 similarly
 429 .B fltfmt_dbltof64l
 430 and
 431 .B fltfmt_dbltof64b
 432 convert a
 433 .B double
 434 argument to an IEEE\ 754 Binary64 value
 435 in little- or big-endian byte order, respectively.
 436 The value to convert is given as
 437 .I x
 438 and the result is written at the address
 439 .IR p .
 440 .PP
 441 The functions
 442 .B fltfmt_f32ltoflt
 443 and
 444 .B fltfmt_f32btoflt
 445 convert an IEEE\ 754 Binary32 value,
 446 in little- or big-endian byte order, respectively,
 447 to a
 448 .BR float ;
 449 similarly,
 450 .B fltfmt_f64ltodbl
 451 and
 452 .B fltfmt_f64btodbl
 453 convert an IEEE\ 754 Binary64 value,
 454 in little- or big-endian byte order, respectively,
 455 to a
 456 .BR double .
 457 The value to convert is read from address
 458 .I p
 459 and the result is written to
 460 .RI * z_out \fR.
 461 .PP
 462 All of these functions additionally take a rounding mode
 463 .I r
 464 which is applied if the conversion cannot be performed exactly,
 465 and return an error code as described above.
 466 .PP
 467 On many modern platforms, the
 468 .B float
 469 and
 470 .B double
 471 types are represented internally using the IEEE
 472 Binary32 and Binary64 formats,
 473 so these conversions are trivial, or nearly so.
 474 A complication arises on PA-RISC and older MIPS processors:
 475 see the descriptions of
 476 .B fltfmt_encieee
 477 and
 478 .B fltfmt_decdbl
 479 below for the details.
 480 .PP
 481 On other platforms,
 482 the conversion is decidedly nontrivial,
 483 and makes use of the machinery described below;
 484 this may also be useful for more complex conversions.
 485 .
 486 .SS The floatbits structure
 487 In order to avoid a combinatorial explosion in conversion operations,
 488 all the basic conversions involve,
 489 as source or target,
 490 a `common currency' format represented by the type
 491 .BR "struct floatbits" .
 492 .PP
 493 This structure consists of
 494 a set of flags
 495 .BR f ;
 496 a signed exponent
 497 .BR exp ;
 498 an
 499 .B arena
 500 pointer
 501 .BR a ;
 502 a pointer
 503 .B frac
 504 to a vector of
 505 .B uint32
 506 values;
 507 the length
 508 .B n
 509 of the
 510 .B frac
 511 vector; and
 512 the currently allocated size
 513 .B fracsz
 514 of the vector.
 515 Both
 516 .B n
 517 and
 518 .B fracsz
 519 count elements, not bytes.
 520 .PP
 521 Storage for
 522 .B frac
 523 comes from the arena
 524 .BR a .
 525 Only the first
 526 .B n
 527 words of
 528 .B frac
 529 are significant;
 530 .B frac[0]
 531 is the most significant word.
 532 The value represented by a
 533 .B struct floatbits
 534 is never changed by adding or removing zero-valued words
 535 at the end of the
 536 .B frac
 537 vector.
 538 It is always the case that
 539 .BR n "\ \*(<=\ " fracsz ;
 540 if
 541 .B fracsz
 542 is zero then
 543 .B frac
 544 may be a null pointer.
 545 .PP
 546 The interpretation of the
 547 .B exp
 548 and
 549 .B frac
 550 members depends on the flags set in
 551 .BR f .
 552 Apart from
 553 .BR FLTF_NEG ,
 554 the flags are
 555 .IR "mutually exclusive" :
 556 at most one flag may be set.
 557 .TP
 558 .B FLTF_NEG
 559 The value is negative.
 560 .TP
 561 .B FLTF_INF
 562 The value is positive or negative infinity.
 563 The
 564 .B exp
 565 and
 566 .B frac
 567 are ignored.
 568 .TP
 569 .BR FLTF_QNAN " and " FLTF_SNAN
 570 The value is a quiet or signalling not-a-number, respectively.
 571 The
 572 .B exp
 573 is ignored.
 574 The payload is stored in
 575 .BR frac ;
 576 the payload does not include the `quiet' bit.
 577 .TP
 578 .B FLTF_ZERO
 579 The number is zero.
 580 Negative zero is distinct from positive zero.
 581 The
 582 .B exp
 583 and
 584 .B frac
 585 are ignored.
 586 .IP "All non-sign bits clear"
 587 The value is a finite nonzero number.
 588 The
 589 .B frac
 590 holds the significand.
 591 The most significant significand bit must be set, so
 592 (a)\ the number must be nonzero, and
 593 (b)\ the significand is normalized.
 594 The significand is interpreted as a fraction
 595 .RI "1/2\ \*(<=\ " m "\ <\ 1."
 596 If
 597 .I e
 598 is the value of the
 599 .B exp
 600 member,
 601 and
 602 .I s
 603 is \-1 if
 604 .B FLTF_NEG
 605 is set
 606 or +1 if
 607 .B FLTF_NEG
 608 is clear,
 609 then the number represented is
 610 .IR s "\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
 611 .PP
 612 A
 613 .B struct floatbits
 614 can be initialized statically by
 615 .BR FLOATBITS_INIT ,
 616 or dynamically using the function
 617 .BR fltfmt_initbits .
 618 These are not quite the same:
 619 .B FLOATBITS_INIT
 620 initializes
 621 .B a
 622 to
 623 .BR &arena_stdlib ,
 624 while
 625 .B fltfmt_initbits
 626 sets it to the runtime value of
 627 .BR arena_global .
 628 With this exception,
 629 both forms of initialization set the value to (positive) zero;
 630 neither allocates any storage or other resources,
 631 leaving
 632 .B frac
 633 null.
 634 In this state, it is safe to modify the arena pointer
 635 .B a
 636 if the default initialization is unsatisfactory.
 637 .PP
 638 The
 639 .B fltfmt_allocfrac
 640 function is given a pointer
 641 .I x
 642 to a
 643 .B struct floatbits
 644 and a length
 645 .IR n :
 646 it ensures that there is enough storage at
 647 .IB x ->frac
 648 for at least
 649 .I n
 650 words:
 651 if the current size is too small,
 652 then any existing buffer is discarded and a new one allocated
 653 from the arena
 654 .IB x ->a \fR;
 655 any existing contents of the buffer are lost.
 656 On exit,
 657 .IB x ->n
 658 is set to
 659 .IR n .
 660 .PP
 661 The
 662 .B fltfmt_freebits
 663 function
 664 frees a
 665 .B struct floatbits
 666 structure, releasing the storage held by
 667 .BR frac .
 668 .PP
 669 The
 670 .B fltfmt_copybits
 671 function simply copies its input
 672 .I x
 673 to its output
 674 .IR z_out ;
 675 both must refer to initialized
 676 .B struct floatbits
 677 structures.
 678 If
 679 .I z_out
 680 and
 681 .I x
 682 are equal, then nothing happens.
 683 .PP
 684 Finally, the
 685 .B fltfmt_round
 686 function rounds the value in the
 687 .B struct floatbits
 688 structure
 689 .I x
 690 to
 691 .I n
 692 bits using the rounding mode
 693 .IR r ;
 694 the result is written to
 695 .IR z_out ;
 696 it is permitted for
 697 .I z_out
 698 to be equal to
 699 .IR x .
 700 If
 701 .I x
 702 is a zero or infinity,
 703 then the output is equal to the input,
 704 as if
 705 .B fltfmt_copybits
 706 had been called instead.
 707 If
 708 .I x
 709 is a NaN,
 710 then the payload is simply truncated to
 711 .I n
 712 bits, without regard to the rounding mode.
 713 Otherwise, the input is nonzero and finite;
 714 the significand is rounded to
 715 .I n
 716 bits according to the rounding mode.
 717 In all cases, the return value is
 718 zero if the output is equal to the input,
 719 or
 720 .B FLTERR_INEXACT
 721 if the rounded result is not equal to the input.
 722 .
 723 .SS IEEE and related formats
 724 An IEEE floating-point format is characterized by three parameters:
 725 the
 726 .I "exponent width"
 727 .IR w ,
 728 the
 729 .I "precision"
 730 .IR p ,
 731 and
 732 the
 733 .I "unit width"
 734 .IR h .
 735 .PP
 736 The encoded value consists of
 737 .IR p "\ +\ " w "\ +\ " h "\ \-\ 1"
 738 bits.
 739 This is divided, from the most significant bit downwards,
 740 into a
 741 .I "sign bit"
 742 .IR s ,
 743 a
 744 .IR w -bit
 745 .I "biased exponent"
 746 .IR e \*',
 747 a
 748 .IR h -bit
 749 .I "unit bit"
 750 .IR u ,
 751 and a
 752 .RI ( p "\ \-\ " h )-bit
 753 .I fraction
 754 .IR f .
 755 The
 756 .I "exponent bias"
 757 is
 758 .IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
 759 the true exponent
 760 .I e
 761 is calculated from the biased exponent by
 762 .IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
 763 The unit and fraction field are usually interpreted as denoting
 764 a significand
 765 .IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
 766 with
 767 .RI "0\ \*(<=\ " m "\ <\ 2."
 768 If
 769 .I h
 770 is zero,
 771 the value of the unit bit
 772 .I u
 773 is implied by the exponent as described below.
 774 The encoded value is interpreted as follows.
 775 .hP \*o
 776 If
 777 .IR e "\ =\ \-" e \*(us0\*(ue
 778 then the value is zero or a subnormal,
 779 with the value
 780 .RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e +1\*(se.
 781 In particular,
 782 if
 783 .IR m "\ =\ 0"
 784 then the value is positive or negative zero,
 785 according to the sign bit
 786 .IR s .
 787 If
 788 .I h
 789 is zero then
 790 .IR u "\ =\ 0;"
 791 if
 792 .I h
 793 is nonzero
 794 but
 795 .IR u "\ \*(/=\ 0"
 796 then the encoding is invalid:
 797 decoding returns
 798 .BR FLTERR_INVAL ,
 799 but the result will be as described.
 800 .hP \*o
 801 If
 802 .RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1"
 803 then the value is a (supposedly) normal number
 804 .RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
 805 If
 806 .I h
 807 is zero then
 808 .IR u "\ =\ 1;"
 809 if
 810 .I h
 811 is nonzero
 812 but
 813 .IR u "\ =\ 0"
 814 then the encoding is invalid:
 815 decoding returns
 816 .BR FLTERR_INVAL ,
 817 but the result will be as described.
 818 .hP \*o
 819 If
 820 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
 821 and
 822 .IR f "\ =\ 0"
 823 then the value is positive or negative infinity,
 824 according to the sign bit
 825 .IR s .
 826 If
 827 .I h
 828 is nonzero and
 829 .IR u "\ =\ 0"
 830 then the encoding is invalid:
 831 decoding returns
 832 .BR FLTERR_INVAL ,
 833 but the result will still be infinity.
 834 .hP \*o
 835 If
 836 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
 837 and
 838 .IR f "\ \*(/=\ 0"
 839 then the value is not-a-number (NaN).
 840 The most significant bit of
 841 .I f
 842 is the `quiet bit':
 843 if the bit is set, the value is a `quiet NaN';
 844 if the bit is clear, the value is a `signalling NaN'.
 845 (This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1;
 846 it has the advantage that a signalling NaN can be `quieted'
 847 by setting the most significant fraction bit;
 848 HP-PA and older MIPS processors use the opposite convention
 849 for distinguishing quiet and signalling NaNs,
 850 but a signalling NaN with all but the most significant
 851 fraction bit zero cannot be `quieted' by clearing the
 852 most significant bit, since the resulting encoding denotes
 853 an infinity, not a QNaN.)
 854 The remaining bits of
 855 .I f
 856 form the
 857 .IR payload .
 858 Positive and negative NaN values are distinguished,
 859 with sign determined by the sign bit.
 860 If
 861 .I h
 862 is nonzero and
 863 .IR u "\ =\ 0"
 864 then the encoding is invalid:
 865 decoding returns
 866 .BR FLTERR_INVAL ,
 867 but the result will still be a NaN;
 868 the unit bit does not affect the NaN payload.
 869 .PP
 870 An IEEE format is described by the type
 871 .BR "struct fltfmt_ieeefmt" .
 872 This has three members:
 873 .TP
 874 .B f
 875 A flags word.
 876 If
 877 .B FLTIF_HIDDEN
 878 is set, the format uses a `hidden bit' convention:
 879 in the notation above
 880 .IR h "\ =\ 0;"
 881 if the flag is clear,
 882 the format has an explicit unit bit, and
 883 .IR h "\ =\ 1."
 884 .TP
 885 .B expwd
 886 The exponent width;
 887 in the notation above this is
 888 .IR w .
 889 .TP
 890 .B prec
 891 The precision;
 892 in the notation above this is
 893 .IR p .
 894 .PP
 895 The
 896 .B fltfmt_encieee
 897 and
 898 .B fltfmt_decieee
 899 functions convert between IEEE and related formats
 900 and the internal
 901 .B struct floatbits
 902 representation.
 903 They respectively encode or decode an IEEE-format value,
 904 as described above,
 905 from a vector of
 906 .B uint32
 907 words,
 908 most-significant word first
 909 \(en so the sign bit is in the first word.
 910 For formats whose size is not a multiple of 32,
 911 the encoding is
 912 .IR right-aligned :
 913 the least significant bit of the fraction
 914 is in the least significant bit of the last word in the vector.
 915 .PP
 916 The
 917 .B fltfmt_encieee
 918 function encodes an IEEE-format value.
 919 The function is given five arguments:
 920 a pointer
 921 .I fmt
 922 to the IEEE format description,
 923 a pointer
 924 .I z
 925 to a sufficiently long vector of 32-bit words
 926 in which to store the encoded value,
 927 a pointer
 928 .I x
 929 to the
 930 .B struct floatbits
 931 holding the value to encode,
 932 a rounding mode
 933 .IR r ,
 934 and an error mask
 935 .IR errmask .
 936 If the input is a NaN,
 937 then the payload is truncated to fit
 938 regardless of the rounding mode,
 939 discarding the least significant bits;
 940 if the input is a finite value,
 941 then the significand is rounded to fit
 942 according to the requested rounding mode.
 943 If a signalling NaN ends up with all of its payload bits zero,
 944 as a result of truncation or otherwise,
 945 then the least-significant bit of the output payload is forced on
 946 in order to distinguish the result from an infinity.
 947 The possible errors are
 948 .B FLTERR_UFLOW
 949 if the value is unrepresentably tiny,
 950 .B FLTERR_OFLOW
 951 if the value is unrepresentably huge,
 952 and
 953 .B FLTERR_INEXACT
 954 if the encoding fails to preserve the input value exactly;
 955 hence
 956 .B FLTERR_INEXACT
 957 is set whenever
 958 .B FLTERR_OFLOW
 959 or
 960 .B FLTERR_UFLOW
 961 is set,
 962 or if bits are lost due to NaN-payload truncation or rounding.
 963 If, during encoding,
 964 an error is encountered,
 965 processing stops immediately
 966 unless the corresponding bit of
 967 .I errmask
 968 is set.
 969 .PP
 970 The
 971 .B fltfmt_decieee
 972 function decodes an IEEE-format value.
 973 The function is given three arguments:
 974 a pointer
 975 .I fmt
 976 to the IEEE format description,
 977 a pointer
 978 .I z_out
 979 to the initialized
 980 .B struct floatbits
 981 to fill in, and
 982 a pointer
 983 .I x
 984 to the IEEE-encoded value to decode,
 985 in a vector of 32-bit words as described above.
 986 The only error that can occur during decoding is
 987 .BR FLTERR_INVAL :
 988 as described above,
 989 this occurs in non-hidden-bit formats
 990 when the unit bit does not match that implied by the exponent;
 991 the result is returned anyway,
 992 with the unit bit interpreted as encoded in finite numbers,
 993 and discarded in infinities and NaNs.
 994 .PP
 995 A number of IEEE and IEEE-like formats are predefined:
 996 for format
 997 .IR fmt ,
 998 there is
 999 an IEEE format description, named
1000 .BI fltfmt_ fmt \fR,
1001 together with encoding and decoding functions, named
1002 .BI fltfmt_enc fmt
1003 and
1004 .BI fltfmt_dec fmt \fR;
1005 for the most part,
1006 these functions use more convenient types
1007 to hold encoded values.
1008 .TP
1009 .B "fltfmt_f16"
1010 The IEEE\ 754 Binary16 format, with
1011 .IR w "\ =\ 5,"
1012 .IR p "\ =\ 11,"
1013 and
1014 .IR h "\ =\ 0;"
1015 stored in a
1016 .BR uint16 .
1017 .TP
1018 .B "fltfmt_f32"
1019 The IEEE\ 754 Binary32 (`single precision') format, with
1020 .IR w "\ =\ 8,"
1021 .IR p "\ =\ 24,"
1022 and
1023 .IR h "\ =\ 0;"
1024 stored in a
1025 .BR uint32 .
1026 .TP
1027 .B "fltfmt_f64"
1028 The IEEE\ 754 Binary64 (`double precision') format, with
1029 .IR w "\ =\ 11,"
1030 .IR p "\ =\ 53,"
1031 and
1032 .IR h "\ =\ 0;"
1033 stored in a
1034 .B kludge64
1035 (see
1036 .BR bits (3)
1037 for details).
1038 .TP
1039 .B "fltfmt_f128"
1040 The IEEE\ 754 Binary128 (`quad precision') format, with
1041 .IR w "\ =\ 15,"
1042 .IR p "\ =\ 113,"
1043 and
1044 .IR h "\ =\ 0;"
1045 stored in a big-endian vector of
1046 .BR uint32 ,
1047 just as for the generic functions described above.
1048 .TP
1049 .B "fltfmt_mini"
1050 An eight-bit `minifloat' format, with
1051 .IR w "\ =\ 4,"
1052 .IR p "\ =\ 4,"
1053 and
1054 .IR h "\ =\ 0;"
1055 stored in an
1056 .BR octet .
1057 .TP
1058 .B "fltfmt_bf16"
1059 The Google `BFloat16' format, with
1060 .IR w "\ =\ 8,"
1061 .IR p "\ =\ 8,"
1062 and
1063 .IR h "\ =\ 0;"
1064 stored in a
1065 .BR uint16 .
1066 .TP
1067 .B "fltfmt_idblext80"
1068 The Intel 8087 80-bit `double extended' format, with
1069 .IR w "\ =\ 15,"
1070 .IR p "\ =\ 64,"
1071 and
1072 .IR h "\ =\ 1;"
1073 stored as a
1074 .B uint16
1075 holding the sign and exponent,
1076 and a
1077 .B kludge64
1078 holding the significand.
1079 .
1080 .SS Native formats
1081 There are also functions for converting between
1082 .B struct floatbits
1083 and the implementation's native floating-point types
1084 .B float
1085 (abbreviated
1086 .BR flt ),
1087 .B double
1088 (abbreviated
1089 .BR dbl ),
1090 and
1091 .B "long double"
1092 (abbreviated
1093 .BR ldbl ).
1094 .PP
1095 For each native type abbreviation
1096 .IR ty ,
1097 there are functions
1098 .BI fltfmt_enc ty
1099 and
1100 .BI fltfmt_dec ty \fR,
1101 which respectively convert the value held in
1102 .B struct floatbits
1103 to or from a value of the corresponding C type.
1104 (The functions acting on
1105 .B long double
1106 values are only available if the platform supports C99 or later.)
1107 .PP
1108 The
1109 .BI fltfmt_enc ty
1110 functions read an input value from a
1111 .B struct floatbits
1112 pointer
1113 .I x
1114 and store the encoded result through a pointer
1115 .I z_out
1116 to the appropriate C type;
1117 the function also receives a rounding mode
1118 .IR r ,
1119 but see below.
1120 The
1121 .BI fltfmt_dec ty
1122 functions are given an input value of the appropriate C type,
1123 and store the decoded result in a
1124 .B struct floatbits
1125 structure pointed to by
1126 .IR z_out ;
1127 again, the function also receives a rounding mode
1128 .IR r ,
1129 but see below.
1130 .PP
1131 These functions can use two different strategies for conversion.
1132 If the compile-time configuration step detects
1133 that the implementation is using
1134 a specific, supported format for a native type,
1135 then conversions involving the native type
1136 are performed using the existing machinery for that format.
1137 For example, if,
1138 as is in fact nearly universal on modern-ish systems,
1139 the
1140 .B double
1141 type uses the IEEE\ 754 Binary64 format,
1142 then
1143 .B fltfmt_encdbl
1144 and
1145 .B fltfmt_decdbl
1146 use the functions
1147 .B fltfmt_encf64
1148 and
1149 .B fltfmt_decf64
1150 described above for the conversion.
1151 This approach has the benefit that
1152 everything is done under the control of the
1153 .B fltfmt
1154 machinery,
1155 which can faithfully preserve signs of zero values,
1156 and NaN payloads.
1157 The error conditions are, for the most part, the same as for the
1158 .B fltfmt_encieee
1159 and
1160 .B fltfmt_decieee
1161 functions described above.
1162 The encoding functions have an additional source of inexactness
1163 on PA-RISC and older MIPS processors
1164 which use the reversed quiet/signalling NaN convention:
1165 a quiet NaN with an all-zero payload
1166 is not representable on such implementations
1167 (the encoding is an infinity instead);
1168 in this situation,
1169 the least significant payload bit is forced on,
1170 just as if the payload required truncation,
1171 and
1172 .B FLTERR_INEXACT
1173 is returned.
1174 .
1175 .\"--------------------------------------------------------------------------
1176 .SH "SEE ALSO"
1177 .
1178 .BR bits (3),
1179 .BR mLib (3).
1180 .
1181 .\"--------------------------------------------------------------------------
1182 .SH AUTHOR
1183 .
1184 Mark Wooding, <mdw@distorted.org.uk>
1185 .
1186 .\"----- That's all, folks --------------------------------------------------