chiark - git - mdw - mLib/blob - utils/fltfmt.3.in

   1 .\" -*-nroff-*-
   2 .\"
   3 .\" Manual for floating-point format conversions
   4 .\"
   5 .\" (c) 2024 Straylight/Edgeware
   6 .\"
   7 .
   8 .\"----- Licensing notice ---------------------------------------------------
   9 .\"
  10 .\" This file is part of the mLib utilities library.
  11 .\"
  12 .\" mLib is free software: you can redistribute it and/or modify it under
  13 .\" the terms of the GNU Library General Public License as published by
  14 .\" the Free Software Foundation; either version 2 of the License, or (at
  15 .\" your option) any later version.
  16 .\"
  17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
  18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19 .\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20 .\" License for more details.
  21 .\"
  22 .\" You should have received a copy of the GNU Library General Public
  23 .\" License along with mLib.  If not, write to the Free Software
  24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 .\" USA.
  26 .
  27 .\"--------------------------------------------------------------------------
  28 .so ../defs.man \" @@@PRE@@@
  29 .
  30 .\"--------------------------------------------------------------------------
  31 .TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
  32 .\" @FLTERR_OK
  33 .\" @FLTERR_INVAL
  34 .\" @FLTERR_INEXACT
  35 .\" @FLTERR_UFLOW
  36 .\" @FLTERR_OFLOW
  37 .\" @FLTERR_REPR
  38 .\" @FLTERR_ALLERRS
  39 .
  40 .\" @FRPF_LOW
  41 .\" @FRPF_HALF
  42 .\" @FRPF_ODD
  43 .\" @FRPF_NEG
  44 .\" @FRPMASK_LOW
  45 .\" @FRPMASK_HALF
  46 .\" @FRPMASK_ODD
  47 .\" @FRPMASK_NEG
  48 .\" @FRPMASK_INEXACT
  49 .\" @FRPMASK_NEAR
  50 .\" @FLTRND_ZERO
  51 .\" @FLTRND_PROJINF
  52 .\" @FLTRND_NEGINF
  53 .\" @FLTRND_POSINF
  54 .\" @FLTRND_EVEN
  55 .\" @FLTRND_ODD
  56 .\" @FLTRND_NEAREVEN
  57 .\" @FLTRND_NEARODD
  58 .\" @FLTRND_NEARZERO
  59 .\" @FLTRND_NEARINF
  60 .\" @FLTRND_NEARNEG
  61 .\" @FLTRND_NEARPOS
  62 .
  63 .\" @FLTF_NEG
  64 .\" @FLTF_INF
  65 .\" @FLTF_QNAN
  66 .\" @FLTF_SNAN
  67 .\" @FLTF_ZERO
  68 .\" @FLTF_NANMASK
  69 .\" @FLOATBITS_INIT
  70 .\" @fltfmt_initbits
  71 .\" @fltfmt_freebits
  72 .\" @fltfmt_allocfrac
  73 .\" @fltfmt_copybits
  74 .\" @fltfmt_round
  75 .
  76 .\" @FLTIF_HIDDEN
  77 .\" @fltfmt_f16
  78 .\" @fltfmt_f32
  79 .\" @fltfmt_f64
  80 .\" @fltfmt_f128
  81 .\" @fltfmt_mini
  82 .\" @fltfmt_bf16
  83 .\" @fltfmt_idblext80
  84 .
  85 .\" @fltfmt_encieee
  86 .\" @fltfmt_encf16
  87 .\" @fltfmt_encf32
  88 .\" @fltfmt_encf64
  89 .\" @fltfmt_encf128
  90 .\" @fltfmt_encmini
  91 .\" @fltfmt_encbf16
  92 .\" @fltfmt_encidblext80
  93 .\" @fltfmt_decieee
  94 .\" @fltfmt_decf16
  95 .\" @fltfmt_decf32
  96 .\" @fltfmt_decf64
  97 .\" @fltfmt_decf128
  98 .\" @fltfmt_decmini
  99 .\" @fltfmt_decbf16
 100 .\" @fltfmt_decidblext80
 101 .
 102 .\" @fltfmt_encflt
 103 .\" @fltfmt_encdbl
 104 .\" @fltfmt_encldbl
 105 .\" @fltfmt_decflt
 106 .\" @fltfmt_decdbl
 107 .\" @fltfmt_decldbl
 108 .
 109 .\" @fltfmt_flttof32l
 110 .\" @fltfmt_flttof32b
 111 .\" @fltfmt_dbltof64l
 112 .\" @fltfmt_dbltof64b
 113 .\" @fltfmt_f32ltoflt
 114 .\" @fltfmt_f32btoflt
 115 .\" @fltfmt_f64ltodbl
 116 .\" @fltfmt_f64btodbl
 117 .
 118 .\"--------------------------------------------------------------------------
 119 .SH NAME
 120 fltfmt \- floating-point format conversions
 121 .
 122 .\"--------------------------------------------------------------------------
 123 .SH SYNOPSIS
 124 .
 125 .nf
 126 .B "#define FLTERR_OK 0"
 127 .B "#define FLTERR_INVAL ..."
 128 .B "#define FLTERR_INEXACT ..."
 129 .B "#define FLTERR_UFLOW ..."
 130 .B "#define FLTERR_OFLOW ..."
 131 .B "#define FLTERR_REPR ..."
 132 .B "#define FLTERR_ALLERRS ..."
 133 .PP
 134 .ta 40n
 135 .B "#define FRPF_LOW 1u"
 136 .B "#define FRPF_HALF 2u"
 137 .B "#define FRPF_ODD 4u"
 138 .B "#define FRPF_NEG 8u"
 139 .B "#define FRPMASK_LOW 0xaaaau"
 140 .B "#define FRPMASK_HALF 0xccccu"
 141 .B "#define FRPMASK_ODD 0xf0f0u"
 142 .B "#define FRPMASK_NEG 0xff00u"
 143 .B "#define FRPMASK_INEXACT ... /* LOW | HALF */"
 144 .BI "unsigned FRPMASK_NEAR(unsigned " dir ");   /* HALF&(LOW | " dir ") */"
 145 .B "#define FLTRND_ZERO ...     /* 0 */"
 146 .B "#define FLTRND_PROJINF ...  /* INEXACT */"
 147 .B "#define FLTRND_NEGINF ...   /* INEXACT&NEG */"
 148 .B "#define FLTRND_POSINF ...   /* INEXACT&~NEG */"
 149 .B "#define FLTRND_EVEN ...     /* INEXACT&ODD */"
 150 .B "#define FLTRND_ODD ...      /* INEXACT&~ODD */"
 151 .B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */"
 152 .B "#define FLTRND_NEARODD ...  /* HALF&(LOW | ~ODD) */"
 153 .B "#define FLTRND_NEARZERO ... /* HALF&LOW */"
 154 .B "#define FLTRND_NEARINF ...  /* HALF */"
 155 .B "#define FLTRND_NEARNEG ...  /* HALF&(LOW | NEG) */"
 156 .B "#define FLTRND_NEARPOS ...  /* HALF&(LOW | ~NEG) */"
 157 .PP
 158 .ta 2n
 159 .B "#define FLTF_NEG ..."
 160 .B "#define FLTF_INF ..."
 161 .B "#define FLTF_QNAN ..."
 162 .B "#define FLTF_SNAN ..."
 163 .B "#define FLTF_ZERO ..."
 164 .B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
 165 .B "struct floatbits {"
 166 .B "    unsigned f;"
 167 .B "    int exp;"
 168 .B "    arena *a;"
 169 .B "    uint32 *frac;"
 170 .B "    unsigned n, fracsz;"
 171 .B "};"
 172 .B "#define FLOATBITS_INIT { ...\& };"
 173 .PP
 174 .BI "void fltfmt_initbits(struct floatbits *" x );
 175 .BI "void fltfmt_freebits(struct floatbits *" x );
 176 .BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
 177 .ta \w'\fBvoid fltfmt_copybits('u
 178 .BI "void fltfmt_copybits(struct floatbits *" z_out ,
 179 .BI "   const struct floatbits *" x );
 180 .ta \w'\fBvoid fltfmt_round('u
 181 .BI "void fltfmt_round(struct floatbits *" z_out ,
 182 .BI "   const struct floatbits *" x ,
 183 .BI "   unsigned " r ", unsigned " n );
 184 .PP
 185 .
 186 .ta 2n
 187 .B "#define FLTIF_HIDDEN ..."
 188 .B "struct fltfmt_ieeefmt {"
 189 .B "    unsigned f;"
 190 .B "    unsigned expwd;"
 191 .B "    unsigned prec;"
 192 .B "};"
 193 .B "const struct fltfmt_ieeefmt fltfmt_f16;"
 194 .B "const struct fltfmt_ieeefmt fltfmt_f32;"
 195 .B "const struct fltfmt_ieeefmt fltfmt_f64;"
 196 .B "const struct fltfmt_ieeefmt fltfmt_f128;"
 197 .B "const struct fltfmt_ieeefmt fltfmt_mini;"
 198 .B "const struct fltfmt_ieeefmt fltfmt_bf16;"
 199 .B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
 200 .PP
 201 .ta \w'\fBunsigned fltfmt_encieee('u
 202 .BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
 203 .BI "   uint32 *" z ", const struct floatbits *" x ,
 204 .BI "   unsigned " r ", unsigned " errmask );
 205 .ta \w'\fBunsigned fltfmt_encf16('u
 206 .BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
 207 .BI "   unsigned " r ", unsigned " errmask );
 208 .ta \w'\fBunsigned fltfmt_encf32('u
 209 .BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
 210 .BI "   unsigned " r ", unsigned " errmask );
 211 .ta \w'\fBunsigned fltfmt_encf64('u
 212 .BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
 213 .BI "   unsigned " r ", unsigned " errmask );
 214 .ta \w'\fBunsigned fltfmt_encf128('u
 215 .BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
 216 .BI "   unsigned " r ", unsigned " errmask );
 217 .ta \w'\fBunsigned fltfmt_encmini('u
 218 .BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
 219 .BI "   unsigned " r ", unsigned " errmask );
 220 .ta \w'\fBunsigned fltfmt_encbf16('u
 221 .BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
 222 .BI "   unsigned " r ", unsigned " errmask );
 223 .ta \w'\fBunsigned fltfmt_encidblext80('u
 224 .BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
 225 .BI "   const struct floatbits *" x ,
 226 .BI "   unsigned " r ", unsigned " errmask );
 227 .PP
 228 .ta \w'\fBunsigned fltfmt_decieee('u
 229 .BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
 230 .BI "   struct floatbits *" z_out ", const uint32 *" x );
 231 .BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
 232 .BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
 233 .BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
 234 .BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
 235 .BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
 236 .BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
 237 .ta \w'\fBunsigned fltfmt_decidblext80('u
 238 .BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
 239 .BI "   uint16 " se ", kludge64 " m );
 240 .PP
 241 .ta \w'\fBunsigned fltfmt_encflt('u
 242 .BI "unsigned fltfmt_encflt(float *" z_out ,
 243 .BI "   const struct floatbits *" x ", unsigned " r );
 244 .ta \w'\fBunsigned fltfmt_encdbl('u
 245 .BI "unsigned fltfmt_encdbl(double *" z_out ,
 246 .BI "   const struct floatbits *" x ", unsigned " r );
 247 .ta \w'\fBunsigned fltfmt_encldbl('u
 248 .BI "unsigned fltfmt_encldbl(long double *" z_out ,
 249 .BI "   const struct floatbits *" x ", unsigned " r );
 250 .ta \w'\fBunsigned fltfmt_decflt('u
 251 .BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
 252 .BI "   float *" x ", unsigned " r );
 253 .ta \w'\fBunsigned fltfmt_decdbl('u
 254 .BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
 255 .BI "   double *" x ", unsigned " r );
 256 .ta \w'\fBunsigned fltfmt_decldbl('u
 257 .BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
 258 .BI "   long double *" x ", unsigned " r );
 259 .PP
 260 .BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
 261 .BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
 262 .BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
 263 .BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
 264 .BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
 265 .BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
 266 .BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
 267 .BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
 268 .
 269 .\"--------------------------------------------------------------------------
 270 .SH DESCRIPTION
 271 .
 272 The
 273 .B "<mLib/fltfmt.h>"
 274 header file defines structures, macros, and functions
 275 for converting floating-point values between various formats,
 276 including the native floating-point formats
 277 and IEEE\ 754 and related formats.
 278 .
 279 .SS Error conditions
 280 Most of the functions in this module return an unsigned integer.
 281 A return value of zero means that no error occurred;
 282 set bits indicate various error conditions.
 283 .TP
 284 .B FLTERR_INVAL
 285 A binary input to be decoded contained an invalid bit pattern,
 286 e.g., an unnormalized input value with a nonminimal exponent.
 287 The function will have produced a reasonable output anyway,
 288 but the original value will not be recoverable from the result.
 289 .TP
 290 .B FLTERR_INEXACT
 291 The conversion was inexact.
 292 Converting the output back into the format of the input
 293 may not reproduce the original input value.
 294 This error flag is sometimes set conservatively.
 295 .TP
 296 .B FLTERR_UFLOW
 297 The conversion underflowed:
 298 a nonzero input was too tiny (in absolute value) to represent,
 299 and a zero result was returned.
 300 .TP
 301 .B FLTERR_OFLOW
 302 The conversion overflowed:
 303 a finite input was too huge (in absolute value) to represent,
 304 and either the appropriately signed infinity
 305 or largest-magnitude finite value
 306 was returned, determined by the requested rounding mode.
 307 .TP
 308 .B FLTERR_REPR
 309 The output format failed entirely to represent the input value.
 310 The result is zero if the input was a NaN,
 311 or the appropriately signed largest-magnitude finite value
 312 if the input was an infinity.
 313 .
 314 .SS Rounding modes
 315 The rounding system works as follows.
 316 There are four
 317 .I rounding predicates
 318 considered when a rounding decision is taken.
 319 These are determined from the unrounded input value
 320 .IR x ,
 321 and the two nearest rounded values
 322 .RI | u "|\ \*(<=\ |" x |
 323 and
 324 .RI | v "|\ >\ |" x |.
 325 The predicates are as follows.
 326 .TP
 327 .B FRPF_LOW
 328 If
 329 .IR x "\ \*(/=\ " u
 330 and
 331 .IR x "\ \*(/=\ (" u "\ +\ " v )/2,
 332 i.e.,
 333 .I x
 334 is neither equal to a rounded value,
 335 nor exactly halfway between two rounded values.
 336 This predicate is sometimes referred to as a `sticky bit'.
 337 .TP
 338 .B FRPF_HALF
 339 If
 340 .RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
 341 i.e.,
 342 .I x
 343 is halfway or more towards its larger rounded neighbour.
 344 .TP
 345 .B FRPF_ODD
 346 If the least significant digit of
 347 .I u
 348 is odd.
 349 In binary floating-point formats,
 350 this is just the least significant bit of
 351 .IR u .
 352 .TP
 353 .B FRPF_NEG
 354 If
 355 .I x
 356 is negative.
 357 .PP
 358 These four predicates are packed into a four-bit mask value
 359 .I rf
 360 between 0 and 15.
 361 A
 362 .I rounding mode
 363 is simply a 16-bit mask:
 364 if bit
 365 .I rf
 366 of the rounding-mode mask is set,
 367 then
 368 .I x
 369 is rounded to
 370 .IR v ;
 371 otherwise it is rounded to
 372 .IR u .
 373 That is, the rounding-mode mask is essentially a truth table.
 374 Rounding modes with
 375 .I set
 376 bits corresponding to situations where both
 377 .B FRPF_LOW
 378 and
 379 .B FRPF_HALF
 380 are false,
 381 i.e., where
 382 .I x
 383 is already a rounded value,
 384 are forbidden.
 385 .PP
 386 Some useful machinery is provided
 387 for constructing rounding-mode masks.
 388 .BR FRPMASK_LOW ,
 389 .BR FRPMASK_HALF ,
 390 .BR FRPMASK_ODD ,
 391 and
 392 .BR FRPMASK_NEG ,
 393 are masks with set bits corresponding to their respective predicates.
 394 Bitwise boolean logic can be applied to these masks
 395 in order to calculate the masks corresponding to
 396 the same logical expression applied to the individual predicates.
 397 .BR FRPMASK_INEXACT " holds if"
 398 .B LOW
 399 or
 400 .B HALF
 401 holds;
 402 i.e., if
 403 .IR x "\ \*(/=\ " u ;
 404 as mentioned above, only these bits may be set
 405 in a valid rounding-mode mask.
 406 .BI FRPMASK_NEAR( dir )
 407 is the mask for rounding to nearest with ties broken according to
 408 .IR dir ,
 409 which is another rounding-mode mask.
 410 The complete set of predefined masks is listed above in the synopsis,
 411 together with their description in terms of the basic predicates.
 412 The usual IEEE rounding mode is
 413 round-to-nearest/ties-to-even,
 414 denoted
 415 .BR FLTRND_NEAREVEN .
 416 This is likely a good option
 417 if there is no compelling argument for a different specific choice.
 418 .
 419 .SS The floatbits structure
 420 In order to avoid a combinatorial explosion in conversion operations,
 421 all the basic conversions involve,
 422 as source or target,
 423 a `common currency' format represented by the type
 424 .BR "struct floatbits" .
 425 .PP
 426 This structure consists of
 427 a set of flags
 428 .BR f ;
 429 a signed exponent
 430 .BR exp ;
 431 an
 432 .B arena
 433 pointer
 434 .BR a ;
 435 a pointer
 436 .B frac
 437 to a vector of
 438 .B uint32
 439 values;
 440 the length
 441 .B n
 442 of the
 443 .B frac
 444 vector; and
 445 the currently allocated size
 446 .B fracsz
 447 of the vector.
 448 Both
 449 .B n
 450 and
 451 .B fracsz
 452 count elements, not bytes.
 453 .PP
 454 Storage for
 455 .B frac
 456 comes from the arena
 457 .BR a .
 458 Only the first
 459 .B n
 460 words of
 461 .B frac
 462 are significant;
 463 .B frac[0]
 464 is the most significant word.
 465 The value represented by a
 466 .B struct floatbits
 467 is never changed by adding or removing zero-valued words
 468 at the end of the
 469 .B frac
 470 vector.
 471 It is always the case that
 472 .BR n "\ \*(<=\ " fracsz ;
 473 if
 474 .B fracsz
 475 is zero then
 476 .B frac
 477 may be a null pointer.
 478 .PP
 479 The interpretation of the
 480 .B exp
 481 and
 482 .B frac
 483 members depends on the flags set in
 484 .BR f .
 485 Apart from
 486 .BR FLTF_NEG ,
 487 the flags are
 488 .IR "mutually exclusive" :
 489 at most one flag may be set.
 490 .TP
 491 .B FLTF_NEG
 492 The value is negative.
 493 .TP
 494 .B FLTF_INF
 495 The value is positive or negative infinity.
 496 The
 497 .B exp
 498 and
 499 .B frac
 500 are ignored.
 501 .TP
 502 .BR FLTF_QNAN " and " FLTF_SNAN
 503 The value is a quiet or signalling not-a-number, respectively.
 504 The
 505 .B exp
 506 is ignored.
 507 The payload is stored in
 508 .BR frac ;
 509 the payload does not include the `quiet' bit.
 510 .TP
 511 .B FLTF_ZERO
 512 The number is zero.
 513 Negative zero is distinct from positive zero.
 514 The
 515 .B exp
 516 and
 517 .B frac
 518 are ignored.
 519 .IP "All non-sign bits clear"
 520 The value is a finite nonzero number.
 521 The
 522 .B frac
 523 holds the significand.
 524 The most significant significand bit must be set, so
 525 (a)\ the number must be nonzero, and
 526 (b)\ the significand is normalized.
 527 The significand is interpreted as a fraction
 528 .RI "1/2\ \*(<=\ " m "\ <\ 1."
 529 If
 530 .I e
 531 is the value of the
 532 .B exp
 533 member,
 534 and
 535 .I s
 536 is \-1 if
 537 .B FLTF_NEG
 538 is set
 539 or +1 if
 540 .B FLTF_NEG
 541 is clear,
 542 then the number represented is
 543 .IR s "\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
 544 .PP
 545 A
 546 .B struct floatbits
 547 can be initialized statically by
 548 .BR FLOATBITS_INIT ,
 549 or dynamically using the function
 550 .BR fltfmt_initbits .
 551 These are not quite the same:
 552 .B FLOATBITS_INIT
 553 initializes
 554 .B a
 555 to
 556 .BR &arena_stdlib ,
 557 while
 558 .B fltfmt_initbits
 559 sets it to the runtime value of
 560 .BR arena_global .
 561 With this exception,
 562 both forms of initialization set the value to (positive) zero;
 563 neither allocates any storage or other resources,
 564 leaving
 565 .B frac
 566 null.
 567 .PP
 568 The
 569 .B fltfmt_allocfrac
 570 function is given a pointer
 571 .I x
 572 to a
 573 .B struct floatbits
 574 and a length
 575 .IR n :
 576 it ensures that there is enough storage at
 577 .IB x ->frac
 578 for at least
 579 .I n
 580 words:
 581 if the current size is too small,
 582 then any existing buffer is discarded and a new one allocated
 583 from the arena
 584 .IB x ->a \fR;
 585 any existing contents of the buffer are lost.
 586 On exit,
 587 .IB x ->n
 588 is set to
 589 .IR n .
 590 .PP
 591 The
 592 .B fltfmt_freebits
 593 function
 594 frees a
 595 .B struct floatbits
 596 structure, releasing the storage held by
 597 .BR frac .
 598 .PP
 599 The
 600 .B fltfmt_copybits
 601 function simply copies its input
 602 .I x
 603 to its output
 604 .IR z_out ;
 605 both must refer to initialized
 606 .B struct floatbits
 607 structures.
 608 If
 609 .I z_out
 610 and
 611 .I x
 612 are equal, then nothing happens.
 613 .PP
 614 Finally, the
 615 .B fltfmt_round
 616 function rounds the value in the
 617 .B struct floatbits
 618 structure
 619 .I x
 620 to
 621 .I n
 622 bits using the rounding mode
 623 .IR r ;
 624 the result is written to
 625 .IR z_out ;
 626 it is permitted for
 627 .I z_out
 628 to be equal to
 629 .IR x .
 630 If
 631 .I x
 632 is a zero or infinity,
 633 then the output is equal to the input,
 634 as if
 635 .B fltfmt_copybits
 636 had been called instead.
 637 If
 638 .I x
 639 is a NaN,
 640 then the payload is simply truncated to
 641 .I n
 642 bits, without regard to the rounding mode.
 643 Otherwise, the input is nonzero and finite;
 644 the significand is rounded to
 645 .I n
 646 bits according to the rounding mode.
 647 In all cases, the return value is
 648 zero if the output is equal to the input,
 649 or
 650 .B FLTERR_INEXACT
 651 if the rounded result is not equal to the input.
 652 .
 653 .SS IEEE and related formats
 654 An IEEE floating-point format is characterized by three parameters:
 655 the
 656 .I "exponent width"
 657 .IR w ,
 658 the
 659 .I "precision"
 660 .IR p ,
 661 and
 662 the
 663 .I "unit width"
 664 .IR h .
 665 .PP
 666 The encoded value consists of
 667 .IR p "\ +\ " w "\ +\ " h
 668 bits.
 669 This is divided, from the most significant bit downwards,
 670 into a
 671 .I "sign bit"
 672 .IR s ,
 673 a
 674 .IR w -bit
 675 .I "biased exponent"
 676 .IR e \*',
 677 a
 678 .IR h -bit
 679 .I "unit bit"
 680 .IR u ,
 681 and a
 682 .RI ( p "\ \-\ 1)-bit"
 683 .I fraction
 684 .IR f .
 685 The
 686 .I "exponent bias"
 687 is
 688 .IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
 689 the true exponent
 690 .I e
 691 is calculated from the biased exponent by
 692 .IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
 693 The unit and fraction fields are usually interpreted as denoting
 694 a significand
 695 .IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
 696 with
 697 .RI "0\ \*(<=\ " m "\ <\ 2."
 698 If
 699 .I h
 700 is zero,
 701 the value of the unit bit
 702 .I u
 703 is implied by the exponent as described below.
 704 The encoded value is interpreted as follows.
 705 .hP \*o
 706 If
 707 .IR e "\ =\ \-" e \*(us0\*(ue
 708 then the value is zero or a subnormal,
 709 with the value
 710 .RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e +1\*(se.
 711 In particular,
 712 if
 713 .IR m "\ =\ 0"
 714 then the value is positive or negative zero,
 715 according to the sign bit
 716 .IR s .
 717 If
 718 .I h
 719 is zero then
 720 .IR u "\ =\ 0;"
 721 if
 722 .I h
 723 is nonzero
 724 but
 725 .IR u "\ \*(/=\ 0"
 726 then the encoding is invalid:
 727 decoding returns
 728 .BR FLTERR_INVAL ,
 729 but the result will be as described.
 730 .hP \*o
 731 If
 732 .RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1"
 733 then the value is a (supposedly) normal number
 734 .RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
 735 If
 736 .I h
 737 is zero then
 738 .IR u "\ =\ 1;"
 739 if
 740 .I h
 741 is nonzero
 742 but
 743 .IR u "\ =\ 0"
 744 then the encoding is invalid:
 745 decoding returns
 746 .BR FLTERR_INVAL ,
 747 but the result will be as described.
 748 .hP \*o
 749 If
 750 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
 751 and
 752 .IR f "\ =\ 0"
 753 then the value is positive or negative infinity,
 754 according to the sign bit
 755 .IR s .
 756 If
 757 .I h
 758 is nonzero and
 759 .IR u "\ =\ 0"
 760 then the encoding is invalid:
 761 decoding returns
 762 .BR FLTERR_INVAL ,
 763 but the result will still be infinity.
 764 .hP \*o
 765 If
 766 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
 767 and
 768 .IR f "\ \*(/=\ 0"
 769 then the value is not-a-number (NaN).
 770 The most significant bit of
 771 .I f
 772 is the `quiet bit':
 773 if the bit is set, the value is a `quiet NaN';
 774 if the bit is clear, the value is a `signalling NaN'.
 775 (This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1;
 776 it has the advantage that a signalling NaN can be `quieted'
 777 by setting the most significant fraction bit;
 778 HP-PA and older MIPS processors use the opposite convention
 779 for distinguishing quiet and signalling NaNs,
 780 but a signalling NaN with all but the most significant
 781 fraction bit zero cannot be `quieted' by clearing the
 782 most significant bit, since the resulting encoding denotes
 783 an infinity, not a QNaN.)
 784 The remaining bits of
 785 .I f
 786 form the
 787 .IR payload .
 788 Positive and negative NaN values are distinguished,
 789 with sign determined by the sign bit.
 790 If
 791 .I h
 792 is nonzero and
 793 .IR u "\ =\ 0"
 794 then the encoding is invalid:
 795 decoding returns
 796 .BR FLTERR_INVAL ,
 797 but the result will still be a NaN;
 798 the unit bit does not affect the NaN payload.
 799 .PP
 800 An IEEE format is described by the type
 801 .BR "struct fltfmt_ieeefmt" .
 802 This has three members:
 803 .TP
 804 .B f
 805 A flags word.
 806 If
 807 .B FLTIF_HIDDEN
 808 is set, then the format uses a `hidden bit' convention:
 809 in the notation above
 810 .IR h "\ =\ 0;"
 811 if the flag is clear,
 812 the format has an explicit unit bit, and
 813 .IR h "\ =\ 1."
 814 .TP
 815 .B expwd
 816 The exponent width;
 817 in the notation above this is
 818 .IR w .
 819 .TP
 820 .B prec
 821 The precision;
 822 in the notation above this is
 823 .IR p .
 824 .PP
 825 The following IEEE format descriptions are already defined.
 826 .TP
 827 .B "fltfmt_f16"
 828 The IEEE\ 754 Binary16 format, with
 829 .IR w "\ =\ 5,"
 830 .IR p "\ =\ 11,"
 831 and
 832 .IR h "\ =\ 0."
 833 .TP
 834 .B "fltfmt_f32"
 835 The IEEE\ 754 Binary32 (`single precision') format, with
 836 .IR w "\ =\ 8,"
 837 .IR p "\ =\ 24,"
 838 and
 839 .IR h "\ =\ 0."
 840 .TP
 841 .B "fltfmt_f64"
 842 The IEEE\ 754 Binary64 (`double precision') format, with
 843 .IR w "\ =\ 11,"
 844 .IR p "\ =\ 53,"
 845 and
 846 .IR h "\ =\ 0."
 847 .TP
 848 .B "fltfmt_f128"
 849 The IEEE\ 754 Binary128 (`quad precision') format, with
 850 .IR w "\ =\ 15,"
 851 .IR p "\ =\ 113,"
 852 and
 853 .IR h "\ =\ 0."
 854 .TP
 855 .B "fltfmt_mini"
 856 An eight-bit `minifloat' format, with
 857 .IR w "\ =\ 4,"
 858 .IR p "\ =\ 4,"
 859 and
 860 .IR h "\ =\ 0."
 861 .TP
 862 .B "fltfmt_bf16"
 863 The Google `BFloat16' format, with
 864 .IR w "\ =\ 8,"
 865 .IR p "\ =\ 8,"
 866 and
 867 .IR h "\ =\ 0."
 868 .TP
 869 .B "fltfmt_idblext80"
 870 The Intel 8087 80-bit `double extended' format, with
 871 .IR w "\ =\ 15,"
 872 .IR p "\ =\ 64,"
 873 and
 874 .IR h "\ =\ 1."
 875 .PP
 876 The
 877 .B fltfmt_encieee
 878 and
 879 .B fltfmt_decieee
 880 functions convert between IEEE and related formats
 881 and the internal
 882 .B struct floatbits
 883 representation.
 884 They respectively encode or decode an IEEE-format value,
 885 as described above,
 886 from a vector of
 887 .B uint32
 888 words,
 889 most-significant word first
 890 \(en so the sign bit is in the first word.
 891 For formats whose size is not a multiple of 32,
 892 the encoding is
 893 .IR right-aligned :
 894 the least significant bit of the fraction
 895 is in the least significant bit of the last word in the vector.
 896 .PP
 897 The
 898 .B fltfmt_encieee
 899 function encodes an IEEE-format value.
 900 The function is given five arguments:
 901 a pointer
 902 .I fmt
 903 to the IEEE format description,
 904 a pointer
 905 .I z
 906 to a sufficiently long vector of 32-bit words
 907 in which to store the encoded value,
 908 a pointer
 909 .I x
 910 to the
 911 .B struct floatbits
 912 holding the value to encode,
 913 a rounding mode
 914 .IR r ,
 915 and an error mask
 916 .IR errmask .
 917 If the input is a NaN,
 918 then the payload is truncated to fit
 919 regardless of the rounding mode,
 920 discarding low-significant bits;
 921 if the input is a finite value,
 922 then the significand is rounded to fit
 923 according to the requested rounding mode.
 924 The possible errors are
 925 .B FLTERR_UFLOW
 926 if the value is unrepresentably tiny,
 927 .B FLTERR_OFLOW
 928 if the value is unrepresentably huge,
 929 and
 930 .B FLTERR_INEXACT
 931 if the encoding fails to preserve the input value exactly;
 932 hence
 933 .B FLTERR_INEXACT
 934 is set whenever
 935 .B FLTERR_OFLOW
 936 or
 937 .B FLTERR_UFLOW
 938 is set,
 939 or if bits are lost due to NaN-payload truncation or rounding.
 940 If, during encoding,
 941 an error is encountered,
 942 processing stops immediately
 943 unless the corresponding bit of
 944 .I errmask
 945 is set.
 946 .PP
 947 The
 948 .B fltfmt_decieee
 949 function decodes an IEEE-format value.
 950 The function is given three arguments:
 951 a pointer
 952 .I fmt
 953 to the IEEE format description,
 954 a pointer
 955 .I z_out
 956 to the initialized
 957 .B struct floatbits
 958 to fill in, and
 959 a pointer
 960 .I x
 961 to the IEEE-encoded value to decode,
 962 in a vector of 32-bit words as described above.
 963 The only error that can occur during decoding is
 964 .BR FLTERR_INVAL :
 965 as described above,
 966 this occurs in non-hidden-bit formats
 967 when the unit bit does not match that implied by the exponent;
 968 the result is returned anyway,
 969 with the unit bit interpreted as encoded in finite numbers,
 970 and discarded in infinities and NaNs.
 971 .PP
 972 For each of the format
 973
 974
 975
 976 .
 977 .\"--------------------------------------------------------------------------
 978 .SH "SEE ALSO"
 979 .
 980 .BR bits (3),
 981 .BR mLib (3).
 982 .
 983 .\"--------------------------------------------------------------------------
 984 .SH AUTHOR
 985 .
 986 Mark Wooding, <mdw@distorted.org.uk>
 987 .
 988 .\"----- That's all, folks --------------------------------------------------