chiark - git - mdw - mLib/blob - utils/fltfmt.h

   1 /* -*-c-*-
   2  *
   3  * Floating-point format conversions
   4  *
   5  * (c) 2024 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 #ifndef MLIB_FLTFMT_H
  29 #define MLIB_FLTFMT_H
  30
  31 #ifdef __cplusplus
  32   extern "C" {
  33 #endif
  34
  35 /*----- Header files ------------------------------------------------------*/
  36
  37 #ifndef MLIB_ARENA_H
  38 #  include "arena.h"
  39 #endif
  40
  41 #ifndef MLIB_BITS_H
  42 #  include "bits.h"
  43 #endif
  44
  45 /*----- Data structures ---------------------------------------------------*/
  46
  47 /* Error codes. */
  48 #define FLTERR_OK 0x0000u               /* no trouble */
  49 #define FLTERR_INVAL 0x0001u            /* technically invalid encoding */
  50 #define FLTERR_INEXACT 0x0002u          /* result is inexect */
  51 #define FLTERR_UFLOW 0x0004u            /* underflowed to zero */
  52 #define FLTERR_OFLOW 0x0008u            /* overflowed to ±∞ or max finite */
  53 #define FLTERR_REPR 0x0010              /* not representable */
  54 #define FLTERR_ALLERRS 0xffff           /* all errors */
  55
  56 /* Predicates considered for rounding. */
  57 #define FRPF_LOW 0x0001u             /* lost bits not exactly zero or half */
  58 #define FRPF_HALF 0x0002u               /* lost a half or more */
  59 #define FRPF_ODD 0x0004u                /* final place is currently odd */
  60 #define FRPF_NEG 0x0008u                /* number is negative */
  61
  62 /* Rounding policies.  These are represented as a 16-bit truth table applied
  63  * to the predicate bits listed above.  The following are the mask values
  64  * corresponding to the predicate bits being set; a set bit means that the
  65  * number should be rounded away from zero.
  66  */
  67 #define FRPMASK_LOW 0xaaaau             /* lost nonzero bits below half */
  68 #define FRPMASK_HALF 0xccccu            /* lost a half or more */
  69 #define FRPMASK_ODD 0xf0f0u             /* final place is odd */
  70 #define FRPMASK_NEG 0xff00u             /* number is negative */
  71
  72 /* Useful constructed masks from the above. */
  73 #define FRPMASK_INEXACT (FRPMASK_LOW | FRPMASK_HALF) /* lost nonzero bits */
  74 #define FRPMASK_NEAR(dir) (FRPMASK_HALF&(FRPMASK_LOW | (dir))) /* nearest, ties per dir */
  75
  76 /* Generally useful rounding criteria. */
  77 #define FLTRND_ZERO 0                   /* towards zero (truncate) */
  78 #define FLTRND_PROJINF FRPMASK_INEXACT  /* towards (projective) ±∞ */
  79 #define FLTRND_NEGINF (FRPMASK_INEXACT&FRPMASK_NEG) /* down, towards -∞ */
  80 #define FLTRND_POSINF (FRPMASK_INEXACT&~FRPMASK_NEG) /* up, towards +∞ */
  81 #define FLTRND_EVEN (FRPMASK_INEXACT&FRPMASK_ODD) /* to even */
  82 #define FLTRND_ODD (FRPMASK_INEXACT&~FRPMASK_ODD) /* to odd */
  83 #define FLTRND_NEAREVEN FRPMASK_NEAR(FLTRND_EVEN) /* nearest, ties to even */
  84 #define FLTRND_NEARODD FRPMASK_NEAR(FLTRND_ODD) /* nearest, ties to odd */
  85 #define FLTRND_NEARZERO FRPMASK_NEAR(FLTRND_ZERO) /* nearest, ties to zero */
  86 #define FLTRND_NEARINF FRPMASK_NEAR(FLTRND_PROJINF) /* nearest, ties to ±∞ */
  87 #define FLTRND_NEARNEG FRPMASK_NEAR(FLTRND_NEGINF) /* nearest, ties to -∞ */
  88 #define FLTRND_NEARPOS FRPMASK_NEAR(FLTRND_POSINF) /* nearest, ties to +∞ */
  89
  90 struct floatbits {
  91   /* A decoded floating-point number.
  92    *
  93    * The flags do most of the heavy lifting here.
  94    *
  95    *   * @FLTF_ZERO@ is set if the number is zero.  The @frac@ and @exp@ are
  96    *     ignored.
  97    *
  98    *   * @FLTF_NEG@ is set if the number is negative.  The representation is
  99    *     signed magnitude, because that seems basically universal among
 100    *     floating-point formats.  Negative zero is a thing.
 101    *
 102    *   * @FLTF_SNAN@ and @FLTF_QNAN@ are set if the value is, respectively, a
 103    *     signalling or quiet not-a-number.  The @frac@ holds the payload,
 104    *     left-aligned, excluding the quiet bit; @exp@ is ignored.
 105    *
 106    *   * @FLTF_INF@ is set if the number is positive or negative infinity.
 107    *     Projective infinity is not representable.  The @frac@ and @exp@ are
 108    *     ignored.
 109    *
 110    * The @frac@ field contains the fractional significand, big-end first;
 111    * either the number is identically (positive or negative) zero, or the
 112    * most significant bit of @frac[0]@ is set, and the significand lies
 113    * between a half (inclusive) and one (exclusive).  The @exp@ is the power
 114    * of two by which the significand is to be scaled.
 115    *
 116    * The essential convention for @frac@ is that the value is unchanged if
 117    * zero-valued words are added or removed at the end.
 118    */
 119
 120   unsigned f;                           /* flags */
 121 #define FLTF_NEG 0x0001u                /*   number is negative */
 122 #define FLTF_INF 0x0002u                /*   number is infinite */
 123 #define FLTF_QNAN 0x0004u               /*   quiet not-a-number */
 124 #define FLTF_SNAN 0x0008u               /*   signalling not-a-number */
 125 #define FLTF_ZERO 0x0010u               /*   number is zero */
 126 #define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN) /* any kind of NaN */
 127   int exp;                              /* exponent, base 2 */
 128   arena *a;                             /* memory arena */
 129   uint32 *frac;                         /* fraction */
 130   unsigned n, fracsz;                   /* fraction limbs used/allocated */
 131 };
 132 #define FLOATBITS_INIT { FLTF_ZERO, 0, &arena_stdlib, 0, 0, 0 }
 133
 134 /*----- General floating-point hacking ------------------------------------*/
 135
 136 /* --- @fltfmt_initbits@ --- *
 137  *
 138  * Arguments:   @struct floatbits *x@ = pointer to structure to initialize
 139  *
 140  * Returns:     ---
 141  *
 142  * Use:         Dynamically initialize @x@ to (positive) zero so that it can
 143  *              be used as the destination operand by other operations.  This
 144  *              doesn't allocate resources and cannot fail.  The
 145  *              @FLOATBITS_INIT@ macro is a suitable static initializer for
 146  *              performing the same task.
 147  */
 148
 149 extern void fltfmt_initbits(struct floatbits */*x*/);
 150
 151 /* --- @fltfmt_freebits@ --- *
 152  *
 153  * Arguments:   @struct floatbits *x@ = pointer to structure to free
 154  *
 155  * Returns:     ---
 156  *
 157  * Use:         Releases the memory held by @x@.  Afterwards, @x@ is a valid
 158  *              (positive) zero, but can safely be discarded.
 159  */
 160
 161 extern void fltfmt_freebits(struct floatbits */*x*/);
 162
 163 /* --- @fltfmt_allocfrac@ --- *
 164  *
 165  * Arguments:   @struct floatbits *x@ = structure to adjust
 166  *              @unsigned n@ = number of words required
 167  *
 168  * Returns:     ---
 169  *
 170  * Use:         Reallocate the @frac@ vector so that it has space for at
 171  *              least @n@ 32-bit words, and set @x->n@ equal to @n@.  If the
 172  *              allocated size is already @n@ or greater, then just update
 173  *              the active length @x->n@ and return; otherwise, any existing
 174  *              vector is discarded and a fresh, larger one allocated.
 175  */
 176
 177 extern void fltfmt_allocfrac(struct floatbits */*x*/, unsigned /*n*/);
 178
 179 /* --- @fltfmt_copybits@ --- *
 180  *
 181  * Arguments:   @struct floatbits *z_out@ = where to leave the result
 182  *              @const struct floatbits *x@ = source to copy
 183  *
 184  * Returns:     ---
 185  *
 186  * Use:         Make @z_out@ be a copy of @x@.  If @z_out@ is the same object
 187  *              as @x@ then do nothing.
 188  */
 189
 190 extern void fltfmt_copybits(struct floatbits */*z_out*/,
 191                             const struct floatbits */*x*/);
 192
 193 /* --- @fltfmt_round@ --- *
 194  *
 195  * Arguments:   @struct floatbits *z_out@ = destination (may equal source)
 196  *              @const struct floatbits *x@ = source
 197  *              @unsigned r@ = rounding mode (@FLTRND_...@ code)
 198  *              @unsigned n@ = nonzero number of bits to leave
 199  *
 200  * Returns:     A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
 201  *              rounding discarded some nonzero value bits, or @FLTERR_OK@ if
 202  *              rounding was unnecessary.
 203  *
 204  * Use:         Rounds a floating-point value to a given number of
 205  *              significant bits, using the given rounding rule.
 206  */
 207
 208 extern unsigned fltfmt_round(struct floatbits */*z_out*/,
 209                              const struct floatbits */*x*/,
 210                              unsigned /*r*/, unsigned /*n*/);
 211
 212 /*----- IEEE formats ------------------------------------------------------*/
 213
 214 struct fltfmt_ieeefmt {
 215   /* Description of a binary IEEE floating-point format.
 216    *
 217    * An IEEE binary floating-point encoding is split into three fields,
 218    * called %$\sigma$%, %$e'$%, and %$m$%.
 219    *
 220    * The %$\sigma$% field encodes the sign as a single bit: if %$\sigma = 0$%
 221    * then the value is nonnegative; if %$\sigma = 1$% then the value is
 222    * negative.  Signed-magnitude encoding is used: if the rest of the
 223    * encoding represents a (necessarily nonnegative) value %$x$% then the
 224    * signed value is %$(-1)^\sigma \cdot x$%.
 225    *
 226    * The %$e'$% field encodes the exponent in a field of %$w$% bits.  The
 227    * true exponent %$e = e' - e_0$%, where %$e_0 = 2^{w-1} - 1$% is the
 228    * %%\emph{exponent bias}%%.  The maximum exponent for finite values is
 229    * %$e_{\text{max}} = 2^w - 2 - e_0 = 2^{w-1} - 1$%, which is
 230    * coincidentally equal to %$e_0$%; and the minimum exponent for
 231    * %%\emph{normal}%% finite values is %$e_{\text{min}} = 1 - e_0 = {}$%
 232    * %$2 - 2^{w-1}$%.  The maximum exponent value %$2^w - 1$% denotes
 233    * infinities and NaN values, while the minimum value denotes zeros and
 234    * subnormal values.
 235    *
 236    * If a `hidden-bit' convention is used (@FLTIF_HIDDEN@ is set in @f@),
 237    * then %$h = 1$%; otherwise, %$h = 0$%.
 238    *
 239    * The %$m$% field encodes the %$p$%-bit %%\emph{significand}%%.  If a
 240    * `hidden-bit' convention is used then the %$m$% field is actually %$p -
 241    * 1$% bits wide; otherwise, it is %$p$% bits.
 242    *
 243    *   * If %$e_{\text{min}} \le e \le e_{\text{max}}$% then the encoding
 244    *     represents a %%\emph{normal}%% value, specifically the value
 245    *     %$x = (-1)^\sigma \cdot (h + m/2^{p-1}) \cdot 2^e$%.  In formats
 246    *     which do not use the hidden-bit convention, the most significant bit
 247    *     of %$m$% must be set; we return @FLTERR_INVAL@ for other
 248    *     encodings, and interpret the `unnormal' value as encoded.
 249    *
 250    *   * If %$e = e_{\text{min}} - 1$% then the encoding represents (signed)
 251    *     zero if %$m = 0$%, or a %%\emph{subnormal}%% value %$x = (-1)^\sigma
 252    *     \cdot m/2^{p-1} \cdot 2^{e_{\text{min}}}$%.  Note that, in formats
 253    *     which do not use the hidden-bit convention, the unit bit should be
 254    *     clear; we return @FLTERR_INVAL@ for other encodings, and interpret
 255    *     the `pseudo-denormal' value as encoded.
 256    *
 257    *   * If %$e = e_{\text{max}} + 1$% then the encoding represents
 258    *     %$(-1)^\sigma \cdot \infty$% if %$m = 0$%, or a not-a-number value
 259    *     (NaN) with payload %$m \ne 0$%.  A %%\emph{quiet}%% NaN has bit
 260    *     %$p - 2$% set in %$m$%; a signalling NaN has this bit reset.  Note
 261    *     that some platform's native format reverses this convention, but
 262    *     this is handled in code which deals with native formats: the
 263    *     interchange formats described here always indicate quiet NaNs by
 264    *     setting the bit.  In formats which do not use the hidden-bit
 265    *     convention, the unit bit %$p - 1$% is ignored.
 266    */
 267
 268   unsigned f;                           /* flags */
 269 #define FLTIF_HIDDEN 1u                 /*   unit bit is implicit */
 270   unsigned expwd;                       /* exponent field width %$w$% */
 271   unsigned prec;                        /* precision %$p$% */
 272 };
 273
 274 /* IEEE (and related) format descriptions. */
 275 extern const struct fltfmt_ieeefmt
 276   fltfmt_f16, fltfmt_f32, fltfmt_f64, fltfmt_f128,
 277   fltfmt_mini, fltfmt_bf16, fltfmt_idblext80;
 278
 279 /* --- @fltfmt_encieee@ --- *
 280  *
 281  * Arguments:   @const struct fltfmt_ieeefmt *fmt@ = format description
 282  *              @uint32 *z@ = output vector
 283  *              @const struct floatbits *x@ = value to encode
 284  *              @unsigned r@ = rounding mode
 285  *              @unsigned errmask@ = error mask
 286  *
 287  * Returns:     Error flags (@FLTERR_...@).
 288  *
 289  * Use:         Encode a floating-point value in an IEEE format.  This is the
 290  *              machinery shared by the @fltfmt_enc...@ functions for
 291  *              encoding IEEE-format values.  Most of the arguments and
 292  *              behaviour are as described for those functions.
 293  *
 294  *              The encoded value is right-aligned and big-endian; i.e., the
 295  *              sign bit ends up in @z[0]@, and the least significant bit of
 296  *              the significand ends up in the least significant bit of
 297  *              @z[n - 1]@.
 298  */
 299
 300 extern unsigned fltfmt_encieee(const struct fltfmt_ieeefmt */*fmt*/,
 301                                uint32 */*z*/, const struct floatbits */*x*/,
 302                                unsigned /*r*/, unsigned /*errmask*/);
 303
 304 /* --- @fltfmt_encTY@ --- *
 305  *
 306  * Arguments:   @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
 307  *                      @kludge64 *z_out@ = where to put the encoded value
 308  *              @uint16 *se_out@, @kludge64 *m_out@ = where to put the
 309  *                      encoded sign-and-exponent and significand
 310  *              @const struct floatbits *x@ = value to encode
 311  *              @unsigned r@ = rounding mode
 312  *              @unsigned errmask@ = error mask
 313  *
 314  * Returns:     Error flags (@FLTERR_...@).
 315  *
 316  * Use:         Encode a floating-point value in an IEEE (or IEEE-adjacent)
 317  *              format.
 318  *
 319  *              If an error is encountered during the encoding, and the
 320  *              corresponding bit of @errmask@ is clear, then processing
 321  *              stops immediately and the error is returned; if the bit is
 322  *              set, then processing continues as described below.
 323  *
 324  *              The @TY@ may be
 325  *
 326  *                * @mini@ for the 8-bit `1.4.3 minifloat' format, with
 327  *                  four-bit exponent and four-bit significand, represented
 328  *                  as a single octet;
 329  *
 330  *                * @bf16@ for the Google `bfloat16' format, with eight-bit
 331  *                  exponent and eight-bit significand, represented as a
 332  *                  @uint16@;
 333  *
 334  *                * @f16@ for the IEEE `binary16' format, with five-bit
 335  *                  exponent and eleven-bit significand, represented as a
 336  *                  @uint16@;
 337  *
 338  *                * @f32@ for the IEEE `binary32' format, with eight-bit
 339  *                  exponent and 24-bit significand, represented as a
 340  *                  @uint32@;
 341  *
 342  *                * @f64@ for the IEEE `binary64' format, with eleven-bit
 343  *                  exponent and 53-bit significand, represented as a
 344  *                  @kludge64@;
 345  *
 346  *                * @f128@ for the IEEE `binary128' format, with fifteen-bit
 347  *                  exponent and 113-bit significand, represented as four
 348  *                  @uint32@ limbs, most significant first; or
 349  *
 350  *                * @idblext80@ for the Intel 80-bit `double extended'
 351  *                  format, with fifteen-bit exponent and 64-bit significand
 352  *                  with no hidden bit, represented as a @uint16 se@
 353  *                  holding the sign and exponent, and a @kludge64 m@
 354  *                  holding the significand.
 355  *
 356  *              Positive and negative zero and infinity are representable
 357  *              exactly.
 358  *
 359  *              Following IEEE recommendations (and most implementations),
 360  *              the most significant fraction bit of a quiet NaN is set; this
 361  *              bit is clear in a signalling NaN.  The most significant
 362  *              payload bits of a NaN, held in the top bits of @x->frac[0]@,
 363  *              are encoded in the output significand following the `quiet'
 364  *              bit.  If the chosen format's significand field is too small
 365  *              to accommodate all of the set payload bits then the
 366  *              @FLTERR_INEXACT@ error bit is set and, if masked, the
 367  *              excess payload bits are discarded.  No rounding of NaN
 368  *              payloads is performed.
 369  *
 370  *              Otherwise, the input value is finite and nonzero.  If the
 371  *              significand cannot be represented exactly then the
 372  *              @FLTERR_INEXACT@ error bit is set, and, if masked, the value
 373  *              will be rounded (internally -- the input @x@ is not changed).
 374  *              If the (rounded) value's exponent is too large to represent,
 375  *              then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
 376  *              set and, if masked, the result is either the (absolute)
 377  *              largest representable finite value or infinity, with the
 378  *              appropriate sign, chosen according to the rounding mode.  If
 379  *              the exponent is too small to represent, then the
 380  *              @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
 381  *              if masked, the result is either the (absolute) smallest
 382  *              nonzero value or zero, with the appropriate sign, chosen
 383  *              according to the rounding mode.
 384  */
 385
 386 extern unsigned fltfmt_encmini(octet */*z_out*/,
 387                                const struct floatbits */*x*/,
 388                                unsigned /*r*/, unsigned /*errmask*/);
 389
 390 extern unsigned fltfmt_encbf16(uint16 */*z_out*/,
 391                                const struct floatbits */*x*/,
 392                                unsigned /*r*/, unsigned /*errmask*/);
 393
 394 extern unsigned fltfmt_encf16(uint16 */*z_out*/,
 395                               const struct floatbits */*x*/,
 396                               unsigned /*r*/, unsigned /*errmask*/);
 397
 398 extern unsigned fltfmt_encf32(uint32 */*z_out*/,
 399                               const struct floatbits */*x*/,
 400                               unsigned /*r*/, unsigned /*errmask*/);
 401
 402 extern unsigned fltfmt_encf64(kludge64 */*z_out*/,
 403                               const struct floatbits */*x*/,
 404                               unsigned /*r*/, unsigned /*errmask*/);
 405
 406 extern unsigned fltfmt_encf128(uint32 */*z_out*/,
 407                                const struct floatbits */*x*/,
 408                                unsigned /*r*/, unsigned /*errmask*/);
 409
 410 extern unsigned fltfmt_encidblext80(uint16 */*se_out*/, kludge64 */*m_out*/,
 411                                     const struct floatbits */*x*/,
 412                                     unsigned /*r*/, unsigned /*errmask*/);
 413
 414 /* --- @fltfmt_decieee@ --- *
 415  *
 416  * Arguments:   @const struct fltfmt_ieeefmt *fmt@ = format description
 417  *              @struct floatbits *z_out@ = output decoded representation
 418  *              @const uint32 *x@ = input encoding
 419  *
 420  * Returns:     Error flags (@FLTERR_...@).
 421  *
 422  * Use:         Decode a floating-point value in an IEEE format.  This is the
 423  *              machinery shared by the @fltfmt_dec...@ functions for
 424  *              decoding IEEE-format values.  Most of the arguments and
 425  *              behaviour are as described for those functions.
 426  *
 427  *              The encoded value should be right-aligned and big-endian;
 428  *              i.e., the sign bit is found in @x[0]@, and the least
 429  *              significant bit of the significand is found in the least
 430  *              significant bit of @x[n - 1]@.
 431  */
 432
 433 extern unsigned fltfmt_decieee(const struct fltfmt_ieeefmt */*fmt*/,
 434                                struct floatbits */*z_out*/,
 435                                const uint32 */*x*/);
 436
 437 /* --- @fltfmt_decTY@ --- *
 438  *
 439  * Arguments:   @struct floatbits *z_out@ = storage for the result
 440  *              @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
 441  *                      encoded input
 442  *              @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
 443  *                      significand
 444  *
 445  * Returns:     Error flags (@FLTERR_...@).
 446  *
 447  * Use:         Decode a floating-point value from an IEEE (or IEEE-adjacent)
 448  *              format.
 449  *
 450  *              The options for @TY@ are as documented for the encoding
 451  *              functions above.
 452  *
 453  *              In formats without a hidden bit -- currently only @idblext80@
 454  *              -- not all bit patterns are valid encodings.  If the explicit
 455  *              unit bit is set when the exponent field is all-bits-zero, or
 456  *              clear when the exponent field is not all-bits-zero, then the
 457  *              @FLTERR_INVAL@ error bit is set.  If the exponent is all-
 458  *              bits-set, denoting infinity or a NaN, then the unit bit is
 459  *              otherwise ignored -- in particular, it does not affect the
 460  *              NaN payload, or even whether the input encodes a NaN or
 461  *              infinity.  Otherwise, the unit bit is considered significant,
 462  *              and the result is normalized as one would expect.
 463  *              Consequently, biased exponent values 0 and 1 are distinct
 464  *              only with respect to which bit patterns are considered valid,
 465  *              and not with respect to the set of values denoted.
 466  */
 467
 468 extern unsigned fltfmt_decmini(struct floatbits */*z_out*/, octet /*x*/);
 469
 470 extern unsigned fltfmt_decbf16(struct floatbits */*z_out*/, uint16 /*x*/);
 471
 472 extern unsigned fltfmt_decf16(struct floatbits */*z_out*/, uint16 /*x*/);
 473
 474 extern unsigned fltfmt_decf32(struct floatbits */*z_out*/, uint32 /*x*/);
 475
 476 extern unsigned fltfmt_decf64(struct floatbits */*z_out*/, kludge64 /*x*/);
 477
 478 extern unsigned fltfmt_decf128(struct floatbits */*z_out*/,
 479                                const uint32 */*x*/);
 480
 481 extern unsigned fltfmt_decidblext80(struct floatbits */*z_out*/,
 482                                     uint16 /*se*/, kludge64 /*m*/);
 483
 484 /*----- Native formats ----------------------------------------------------*/
 485
 486 /* Hacking for platforms which ill-advisedly have the opposite sense for the
 487  * quiet NaN bit.
 488  *
 489  * Obviously we toggle the quiet bit, but there's a problem: if the quiet bit
 490  * is the only one set, then if we toggle it, the fraction will become zero
 491  * and we'll be left with an infinity.  Follow MIPS and set all of the bits.
 492  *
 493  * This is all internal machinery and shouldn't be relied on by applications.
 494  */
 495 #if defined(__hppa__) || (defined(__mips__) && !defined(__mips_nan2008))
 496 #  define FLTFMT__MUST_FROB_NANS
 497
 498 #  define FLTFMT__FROB_NAN_F32(x_inout, rc) do {                        \
 499      uint32 *_x_inout_ = (x_inout), _x0_ = _x_inout_[0];                \
 500                                                                         \
 501      if ((_x0_&0x7f800000) != 0x7f800000 || !(_x0_&0x007fffff))         \
 502        ;                                                                \
 503      else if (_x0_&0x003fffff)                                          \
 504        _x_inout_[0] = _x0_ ^ 0x00400000;                                \
 505      else {                                                             \
 506        _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff;                   \
 507        (rc) |= FLTERR_INEXACT;                                          \
 508      }                                                                  \
 509    } while (0)
 510
 511 #  define FLTFMT__FROB_NAN_F64(x_inout, rc) do {                        \
 512      uint32 *_x_inout_ = (x_inout),                                     \
 513        _x0_ = _x_inout_[0], _x1_ = _x_inout_[1];                        \
 514                                                                         \
 515      if ((_x0_&0x7ff00000) != 0x7ff00000 || (!(_x0_&0x000fffff) && !_x1_)) \
 516        ;                                                                \
 517      else if ((_x0_&0x0007ffff) || _x1_)                                \
 518        _x_inout_[0] = _x0_ ^ 0x00080000;                                \
 519      else {                                                             \
 520        _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff;                   \
 521        _x_inout_[1] = 0xffffffff;                                       \
 522        (rc) |= FLTERR_INEXACT;                                          \
 523      }                                                                  \
 524    } while (0)
 525
 526 #  define FLTFMT__FROB_NAN_F128(x_inout, rc) do {                       \
 527      uint32 *_x_inout_ = (x_inout),                                     \
 528        _x0_ = _x_inout_[0], _x1_ = _x_inout_[1],                        \
 529        _x2_ = _x_inout_[2], _x3_ = _x_inout_[3];                        \
 530                                                                         \
 531      if ((_x0_&0x7fff0000) != 0x7fff0000 ||                             \
 532          (!(_x0_&0x000fffff) && !_x1_ && !_x2_ && !_x3_))               \
 533        ;                                                                \
 534      else if ((_x0_&0x00007fff) || _x1_ || _x2_ || _x3_)                \
 535        _x_inout_[0] = _x0_ ^ 0x00008000;                                \
 536      else {                                                             \
 537        _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff;                   \
 538        _x_inout_[1] = _x_inout_[2] = _x_inout_[3] = 0xffffffff;         \
 539        (rc) |= FLTERR_INEXACT;                                          \
 540      }                                                                  \
 541    } while (0)
 542
 543 #  define FLTFMT__FROB_NAN_IDBLEXT80(x_inout, rc) do {                  \
 544      uint32 *_x_inout_ = (x_inout),                                     \
 545        _x0_ = _x_inout_[0], _x1_ = _x_inout_[1], _x2_ = _x_inout_[2];   \
 546                                                                         \
 547      if ((_x0_&0x00007fff) != 0x00007fff || (!(_x1_&0x7fffffff) && !_x2_)) \
 548        ;                                                                \
 549      else if ((_x1_&0x3fffffff) || _x1_ || _x2_)                        \
 550        _x_inout_[1] = _x1_ ^ 0x40000000;                                \
 551      else {                                                             \
 552        _x_inout_[1] = (_x1_&0x80000000) | 0x3fffffff; /* preserve unit */ \
 553        _x_inout_[2] = 0xffffffff;                                       \
 554      }                                                                  \
 555    } while (0)
 556
 557 #else
 558 #  define FLTFMT__FROB_NAN_F32(x_inout, rc) do ; while (0)
 559 #  define FLTFMT__FROB_NAN_F64(x_inout, rc) do ; while (0)
 560 #  define FLTFMT__FROB_NAN_F128(x_inout, rc) do ; while (0)
 561 #  define FLTFMT__FROB_NAN_IDBLEXT80(x_inout, rc) do ; while (0)
 562 #endif
 563
 564 /* --- @fltfmt_encTY@ --- *
 565  *
 566  * Arguments:   @ty *z_out@ = storage for the result
 567  *              @const struct floatbits *x@ = value to encode
 568  *              @unsigned r@ = rounding mode
 569  *
 570  * Returns:     Error flags (@FLTERR_...@).
 571  *
 572  * Use:         Encode the floating-point value @x@ as a native C object and
 573  *              store the result in @z_out@.
 574  *
 575  *              The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
 576  *              @double@, or (on C99 implementations) @ldbl@ to encode a
 577  *              @long double@.
 578  *
 579  *              In detail, conversion is performed as follows.
 580  *
 581  *                * If a non-finite value cannot be represented by the
 582  *                  implementation then the @FLTERR_REPR@ error bit is set
 583  *                  and @*z_out@ is set to zero if @x@ is a NaN, or the
 584  *                  (absolute) largest representable value, with appropriate
 585  *                  sign, if @x@ is an infinity.
 586  *
 587  *                * If the implementation can represent NaNs, but cannot set
 588  *                  NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
 589  *                  and @*z_out@ is set to an arbitrary (quiet) NaN value.
 590  *
 591  *                * If @x@ is negative zero, but the implementation does not
 592  *                  distinguish negative and positive zero, then the
 593  *                  @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
 594  *                  zero.
 595  *
 596  *                * If the implementation's floating-point radix is not a
 597  *                  power of two, and @x@ is a nonzero finite value, then
 598  *                  @FLTERR_INEXACT@ error bit is set (unconditionally), and
 599  *                  the value is rounded by the implementation using its
 600  *                  prevailing rounding policy.  If the radix is a power of
 601  *                  two, then the @FLTERR_INEXACT@ error bit is set only if
 602  *                  rounding is necessary, and rounding is performed using
 603  *                  the rounding mode @r@.
 604  */
 605
 606 extern unsigned fltfmt_encflt(float */*z_out*/,
 607                               const struct floatbits */*x*/,
 608                               unsigned /*r*/);
 609
 610 extern unsigned fltfmt_encdbl(double */*z_out*/,
 611                               const struct floatbits */*x*/,
 612                               unsigned /*r*/);
 613
 614 #if __STDC_VERSION__ >= 199001
 615 extern unsigned fltfmt_encldbl(long double */*z_out*/,
 616                                const struct floatbits */*x*/,
 617                                unsigned /*r*/);
 618 #endif
 619
 620 /* --- @fltfmt_decTY@ --- *
 621  *
 622  * Arguments:   @struct floatbits *z_out@ = storage for the result
 623  *              @ty x@ = value to decode
 624  *              @unsigned r@ = rounding mode
 625  *
 626  * Returns:     Error flags (@FLTERR_...@).
 627  *
 628  * Use:         Decode the native C floatingpoint value @x@ and store the
 629  *              result in @z_out@.
 630  *
 631  *              The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
 632  *              @double@, or (on C99 implementations) @ldbl@ to encode a
 633  *              @long double@.
 634  *
 635  *              In detail, conversion is performed as follows.
 636  *
 637  *                * If the implementation supports negative zeros and/or
 638  *                  infinity, then these are recognized and decoded.
 639  *
 640  *                * If the input as a NaN, but the implementation cannot
 641  *                  usefully report NaN payloads, then the @FLTERR_INEXACT@
 642  *                  error bit is set and the decoded payload is left empty.
 643  *
 644  *                * If the implementation's floating-point radix is not a
 645  *                  power of two, and @x@ is a nonzero finite value, then
 646  *                  @FLTERR_INEXACT@ error bit is set (unconditionally), and
 647  *                  the rounded value (according to the rounding mode @r@) is
 648  *                  stored in as many fraction words as necessary to identify
 649  *                  the original value uniquely.  If the radix is a power of
 650  *                  two, then the value is represented exactly.
 651  */
 652
 653 extern unsigned fltfmt_decflt(struct floatbits */*z_out*/,
 654                               float /*x*/, unsigned /*r*/);
 655
 656 extern unsigned fltfmt_decdbl(struct floatbits */*z_out*/,
 657                               double /*x*/, unsigned /*r*/);
 658
 659 #if __STDC_VERSION__ >= 199001
 660 extern unsigned fltfmt_decldbl(struct floatbits */*z_out*/,
 661                                long double /*x*/, unsigned /*r*/);
 662 #endif
 663
 664 /*----- Some common conversions packaged up -------------------------------*/
 665
 666 /* --- @fltfmt_CTYtoFTYE@ --- *
 667  *
 668  * Arguments:   @octet *p@ = output pointer
 669  *              @float x@, @double x@ = value to convert
 670  *              @unsigned r@ = rounding mode
 671  *
 672  * Returns:     Error flags (@FLTERR_...@).
 673  *
 674  * Use:         Encode a native C floating-point value in an external format.
 675  *
 676  *              The @CTY@ is an abbreviation for a C type: @flt@ for @float@,
 677  *              or @dbl@ for @double@; @FTY@ is an abbreviation for the
 678  *              external format: @f32@ for IEEE Binary32, or @f64@ for IEEE
 679  *              Binary64; and @E@ is @l@ for little-endian or @b@ for
 680  *              big-endian byte order.  Not all combinations are currently
 681  *              supported.
 682  *
 683  *              On platforms where the external format is used natively,
 684  *              these functions are simple data copies.
 685  */
 686
 687 extern unsigned fltfmt_flttof32l(octet */*p*/, float /*x*/, unsigned /*r*/);
 688 extern unsigned fltfmt_flttof32b(octet */*p*/, float /*x*/, unsigned /*r*/);
 689 extern unsigned fltfmt_dbltof64l(octet */*p*/, double /*x*/, unsigned /*r*/);
 690 extern unsigned fltfmt_dbltof64b(octet */*p*/, double /*x*/, unsigned /*r*/);
 691
 692 /* --- @fltfmt_FTYEtoCTY@ --- *
 693  *
 694  * Arguments:   @float *z_out@, @double *z_out@ = storage for output
 695  *              @const octet *p@ = input pointer
 696  *              @unsigned r@ = rounding mode
 697  *
 698  * Returns:     Error flags (@FLTERR_...@).
 699  *
 700  * Use:         Decode a floating-point value in an external format into a
 701  *              native C value, storing the result in @*z_out@.
 702  *
 703  *              The naming conventions (@FTY@, @E@, and @CTY@) are the same
 704  *              as for @fltfmt_dbltof64b@ and friends above.
 705  *
 706  *              On platforms where the external format is used natively,
 707  *              these functions are simple data copies.
 708  */
 709
 710 extern unsigned fltfmt_f32ltoflt(float */*z_out*/, const octet */*p*/,
 711                                  unsigned /*r*/);
 712 extern unsigned fltfmt_f32btoflt(float */*z_out*/, const octet */*p*/,
 713                                  unsigned /*r*/);
 714 extern unsigned fltfmt_f64ltodbl(double */*z_out*/, const octet */*p*/,
 715                                  unsigned /*r*/);
 716 extern unsigned fltfmt_f64btodbl(double */*z_out*/, const octet */*p*/,
 717                                  unsigned /*r*/);
 718
 719 /*----- That's all, folks -------------------------------------------------*/
 720
 721 #ifdef __cplusplus
 722   }
 723 #endif
 724
 725 #endif