--- /dev/null
+/* -*-c-*-
+ *
+ * Floating-point format conversions
+ *
+ * (c) 2024 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of the mLib utilities library.
+ *
+ * mLib is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Library General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * mLib is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with mLib. If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include "config.h"
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "alloc.h"
+#include "arena.h"
+#include "bits.h"
+#include "fltfmt.h"
+#include "growbuf.h"
+#include "macros.h"
+#include "maths.h"
+
+#if GCC_VERSION_P(4, 4)
+# pragma GCC optimize "-frounding-math"
+#elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0)
+# pragma clang optimize "-frounding-math"
+#elif GCC_VERSION_P(0, 0) || \
+ (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0))
+ /* We just lose. */
+#elif __STDC_VERSION__ >= 199001
+# include <fenv.h>
+# pragma STDC FENV_ACCESS ON
+#endif
+
+/*----- Some useful constants ---------------------------------------------*/
+
+#define B31 0x80000000 /* just bit 31 */
+#define B30 0x40000000 /* just bit 30 */
+
+#define SH32 4294967296.0 /* 2^32 as floating-point */
+
+/*----- Useful macros -----------------------------------------------------*/
+
+#define B32(k) ((uint32)1 << (k))
+#define M32(k) (B32(k) - 1)
+
+#define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO)))
+
+/*----- Utility functions -------------------------------------------------*/
+
+/* --- @clz32@ --- *
+ *
+ * Arguments: @uint32 x@ = a 32-bit value
+ *
+ * Returns: The number of leading zeros in @x@, i.e., the nonnegative
+ * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%.
+ * Returns a nonsensical value if %$x = 0$%.
+ */
+
+static unsigned clz32(uint32 x)
+{
+ unsigned n = 0;
+
+ /* Divide and conquer. If the top half of the bits are clear, then there
+ * must be at least 16 leading zero bits, so accumulate and shift. Repeat
+ * for smaller powers of two.
+ *
+ * This ends up returning 31 if %$x = 0$%, but don't rely on this.
+ */
+ if (!(x&0xffff0000)) { x <<= 16; n += 16; }
+ if (!(x&0xff000000)) { x <<= 8; n += 8; }
+ if (!(x&0xf0000000)) { x <<= 4; n += 4; }
+ if (!(x&0xc0000000)) { x <<= 2; n += 2; }
+ if (!(x&0x80000000)) { n += 1; }
+ return (n);
+}
+
+/* --- @ctz32@ --- *
+ *
+ * Arguments: @uint32 x@ = a 32-bit value
+ *
+ * Returns: The number of trailing zeros in @x@, i.e., the nonnegative
+ * integer %$n$% such that %$x/2^n$% is an odd integer.
+ * Returns a nonsensical value if %$x = 0$%.
+ */
+
+static unsigned ctz32(uint32 x)
+{
+#ifdef CTZ_TRADITIONAL
+
+ unsigned n = 0;
+
+ /* Divide and conquer. If the bottom half of the bits are clear, then
+ * there must be at least 16 trailing zero bits, so accumulate and shift.
+ * Repeat for smaller powers of two.
+ *
+ * This ends up returning 31 if %$x = 0$%, but don't rely on this.
+ */
+ if (!(x&0x0000ffff)) { x >>= 16; n += 16; }
+ if (!(x&0x000000ff)) { x >>= 8; n += 8; }
+ if (!(x&0x0000000f)) { x >>= 4; n += 4; }
+ if (!(x&0x00000003)) { x >>= 2; n += 2; }
+ if (!(x&0x00000001)) { n += 1; }
+ return (n);
+
+#else
+
+ static unsigned char tab[] =
+ /*
+ ;;; Compute the decoding table for the de Bruijn sequence trick below.
+
+ (let ((db #x04653adf)
+ (rv (make-vector 32 nil)))
+ (dotimes (i 32)
+ (aset rv (logand (ash db (- i 27)) #x1f) i))
+ (save-excursion
+ (goto-char (point-min))
+ (search-forward (concat "***" "BEGIN ctz32tab" "***"))
+ (beginning-of-line 2)
+ (delete-region (point)
+ (progn
+ (search-forward "***END***")
+ (beginning-of-line)
+ (point)))
+ (dotimes (i 32)
+ (cond ((zerop i) (insert " { "))
+ ((zerop (mod i 16)) (insert ",\n "))
+ ((zerop (mod i 4)) (insert ", "))
+ (t (insert ", ")))
+ (insert (format "%2d" (aref rv i))))
+ (insert " };\n")))
+ */
+
+ /* ***BEGIN ctz32tab*** */
+ { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26,
+ 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 };
+ /* ***END*** */
+
+ /* Sneaky trick. Two's complement negation (which you effectively get
+ * using C unsigned arithmetic, whether you like it or not) complements all
+ * of the bits of an operand more significant than the least significant
+ * set bit. Therefore, this bit is the only one set in both %$x$% and
+ * %$-x$%, so @x&-x@ will isolate it for us.
+ *
+ * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every
+ * group of five consecutive bits is distinct, including groups which `wrap
+ * around', including some low bits and some high bits. Multiplying this
+ * number by a power of two is equivalent to a left shift; and, because the
+ * top five bits are all zero, the most significant five bits of the
+ * product are the same as if we'd applied a rotation. The result is that
+ * we end up with a distinctive pattern in those bits which perfectly
+ * diagnose each shift from 0 up to 31, which we can decode using a table.
+ *
+ * David Seal described a similar trick -- using the six-bit pattern
+ * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994;
+ * this constant was particularly convenient to multiply by on early ARM
+ * processors. The use of a de Bruijn number is described in Henry
+ * Warren's %%\emph{Hacker's Delight}%%.
+ */
+ return (tab[((x&-x)*0x04653adf >> 27)&0x1f]);
+
+#endif
+}
+
+/* --- @shl@, @shr@ --- *
+ *
+ * Arguments: @uint32 *z@ = destination vector
+ * @const uint32 *x@ = source vector
+ * @size_t sz@ = size of source vector, in elements
+ * @unsigned n@ = number of bits to shift by; must be less than
+ * 32
+ *
+ * Returns: The bits shifted off the end of the vector.
+ *
+ * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@)
+ * by some number of bit positions. These functions work
+ * correctly if @z@ and @x@ are the same pointer, but not if
+ * they otherwise overlap.
+ */
+
+static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n)
+{
+ size_t i;
+ uint32 t, u;
+ unsigned r;
+
+ if (!n) {
+ for (i = 0; i < sz; i++) z[i] = x[i];
+ t = 0;
+ } else {
+ r = 32 - n;
+ for (t = 0, i = sz; i--; )
+ { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; }
+ }
+ return (t);
+}
+
+static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n)
+{
+ size_t i;
+ uint32 t, u;
+ unsigned r;
+
+ if (!n) {
+ for (i = 0; i < sz; i++) z[i] = x[i];
+ t = 0;
+ } else {
+ r = 32 - n;
+ for (t = 0, i = 0; i < sz; i++)
+ { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; }
+ }
+ return (t);
+}
+
+/* --- @sigbits@ --- *
+ *
+ * Arguments: @const struct floatbits *x@ = decoded floating-point number
+ *
+ * Returns: The number of significant digits in @x@'s fraction. This
+ * will be zero if @x@ is zero or infinite.
+ */
+
+static unsigned sigbits(const struct floatbits *x)
+{
+ unsigned i;
+ uint32 w;
+
+ if (x->f&(FLTF_ZERO | FLTF_INF)) return (0);
+ i = x->n;
+ for (;;) {
+ if (!i) return (0);
+ w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w));
+ }
+}
+
+/* --- @ms_set_bit@ --- *
+ *
+ * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the
+ * buffer
+ * @unsigned from, to@ = lower (inclusive) and upper (exclusive)
+ * bounds on the region of bits to inspect
+ *
+ * Returns: Index of the most significant set bit, or @ALLCLEAR@.
+ *
+ * Use: For the (rather unusual) purposes of this function, the bits
+ * of the input are numbered from zero, being the least
+ * significant bit of @x[-1]@, upwards through more significant
+ * bits of @x[-1]@, and then through @x[-2]@ and so on.
+ *
+ * If all of the bits in the half-open region are clear then
+ * @ALLCLEAR@ is returned; otherwise, the return value is the
+ * index of the most significant set bit in the region. Note
+ * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the
+ * largest possible value of @to@, and the upper bound is
+ * exclusive, this cannot be the index of a bit in the region.
+ */
+
+#define ALLCLEAR UINT_MAX
+static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to)
+{
+ unsigned n0, n, b, base;
+ uint32 m, w;
+
+ /* <--- increasing indices <---
+ *
+ * ---+-------+-------+-------+-------+-------+---
+ * ...S |///| | | | | |//| S...
+ * ---+-------+-------+-------+-------+-------+---
+ */
+
+ /* If the region is empty then it's technically true that all of the bits
+ * are zero. It's important to be able to do answer the case where
+ * %$\id{from} = \id{to} = 0$% without accessing memory.
+ */
+ assert(to >= from); if (to == from) return (ALLCLEAR);
+
+ /* This is distressingly complicated. Here's how it's going to work.
+ *
+ * There's at least one bit to check, or we'd have returned already --
+ * specifically, we must check the bit with index @from@. But that's at
+ * the wrong end. Instead, we start at the most significant end, with the
+ * word containing the bit one short of the @to@ position. Even though
+ * it's actually one off, because we use a half-open interval, we'll call
+ * that the `@to@ bit'.
+ *
+ * We start by loading the word containing the @to@ bit, and start @base@
+ * off as the bit index of the least significant bit of this word. We mask
+ * off the high bits (if any), leaving only the @to@ bit and the less
+ * significant ones. We %%\emph{don't}%% check the remaining bits yet.
+ *
+ * We then start an offset loop. In each iteration, we check the word
+ * we're currently holding: if it's not zero, then we return @base@ plus
+ * the position of the most-significant set bit, using @clz32@. Otherwise,
+ * we load the next (less significant) word, and drop @base@ by 32, but
+ * don't check it yet. We end this loop when the word we're holding
+ * contains the @from@ bit. It's possible that we didn't do any iterations
+ * of the loop, in which case we're still holding the word containing the
+ * @to@ bit at this point.
+ *
+ * Finally, we mask off the bits below the @from@ bit, and check that what
+ * we have left is zero. If it isn't, we return @base@ plus the position
+ * of the most significant bit; if it is, we return @ALLCEAR@.
+ */
+
+ /* The first job is to find the word containing the @to@ bit and mask off
+ * any higher bits that we don't care about.
+ *
+ * Recall that the bit's index is @to - 1@, but this must be a valid index
+ * because there is at least one bit in the region. But we start out
+ * pointing beyond the vector, so we must add an extra 32 bits.
+ */
+ n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++;
+ b = to%32; if (b) w &= M32(b);
+
+ /* Before we start the loop, it'd be useful to know how many iterations we
+ * need. This is going to be the offset from the word containing the @to@
+ * bit to the word containing the @from@ bit. Again, we're off by one
+ * because that's how our initial indexing is set up.
+ */
+ n = n0 - from/32 - 1;
+
+ /* Now it's time to do the loop. This is the easy bit. */
+ while (n--) {
+ if (w) return (base + 31 - clz32(w));
+ w = *x++&MASK32; base -= 32;
+ }
+
+ /* We're now holding the final word -- the one containing the @from@ bit.
+ * We need to mask off any low bits that we don't care about.
+ */
+ m = M32(from%32); w &= MASK32&~m;
+
+ /* Final check. */
+ if (w) return (base + 31 - clz32(w));
+ else return (ALLCLEAR);
+}
+
+/*----- General floating-point hacking ------------------------------------*/
+
+/* --- @fltfmt_initbits@ --- *
+ *
+ * Arguments: @struct floatbits *x@ = pointer to structure to initialize
+ *
+ * Returns: ---
+ *
+ * Use: Dynamically initialize @x@ to (positive) zero so that it can
+ * be used as the destination operand by other operations. This
+ * doesn't allocate resources and cannot fail. The
+ * @FLOATBITS_INIT@ macro is a suitable static initializer for
+ * performing the same task.
+ */
+
+void fltfmt_initbits(struct floatbits *x)
+{
+ x->f = FLTF_ZERO;
+ x->a = arena_global;
+ x->frac = 0; x->n = x->fracsz = 0;
+}
+
+/* --- @fltfmt_freebits@ --- *
+ *
+ * Arguments: @struct floatbits *x@ = pointer to structure to free
+ *
+ * Returns: ---
+ *
+ * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid
+ * (positive) zero, but can safely be discarded.
+ */
+
+void fltfmt_freebits(struct floatbits *x)
+{
+ if (x->frac) x_free(x->a, x->frac);
+ x->f = FLTF_ZERO;
+ x->frac = 0; x->n = x->fracsz = 0;
+}
+
+/* --- @fltfmt_allocfrac@ --- *
+ *
+ * Arguments: @struct floatbits *x@ = structure to adjust
+ * @unsigned n@ = number of words required
+ *
+ * Returns: ---
+ *
+ * Use: Reallocate the @frac@ vector so that it has space for at
+ * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the
+ * current size is already @n@ or greater, then just update the
+ * active length @n@ and return; otherwise, any existing vector
+ * is discarded and a fresh, larger one allocated.
+ */
+
+void fltfmt_allocfrac(struct floatbits *x, unsigned n)
+ { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; }
+
+/* --- @fltfmt_copybits@ --- *
+ *
+ * Arguments: @struct floatbits *z_out@ = where to leave the result
+ * @const struct floatbits *x@ = source to copy
+ *
+ * Returns: ---
+ *
+ * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object
+ * as @x@ then do nothing.
+ */
+
+void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x)
+{
+ unsigned i;
+
+ if (z_out == x) return;
+ z_out->f = x->f;
+ if (!FINITEP(x)) z_out->exp = 0;
+ else z_out->exp = x->exp;
+ if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n)
+ { z_out->n = 0; z_out->frac = 0; }
+ else {
+ fltfmt_allocfrac(z_out, x->n);
+ for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i];
+ }
+}
+
+/* --- @fltfmt_round@ --- *
+ *
+ * Arguments: @struct floatbits *z_out@ = destination (may equal source)
+ * @const struct floatbits *x@ = source
+ * @unsigned r@ = rounding mode (@FLTRND_...@ code)
+ * @unsigned n@ = nonzero number of bits to leave
+ *
+ * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
+ * rounding discarded some nonzero value bits, or @FLTERR_OK@ if
+ * rounding was unnecessary.
+ *
+ * Use: Rounds a floating-point value to a given number of
+ * significant bits, using the given rounding rule.
+ */
+
+unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x,
+ unsigned r, unsigned n)
+{
+ unsigned rf, i, uw, ub, hw, hb, rc = 0;
+ uint32 um, hm, w;
+ int exp;
+
+ /* Check that this is likely to work. We must have at least one bit
+ * remaining, so that we can inspect the last-place unit bit. And we
+ * mustn't round up if the current value is already exact, because that
+ * would be nonsensical (and inconvenient).
+ */
+ assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF)));
+
+ /* Eliminate trivial cases. There's nothing to do if the value is infinite
+ * or zero, or if we don't have enough precision already.
+ *
+ * The caller will have set the rounding mode and length suitably for a
+ * NaN.
+ */
+ if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n)
+ { fltfmt_copybits(z_out, x); return (FLTERR_OK); }
+
+ /* Determine various indices.
+ *
+ * The quantities @uw@ and @ub@ are the word and bit number which will hold
+ * the unit bit when we've finished; @hw@ and @hb@ similarly index the
+ * `half' bit, which is the next less significant bit.
+ */
+ uw = (n - 1)/32; ub = -n&31;
+ if (!ub) { hw = uw + 1; hb = 31; }
+ else { hw = uw; hb = ub - 1; }
+
+ /* Determine the necessary predicates for the rounding decision. */
+ rf = 0;
+ if (x->f&FLTF_NEG) rf |= FRPF_NEG;
+ um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD;
+ hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF;
+ if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW;
+ for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW;
+ if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT;
+
+ /* Allocate space for the result. */
+ fltfmt_allocfrac(z_out, uw + 1);
+
+ /* We start looking at the least significant word of the result. Clear the
+ * low bits.
+ */
+ i = uw; exp = x->exp; w = x->frac[i]&~(um - 1);
+
+ /* If the rounding function is true then we need to add one to the
+ * truncated fraction and propagate carries.
+ */
+ if ((r >> rf)&1) {
+ w = (w + um)&MASK32;
+ while (i && !w) {
+ z_out->frac[i] = w;
+ w = (x->frac[--i] + 1)&MASK32;
+ }
+ if (!w) { w = B31; exp++; }
+ }
+
+ /* Store, and copy the remaining words. */
+ for (;;) {
+ z_out->frac[i] = w;
+ if (!i) break;
+ w = x->frac[--i];
+ }
+
+ /* Done. */
+ z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK);
+ if (x->f&FLTF_NANMASK) z_out->exp = 0;
+ else z_out->exp = exp;
+ return (rc);
+}
+
+/*----- IEEE formats ------------------------------------------------------*/
+
+/* IEEE (and related) format descriptions. */
+const struct fltfmt_ieeefmt
+ fltfmt_mini = { IEEEF_HIDDEN, 4, 4 },
+ fltfmt_bf16 = { IEEEF_HIDDEN, 8, 8 },
+ fltfmt_f16 = { IEEEF_HIDDEN, 5, 11 },
+ fltfmt_f32 = { IEEEF_HIDDEN, 8, 24 },
+ fltfmt_f64 = { IEEEF_HIDDEN, 11, 53 },
+ fltfmt_f128 = { IEEEF_HIDDEN, 15, 113 },
+ fltfmt_idblext80 = { 0, 15, 64 };
+
+/* --- @fltfmt_encieee@ ---
+ *
+ * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
+ * @uint32 *z@ = output vector
+ * @const struct floatbits *x@ = value to encode
+ * @unsigned r@ = rounding mode
+ * @unsigned errmask@ = error mask
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Encode a floating-point value in an IEEE format. This is the
+ * machinery shared by the @fltfmt_enc...@ functions for
+ * encoding IEEE-format values. Most of the arguments and
+ * behaviour are as described for those functions.
+ *
+ * The encoded value is right-aligned and big-endian; i.e., the
+ * sign bit ends up in @z[0]@, and the least significant bit of
+ * the significand ends up in the least significant bit of
+ * @z[n - 1]@.
+ */
+
+unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt,
+ uint32 *z, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ struct floatbits y;
+ unsigned sigwd, fracwd, err = 0, f = x->f, rf;
+ unsigned i, j, n, nb, nw, mb, mw, esh, sh;
+ int exp, minexp, maxexp;
+ uint32 z0, t;
+
+#define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0)
+
+ /* The following code assumes that the sign, biased exponent, unit, and
+ * quiet/signalling bits can all fit into the most significant 32 bits of
+ * the result.
+ */
+ assert(fmt->expwd + 3 <= 32);
+ esh = 31 - fmt->expwd;
+
+ /* Determine the output size. */
+ nb = fmt->prec + fmt->expwd + 1;
+ if (fmt->f&IEEEF_HIDDEN) nb--;
+ nw = (nb + 31)/32;
+
+ /* Determine the top bits. */
+ z0 = 0; i = 0;
+ if (x->f&FLTF_NEG) z0 |= B31;
+
+ /* And now for the main case analysis. */
+
+ if (f&FLTF_ZERO) {
+ /* Zero. There's very little to do here. */
+
+ } else if (f&FLTF_INF) {
+ /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit
+ * bit.
+ */
+
+ z0 |= M32(fmt->expwd) << esh;
+ if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
+
+ } else if (f&FLTF_NANMASK) {
+ /* Not-a-number.
+ *
+ * We must check that we won't lose significant bits. We need a bit for
+ * the quiet/signalling flag, and enough space for the significant
+ * payload bits. The unit bit is not in play here, so the available
+ * space is always one less than the advertised precision. To put it
+ * another way, we need space for the payload, a bit for the
+ * quiet/signalling flag, and a bit for the unit place.
+ */
+
+ fracwd = sigbits(x);
+ if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT);
+
+ /* Copy the payload.
+ *
+ * If the payload is all-zero and we're meant to set a signalling NaN
+ * then report an exactness failure and set the low bit.
+ */
+ mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32;
+ for (i = 0; i < nw - mw; i++) z[i] = 0;
+ n = x->n; if (n > mw) n = nw;
+ t = shr(z + i, x->frac, n, sh); i += n;
+ if (i < nw) z[i++] = t;
+ sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
+ if (f&FLTF_QNAN) z0 |= B32(sh);
+ else if (!fracwd) { ERR(FLTERR_INEXACT); z[nw - 1] |= 1; }
+
+ /* Set the exponent and, for non-hidden-bit formats, the unit bit. */
+ z0 |= M32(fmt->expwd) << esh;
+ if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
+
+ } else {
+ /* A finite value.
+ *
+ * Adjust the exponent by one place to compensate for the difference in
+ * significant conventions. Our significand lies between zero (in fact,
+ * a half, because we require normalization) and one, while an IEEE
+ * significand lies between zero (in fact, one) and two. Our exponent is
+ * therefore one larger than the IEEE exponent will be.
+ */
+
+ /* Determine the maximum true (unbiased) exponent. As noted above, this
+ * is also the bias.
+ */
+ exp = x->exp - 1;
+ maxexp = (1 << (fmt->expwd - 1)) - 1;
+ minexp = 1 - maxexp;
+
+ if (exp <= minexp - (int)fmt->prec) {
+ /* If the exponent is very small then we underflow. We have %$p - 1$%
+ * bits available to represent a subnormal significand, and therefore
+ * can represent at least one bit of a value as small as
+ * %$2^{e_{\text{min}}-p+1}$%.
+ *
+ * If the exponent is one short of the threshold, then we check to see
+ * whether the value will round up.
+ */
+
+ if ((minexp - exp == fmt->prec) &&
+ ((r >> (FRPF_HALF |
+ (sigbits(x) > 1 ? FRPF_LOW : 0) |
+ (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) {
+ ERR(FLTERR_INEXACT);
+ for (i = 0; i < nw - 1; i++) z[i] = 0;
+ z[i++] = 1;
+ } else {
+ ERR(FLTERR_UFLOW | FLTERR_INEXACT);
+ /* Return (signed) zero. */
+ }
+
+ } else {
+ /* We can at least try to store some bits. */
+
+ /* Let's see how many we need to deal with and how much space we have.
+ * We might as well set the biased exponent here while we're at it.
+ *
+ * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of
+ * significand. Otherwise, we must make a subnormal and we can only
+ * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le
+ * e_{\text{min}} - p$% then we can store zero bits or fewer and have
+ * underflowed to zero, which matches the previous case.) In the
+ * subnormal case, we also `correct' the exponent so that we store the
+ * correct sentinel value later.
+ */
+ fracwd = sigbits(x);
+ if (exp >= minexp) sigwd = fmt->prec;
+ else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; }
+ mw = (sigwd + 31)/32; sh = -sigwd%32;
+
+ /* If we don't have enough significand bits then we must round. This
+ * might increase the exponent, so we must reload.
+ */
+ if (fracwd > sigwd) {
+ ERR(FLTERR_INEXACT);
+ y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd);
+ x = &y; exp = y.exp - 1; fracwd = sigwd;
+ }
+
+ if (exp > maxexp) {
+ /* If the exponent is too large, then we overflow. If the error is
+ * masked, then we must produce a default value, choosing between
+ * infinity and the largest representable finite value according to
+ * the rounding mode.
+ */
+
+ rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
+ if (f&FLTF_NEG) rf |= FRPF_NEG;
+ if ((r >> rf)&1) {
+ ERR(FLTERR_OFLOW | FLTERR_INEXACT);
+ z0 |= M32(fmt->expwd) << esh;
+ } else {
+ ERR(FLTERR_INEXACT);
+ z0 |= (B32(fmt->expwd) - 2) << esh;
+ mb = fmt->prec; if (fmt->f&IEEEF_HIDDEN) mb--;
+ mw = (mb + 31)/32;
+ i = nw - mw;
+ z[i++] = M32(mb%32);
+ while (i < nw) z[i++] = MASK32;
+ }
+
+ } else {
+ /* The exponent is in range. Everything is ready. */
+
+ /* Store the significand. */
+ n = (fracwd + 31)/32; i = nw - mw;
+ t = shr(z + i, x->frac, n, sh); i += n;
+ if (i < nw) z[i++] = t;
+
+ /* Fill in the top end. */
+ for (j = nw - mw; j--; ) z[j] = 0;
+
+ /* Set the biased exponent. */
+ z0 |= (exp + maxexp) << esh;
+
+ /* Clear the unit bit if we're suppose to use a hidden-bit convention. */
+ if (fmt->f&IEEEF_HIDDEN) {
+ mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
+ z[nw - mw] &= ~B32(mb);
+ }
+ }
+ }
+ }
+
+ /* Clear the significand bits that we haven't set explicitly yet. */
+ while (i < nw) z[i++] = 0;
+
+ /* All that remains is to insert the top bits @z0@ in the right place.
+ * This will set the exponent, and the unit and quiet bits.
+ */
+ sh = -nb%32;
+ z[0] |= z0 >> sh;
+ if (sh && nb >= 32) z[1] |= z0 << (32 - sh);
+
+end:
+ return (err);
+
+#undef ERR
+}
+
+/* --- @fltfmt_encTY@ --- *
+ *
+ * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
+ * @kludge64 *z_out@ = where to put the encoded value
+ * @uint16 *se_out@, @kludge64 *m_out@ = where to put the
+ * encoded sign-and-exponent and significand
+ * @const struct floatbits *x@ = value to encode
+ * @unsigned r@ = rounding mode
+ * @unsigned errmask@ = error mask
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
+ * format.
+ *
+ * If an error is encountered during the encoding, and the
+ * corresponding bit of @errmask@ is clear, then processing
+ * stops immediately and the error is returned; if the bit is
+ * set, then processing continues as described below.
+ *
+ * The @TY@ may be
+ *
+ * * @mini@ for the 8-bit `1.4.3 minifloat' format, with
+ * four-bit exponent and four-bit significand, represented
+ * as a single octet;
+ *
+ * * @bf16@ for the Google `bfloat16' format, with eight-bit
+ * exponent and eight-bit significand, represented as a
+ * @uint16@;
+ *
+ * * @f16@ for the IEEE `binary16' format, with five-bit
+ * exponent and eleven-bit significand, represented as a
+ * @uint16@;
+ *
+ * * @f32@ for the IEEE `binary32' format, with eight-bit
+ * exponent and 24-bit significand, represented as a
+ * @uint32@;
+ *
+ * * @f64@ for the IEEE `binary64' format, with eleven-bit
+ * exponent and 53-bit significand, represented as a
+ * @kludge64@;
+ *
+ * * @f128@ for the IEEE `binary128' format, with fifteen-bit
+ * exponent and 113-bit significand, represented as four
+ * @uint32@ limbs, most significant first; or
+ *
+ * * @idblext80@ for the Intel 80-bit `double extended'
+ * format, with fifteen-bit exponent and 64-bit significand
+ * with no hidden bit, represented as a @uint16 se@
+ * holding the sign and exponent, and a @kludge64 m@
+ * holding the significand.
+ *
+ * Positive and negative zero and infinity are representable
+ * exactly.
+ *
+ * Following IEEE recommendations (and most implementations),
+ * the most significant fraction bit of a quiet NaN is set; this
+ * bit is clear in a signalling NaN. The most significant
+ * payload bits of a NaN, held in the top bits of @x->frac[0]@,
+ * are encoded in the output significand following the `quiet'
+ * bit. If the chosen format's significand field is too small
+ * to accommodate all of the set payload bits then the
+ * @FLTERR_INEXACT@ error bit is set and, if masked, the
+ * excess payload bits are discarded. No rounding of NaN
+ * payloads is performed.
+ *
+ * Otherwise, the input value is finite and nonzero. If the
+ * significand cannot be represented exactly then the
+ * @FLTERR_INEXACT@ error bit is set, and, if masked, the value
+ * will be rounded (internally -- the input @x@ is not changed).
+ * If the (rounded) value's exponent is too large to represent,
+ * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
+ * set and, if masked, the result is either the (absolute)
+ * largest representable finite value or infinity, with the
+ * appropriate sign, chosen according to the rounding mode. If
+ * the exponent is too small to represent, then the
+ * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
+ * if masked, the result is either the (absolute) smallest
+ * nonzero value or zero, with the appropriate sign, chosen
+ * according to the rounding mode.
+ */
+
+unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ uint32 t[1];
+ unsigned rc;
+
+ rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask);
+ if (!(rc&~errmask)) *z_out = t[0];
+ return (rc);
+}
+
+unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ uint32 t[1];
+ unsigned rc;
+
+ rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask);
+ if (!(rc&~errmask)) *z_out = t[0];
+ return (rc);
+}
+
+unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ uint32 t[1];
+ unsigned rc;
+
+ rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask);
+ if (!(rc&~errmask)) *z_out = t[0];
+ return (rc);
+}
+
+unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+ { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); }
+
+unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ uint32 t[2];
+ unsigned rc;
+
+ rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask);
+ if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]);
+ return (rc);
+}
+
+unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x,
+ unsigned r, unsigned errmask)
+ { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); }
+
+unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out,
+ const struct floatbits *x,
+ unsigned r, unsigned errmask)
+{
+ uint32 t[3];
+ unsigned rc;
+
+ rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask);
+ if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); }
+ return (rc);
+}
+
+/* --- @fltfmt_decieee@ --- *
+ *
+ * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
+ * @struct floatbits *z_out@ = output decoded representation
+ * @const uint32 *x@ = input encoding
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Decode a floating-point value in an IEEE format. This is the
+ * machinery shared by the @fltfmt_dec...@ functions for
+ * deccoding IEEE-format values. Most of the arguments and
+ * behaviour are as described for those functions.
+ *
+ * The encoded value should be right-aligned and big-endian;
+ * i.e., the sign bit ends up in @z[0]@, and the least
+ * significant bit of the significand ends up in the least
+ * significant bit of @z[n - 1]@.
+ */
+
+unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt,
+ struct floatbits *z_out, const uint32 *x)
+{
+ unsigned sigwd, err = 0, f = 0;
+ unsigned i, nb, nw, mw, esh, sh;
+ int exp, minexp, maxexp;
+ uint32 x0, t, u, emask;
+
+ /* The following code assumes that the sign, biased exponent, unit, and
+ * quiet/signalling bits can all fit into the most significant 32 bits of
+ * the result.
+ */
+ assert(fmt->expwd + 3 <= 32);
+ esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
+ sigwd = fmt->prec; if (fmt->f&IEEEF_HIDDEN) sigwd--;
+
+ /* Determine the input size. */
+ nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
+
+ /* Extract the sign, exponent, and top of the significand. */
+ sh = -nb%32;
+ x0 = x[0] << sh;
+ if (sh && nb >= 32) x0 |= x[1] >> (32 - sh);
+ if (x0&B31) f |= FLTF_NEG;
+ t = (x0 >> esh)&emask;
+
+ /* Time for a case analysis. */
+
+ if (t == emask) {
+ /* Infinity or NaN.
+ *
+ * Note that we don't include the quiet bit in our decoded payload.
+ */
+
+ if (!(fmt->f&IEEEF_HIDDEN)) {
+ /* No hidden bit, so we expect the unit bit to be set. If it isn't,
+ * that's technically invalid, and its absence won't survive a round
+ * trip, since the bit isn't considered part of a NaN payload -- or
+ * even to distinguish a NaN from an infinity. In any event, reduce
+ * the notional significand size to exclude this bit from further
+ * consideration.
+ */
+
+ if (!(x0&B32(esh - 1))) err = FLTERR_INVAL;
+ sigwd--;
+ }
+
+ if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
+ f |= FLTF_INF;
+ else {
+ sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
+ if (x0&B32(sh)) f |= FLTF_QNAN;
+ else f |= FLTF_SNAN;
+ sigwd--; mw = (sigwd + 31)/32;
+ fltfmt_allocfrac(z_out, mw);
+ shl(z_out->frac, x + nw - mw, mw, -sigwd%32);
+ }
+ goto end;
+ }
+
+ /* Determine the exponent bounds. */
+ maxexp = (1 << (fmt->expwd - 1)) - 1;
+ minexp = 1 - maxexp;
+
+ /* Dispatch. If there's a hidden bit then everything is well defined.
+ * Otherwise, we'll normalize the incoming value regardless, but report
+ * settings of the unit bit which are inconsistent with the exponent.
+ */
+ if (fmt->f&IEEEF_HIDDEN) {
+ if (!t) { exp = minexp; goto normalize; }
+ else { exp = t - maxexp; goto hidden; }
+ } else {
+ u = x0&B32(esh - 1);
+ if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; }
+ else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; }
+ goto normalize;
+ }
+
+hidden:
+ /* We have a normal real number with a hidden bit. */
+
+ mw = (sigwd + 31)/32;
+
+ if (!(sigwd%32)) {
+ /* The bits we have occupy a whole number of words, but we need to shift
+ * to make space for the unit bit.
+ */
+
+ fltfmt_allocfrac(z_out, mw + 1);
+ z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1);
+ } else {
+ fltfmt_allocfrac(z_out, mw);
+ shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32);
+ }
+ z_out->frac[0] |= B31;
+ z_out->exp = exp + 1;
+ goto end;
+
+normalize:
+ /* We have, at least potentially, a subnormal number, with no hidden
+ * bit.
+ */
+
+ i = ms_set_bit(x + nw, 0, sigwd);
+ if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; }
+ mw = i/32 + 1; sh = 32*mw - i - 1;
+ fltfmt_allocfrac(z_out, mw);
+ shl(z_out->frac, x + nw - mw, mw, sh);
+ z_out->exp = exp - fmt->prec + 2 + i;
+ goto end;
+
+end:
+ /* All done. */
+ z_out->f = f; return (err);
+}
+
+/* --- @fltfmt_decTY@ --- *
+ *
+ * Arguments: @const struct floatbits *z_out@ = storage for the result
+ * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
+ * encoded input
+ * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
+ * significand
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
+ * format.
+ *
+ * The options for @TY@ are as documented for the encoding
+ * functions above.
+ *
+ * In formats without a hidden bit -- currently only @idblext80@
+ * -- not all bit patterns are valid encodings. If the explicit
+ * unit bit is set when the exponent field is all-bits-zero, or
+ * clear when the exponent field is not all-bits-zero, then the
+ * @FLTERR_INVAL@ error bit is set. If the exponent is all-
+ * bits-set, denoting infinity or a NaN, then the unit bit is
+ * otherwise ignored -- in particular, it does not affect the
+ * NaN payload, or even whether the input encodes a NaN or
+ * infinity. Otherwise, the unit bit is considered significant,
+ * and the result is normalized as one would expect.
+ * Consequently, biased exponent values 0 and 1 are distinct
+ * only with respect to which bit patterns are considered valid,
+ * and not with respect to the set of values denoted.
+ */
+
+unsigned fltfmt_decmini(struct floatbits *z_out, octet x)
+ { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); }
+
+unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x)
+ { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); }
+
+unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x)
+ { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); }
+
+unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x)
+ { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); }
+
+unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x)
+{
+ uint32 t[2];
+
+ t[0] = HI64(x); t[1] = LO64(x);
+ return (fltfmt_decieee(&fltfmt_f64, z_out, t));
+}
+
+unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x)
+ { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); }
+
+unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m)
+{
+ uint32 t[3];
+
+ t[0] = se; t[1] = HI64(m); t[2] = LO64(m);
+ return (fltfmt_decieee(&fltfmt_idblext80, z_out, t));
+}
+
+/*----- Native formats ----------------------------------------------------*/
+
+/* If the floating-point radix is a power of two, determine how many bits
+ * there are in each digit. This isn't exhaustive, but it covers most of the
+ * bases, so to speak.
+ */
+#if FLT_RADIX == 2
+# define DIGIT_BITS 1
+#elif FLT_RADIX == 4
+# define DIGIT_BITS 2
+#elif FLT_RADIX == 8
+# define DIGIT_BITS 3
+#elif FLT_RADIX == 16
+# define DIGIT_BITS 4
+#endif
+
+/* --- @ENCFLT@ --- *
+ *
+ * Arguments: @ty@ = the C type to encode
+ * @TY@ = the uppercase prefix for macros
+ * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a
+ * power of two
+ * @unsigned &rc@ = error code to set
+ * @ty *z_out@ = storage for the result
+ * @const struct floatbits *x@ = value to convert
+ * @unsigned r@ = rounding mode
+ *
+ * Returns: ---
+ *
+ * Use: Encode a floating-point value @x@ as a native C object of
+ * type @ty@. This is the machinery shared by the
+ * @fltfmt_enc...@ functions for enccoding native-format values.
+ * Most of the behaviour is as described for those functions.
+ */
+
+/* Utilities based on conditional compilation, because we can't use
+ * %|#ifdef|% directly in macros.
+ */
+
+#ifdef NAN
+ /* The C implementation acknowledges the existence of (quiet) NaN values,
+ * but will neither let us set the payload in a useful way, nor
+ * acknowledge the existence of signalling NaNs. We have no good way to
+ * determine which NaN the @NAN@ macro produces, so report this conversion
+ * as inexact.
+ */
+
+# define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0)
+#else
+ /* This C implementation doesn't recognize NaNs. This value is totally
+ * unrepresentable, so just report the error. (Maybe it's C89 and would
+ * actually do the right thing with @0/0@. I'm not sure the requisite
+ * compile-time configuration machinery is worth the effort.)
+ */
+
+# define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0)
+#endif
+
+#ifdef INFINITY
+ /* The C implementation supports infinities. This is a simple win. */
+
+# define SETINF(TY, rc, z) \
+ do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0)
+#else
+ /* The C implementation doesn't support infinities. Return the maximum
+ * value and report it as an overflow; I think this is more useful than
+ * reporting a complete representation failure. (Maybe it's C89 and would
+ * actually do the right thing with @1/0@. Again, I'm not sure the
+ * requisite compile-time configuration machinery is worth the effort.)
+ */
+
+# define SETINF(TY, rc, z) \
+ do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0)
+#endif
+
+#ifdef DIGIT_BITS
+ /* The floating point formats use a power-of-two radix. This means that
+ * we can determine the correctly rounded value before we start building
+ * the native floating-point value.
+ */
+
+# define ENC_ROUND_DECLS struct floatbits _y;
+# define ENC_ROUND(TY, rc, x, r) do { \
+ (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \
+ (x) = &_y; \
+ } while (0)
+#else
+ /* The floating point formats use a non-power-of-two radix. This means
+ * that conversion is inherently inexact.
+ */
+
+# define ENC_ROUND_DECLS
+# define ENC_ROUND(TY, rc, x, r) \
+ do (rc) |= FLTERR_INEXACT; while (0)
+# define ENC_FIXUP(...)
+
+#endif
+
+#define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \
+ unsigned _rc = 0; \
+ \
+ /* See if the native format is one that we recognize. */ \
+ switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
+ \
+ case FLTFMT_IEEE_F32: { \
+ uint32 _t[1]; \
+ unsigned char *_z = (unsigned char *)(z_out); \
+ \
+ (rc) = fltfmt_encieee(&fltfmt_f32, _t, (x), (r), FLTERR_ALLERRS); \
+ FLTFMT__FROB_NAN_F32(_t, _rc); \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \
+ case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ } break; \
+ \
+ case FLTFMT_IEEE_F64: { \
+ uint32 _t[2]; \
+ unsigned char *_z = (unsigned char *)(z_out); \
+ (rc) = fltfmt_encieee(&fltfmt_f64, _t, (x), (r), FLTERR_ALLERRS); \
+ FLTFMT__FROB_NAN_F64(_t, _rc); \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
+ break; \
+ case FLTFMT_LE: \
+ STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \
+ break; \
+ case FLTFMT_ARME: \
+ STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ } break; \
+ \
+ case FLTFMT_IEEE_F128: { \
+ uint32 _t[4]; \
+ unsigned char *_z = (unsigned char *)(z_out); \
+ \
+ FLTFMT__FROB_NAN_F128(_t, _rc); \
+ (rc) = fltfmt_encieee(&fltfmt_f128, _t, (x), (r), FLTERR_ALLERRS); \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
+ STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \
+ break; \
+ case FLTFMT_LE: \
+ STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \
+ STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ } break; \
+ \
+ case FLTFMT_INTEL_F80: { \
+ uint32 _t[3]; \
+ unsigned char *_z = (unsigned char *)(z_out); \
+ \
+ (rc) = fltfmt_encieee(&fltfmt_idblext80, _t, (x), (r), FLTERR_ALLERRS); \
+ FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ STORE16_B(_z + 0, _t[0]); \
+ STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \
+ break; \
+ case FLTFMT_LE: \
+ STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \
+ STORE16_L(_z + 8, _t[0]); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ } break; \
+ \
+ default: { \
+ /* We must do this the hard way. */ \
+ \
+ const struct floatbits *_x = (x); \
+ ty _z; \
+ unsigned _i; \
+ ENC_ROUND_DECLS; \
+ \
+ /* Case analysis... */ \
+ if (_x->f&FLTF_NANMASK) { \
+ /* A NaN. Use the macro above. */ \
+ \
+ SETNAN(_rc, _z); \
+ if (x->f&FLTF_NEG) _z = -_z; \
+ } else if (_x->f&FLTF_INF) { \
+ /* Infinity. Use the macro. */ \
+ \
+ SETINF(TY, _rc, _z); \
+ if (_x->f&FLTF_NEG) _z = -_z; \
+ } else if (_x->f&FLTF_ZERO) { \
+ /* Zero. If we're asked for a negative zero then check that we \
+ * produced one: if not, then report an exactness failure. \
+ */ \
+ \
+ _z = 0.0; \
+ if (_x->f&FLTF_NEG) \
+ { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \
+ } else { \
+ /* A finite value. */ \
+ \
+ /* If the radix is a power of two, we can round to the correct \
+ * precision, which will save inexactness later. \
+ */ \
+ ENC_ROUND(TY, _rc, _x, (r)); \
+ \
+ /* Insert the 32-bit pieces of the fraction one at a time, \
+ * starting from the least-significant end. This minimizes the \
+ * inaccuracy from the overall approach, but it's imperfect \
+ * unless the value has already been rounded correctly. \
+ */ \
+ _z = 0.0; \
+ for (_i = _x->n, _z = 0.0; _i--; ) \
+ _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \
+ \
+ /* Negate the value if we need to. */ \
+ if (_x->f&FLTF_NEG) _z = -_z; \
+ } \
+ \
+ /* All done. */ \
+ *(z_out) = _z; \
+ } break; \
+ } \
+ \
+ /* Set the error code. */ \
+ (rc) = _rc; \
+} while (0)
+
+/* --- @fltfmt_encTY@ --- *
+ *
+ * Arguments: @ty *z_out@ = storage for the result
+ * @const struct floatbits *x@ = value to encode
+ * @unsigned r@ = rounding mode
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Encode the floating-point value @x@ as a native C object and
+ * store the result in @z_out@.
+ *
+ * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
+ * @double@, or (on C99 implementations) @ldbl@ to encode a
+ * @long double@.
+ *
+ * In detail, conversion is performed as follows.
+ *
+ * * If a non-finite value cannot be represented by the
+ * implementation then the @FLTERR_REPR@ error bit is set
+ * and @*z_out@ is set to zero if @x@ is a NaN, or the
+ * (absolute) largest representable value, with appropriate
+ * sign, if @x@ is an infinity.
+ *
+ * * If the implementation can represent NaNs, but cannot set
+ * NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
+ * and @*z_out@ is set to an arbitrary (quiet) NaN value.
+ *
+ * * If @x@ is negative zero, but the implementation does not
+ * distinguish negative and positive zero, then the
+ * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
+ * zero.
+ *
+ * * If the implementation's floating-point radix is not a
+ * power of two, and @x@ is a nonzero finite value, then
+ * @FLTERR_INEXACT@ error bit is set (unconditionally), and
+ * the value is rounded by the implementation using its
+ * prevailing rounding policy. If the radix is a power of
+ * two, then the @FLTERR_INEXACT@ error bit is set only if
+ * rounding is necessary, and rounding is performed using
+ * the rounding mode @r@.
+ */
+
+unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r)
+{
+ unsigned rc;
+
+ ENCFLT(double, FLT, ldexp, rc, z_out, x, r);
+ return (rc);
+}
+
+unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r)
+{
+ unsigned rc;
+
+ ENCFLT(double, DBL, ldexp, rc, z_out, x, r);
+ return (rc);
+}
+
+#if __STDC_VERSION__ >= 199001
+unsigned fltfmt_encldbl(long double *z_out,
+ const struct floatbits *x, unsigned r)
+{
+ unsigned rc;
+
+ ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r);
+ return (rc);
+}
+#endif
+
+/* --- @DECFLT@ --- *
+ *
+ * Arguments: @ty@ = the C type to encode
+ * @TY@ = the uppercase prefix for macros
+ * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value
+ * into a binary exponent and normalized fraction
+ * @unsigned &rc@ = error code to set
+ * @struct floatbits *z_out@ = storage for the result
+ * @ty x@ = value to convert
+ * @unsigned r@ = rounding mode
+ *
+ * Returns: ---
+ *
+ * Use: Decode a C native floating-point object. This is the
+ * machinery shared by the @fltfmt_dec...@ functions for
+ * decoding native-format values. Most of the behaviour is as
+ * described for those functions.
+ */
+
+/* Define some utilities for decoding native floating-point formats.
+ *
+ * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native
+ * digits.
+ *
+ * * @CONVFIX@ is a final fixup applied to the decoded value.
+ */
+#ifdef DIGIT_BITS
+# define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32)
+# define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0)
+#else
+# define NFRAC(TY) \
+ (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1)
+# define CONVFIX(ty, rc, z, x, n, f, r) do { \
+ ty _x_ = (x); \
+ struct floatbits *_z_ = (z); \
+ uint32 _w_; \
+ unsigned _i_, _n_ = (n), _f_; \
+ \
+ /* Round the result according to the remainder left in %$x$%. */ \
+ _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \
+ if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \
+ if (_w_&1) _f_ |= FRPF_ODD; \
+ if (_y_ >= 0.5) _f_ |= FRPF_HALF; \
+ if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \
+ if (((r) >> _f_)&1) { \
+ for (;;) { \
+ _w_ = (_w_ + 1)&MASK32; \
+ if (_w_ || !_i_) break; \
+ _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \
+ } \
+ if (!_w_) { _z_->exp++; _w_ = B31; } \
+ _z_->frac[_i_] = w; \
+ } \
+ \
+ /* The result is not exact. */ \
+ (rc) |= FLTERR_INEXACT; \
+ } while (0)
+#endif
+
+#define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \
+ unsigned _rc = 0; \
+ \
+ switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
+ \
+ case FLTFMT_IEEE_F32: { \
+ unsigned _t[1]; \
+ unsigned char *_x = (unsigned char *)&(x); \
+ \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \
+ case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ FLTFMT__FROB_NAN_F32(_t, _rc); \
+ _rc |= fltfmt_decieee(&fltfmt_f32, (z_out), _t); \
+ } break; \
+ \
+ case FLTFMT_IEEE_F64: { \
+ unsigned _t[2]; \
+ unsigned char *_x = (unsigned char *)&(x); \
+ \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
+ break; \
+ case FLTFMT_LE: \
+ _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \
+ break; \
+ case FLTFMT_ARME: \
+ _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ FLTFMT__FROB_NAN_F64(_t, _rc); \
+ _rc |= fltfmt_decieee(&fltfmt_f64, (z_out), _t); \
+ } break; \
+ \
+ case FLTFMT_IEEE_F128: { \
+ unsigned _t[4]; \
+ unsigned char *_x = (unsigned char *)&(x); \
+ \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
+ _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \
+ break; \
+ case FLTFMT_LE: \
+ _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \
+ _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ FLTFMT__FROB_NAN_F128(_t, _rc); \
+ _rc |= fltfmt_decieee(&fltfmt_f128, (z_out), _t); \
+ } break; \
+ \
+ case FLTFMT_INTEL_F80: { \
+ unsigned _t[3]; \
+ unsigned char *_x = (unsigned char *)&(x); \
+ \
+ switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
+ case FLTFMT_BE: \
+ _t[0] = LOAD16_B(_x + 0); \
+ _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \
+ break; \
+ case FLTFMT_LE: \
+ _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
+ _t[0] = LOAD16_L(_x + 8); \
+ break; \
+ default: assert(!"unimplemented byte order"); break; \
+ } \
+ FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
+ _rc |= fltfmt_decieee(&fltfmt_idblext80, (z_out), _t); \
+ } break; \
+ \
+ default: { \
+ struct floatbits *_z = (z_out); \
+ ty _x = (x), _y; \
+ unsigned _i, _n, _f = 0; \
+ uint32 _t; \
+ \
+ /* If the value looks negative then negate it and set the sign \
+ * flag. \
+ */ \
+ if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \
+ \
+ /* Now for the case analysis. Infinities and zero are \
+ * unproblematic. NaNs can't be decoded exactly using the \
+ * portable machinery. \
+ */ \
+ if (INFP(_x)) _f |= FLTF_INF; \
+ else if (_x == 0.0) _f |= FLTF_ZERO; \
+ else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \
+ else { \
+ /* A finite value. Determine the number of fraction limbs \
+ * we'll need based on the precision and radix and pull out \
+ * 32-bit chunks one at a time. This will be unproblematic \
+ * for power-of-two radices, requiring at most shifting the \
+ * significand left by a few bits, but inherently inexact (for \
+ * the most part) for others. \
+ */ \
+ \
+ _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \
+ _y = frexp(_x, &_z->exp); \
+ for (_i = 0; _i < _n; _i++) \
+ { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \
+ CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \
+ } \
+ \
+ /* Done. */ \
+ _z->f = _f; \
+ } break; \
+ } \
+ \
+ /* Set the error code. */ \
+ (rc) = _rc; \
+} while (0)
+
+/* --- @fltfmt_decTY@ --- *
+ *
+ * Arguments: @struct floatbits *z_out@ = storage for the result
+ * @ty x@ = value to decode
+ * @unsigned r@ = rounding mode
+ *
+ * Returns: Error flags (@FLTERR_...@).
+ *
+ * Use: Decode the native C floatingpoint value @x@ and store the
+ * result in @z_out@.
+ *
+ * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
+ * @double@, or (on C99 implementations) @ldbl@ to encode a
+ * @long double@.
+ *
+ * In detail, conversion is performed as follows.
+ *
+ * * If the implementation supports negative zeros and/or
+ * infinity, then these are recognized and decoded.
+ *
+ * * If the input as a NaN, but the implementation cannot
+ * usefully report NaN payloads, then the @FLTERR_INEXACT@
+ * error bit is set and the decoded payload is left empty.
+ *
+ * * If the implementation's floating-point radix is not a
+ * power of two, and @x@ is a nonzero finite value, then
+ * @FLTERR_INEXACT@ error bit is set (unconditionally), and
+ * the rounded value (according to the rounding mode @r@) is
+ * stored in as many fraction words as necessary to identify
+ * the original value uniquely. If the radix is a power of
+ * two, then the value is represented exactly.
+ */
+
+unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r)
+{
+ unsigned rc;
+
+ DECFLT(double, FLT, frexp, rc, z_out, x, r);
+ return (rc);
+}
+
+unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r)
+{
+ unsigned rc;
+
+ DECFLT(double, DBL, frexp, rc, z_out, x, r);
+ return (rc);
+}
+
+#if __STDC_VERSION__ >= 199001
+unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r)
+{
+ unsigned rc;
+
+ DECFLT(long double, LDBL, frexpl, rc, z_out, x, r);
+ return (rc);
+}
+#endif
+
+/*----- That's all, folks -------------------------------------------------*/