3 * Floating-point format conversions
5 * (c) 2024 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of the mLib utilities library.
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
28 /*----- Header files ------------------------------------------------------*/
45 #if GCC_VERSION_P(4, 4)
46 # pragma GCC optimize "-frounding-math"
47 #elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0)
48 # pragma clang optimize "-frounding-math"
49 #elif GCC_VERSION_P(0, 0) || \
50 (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0))
52 #elif __STDC_VERSION__ >= 199001
54 # pragma STDC FENV_ACCESS ON
57 /*----- Some useful constants ---------------------------------------------*/
59 #define B31 0x80000000u /* just bit 31 */
60 #define B30 0x40000000u /* just bit 30 */
62 #define SH32 4294967296.0 /* 2^32 as floating-point */
64 /*----- Useful macros -----------------------------------------------------*/
66 #define B32(k) ((uint32)1 << (k))
67 #define M32(k) (B32(k) - 1)
69 #define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO)))
71 /*----- Utility functions -------------------------------------------------*/
75 * Arguments: @uint32 x@ = a 32-bit value
77 * Returns: The number of leading zeros in @x@, i.e., the nonnegative
78 * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%.
79 * Returns a nonsensical value if %$x = 0$%.
82 static unsigned clz32(uint32 x)
86 /* Divide and conquer. If the top half of the bits are clear, then there
87 * must be at least 16 leading zero bits, so accumulate and shift. Repeat
88 * for smaller powers of two.
90 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
92 if (!(x&0xffff0000)) { x <<= 16; n += 16; }
93 if (!(x&0xff000000)) { x <<= 8; n += 8; }
94 if (!(x&0xf0000000)) { x <<= 4; n += 4; }
95 if (!(x&0xc0000000)) { x <<= 2; n += 2; }
96 if (!(x&0x80000000)) { n += 1; }
102 * Arguments: @uint32 x@ = a 32-bit value
104 * Returns: The number of trailing zeros in @x@, i.e., the nonnegative
105 * integer %$n$% such that %$x/2^n$% is an odd integer.
106 * Returns a nonsensical value if %$x = 0$%.
109 static unsigned ctz32(uint32 x)
111 #ifdef CTZ_TRADITIONAL
115 /* Divide and conquer. If the bottom half of the bits are clear, then
116 * there must be at least 16 trailing zero bits, so accumulate and shift.
117 * Repeat for smaller powers of two.
119 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
121 if (!(x&0x0000ffff)) { x >>= 16; n += 16; }
122 if (!(x&0x000000ff)) { x >>= 8; n += 8; }
123 if (!(x&0x0000000f)) { x >>= 4; n += 4; }
124 if (!(x&0x00000003)) { x >>= 2; n += 2; }
125 if (!(x&0x00000001)) { n += 1; }
130 static unsigned char tab[] =
132 ;;; Compute the decoding table for the de Bruijn sequence trick below.
134 (let ((db #x04653adf)
135 (rv (make-vector 32 nil)))
137 (aset rv (logand (ash db (- i 27)) #x1f) i))
139 (goto-char (point-min))
140 (search-forward (concat "***" "BEGIN ctz32tab" "***"))
141 (beginning-of-line 2)
142 (delete-region (point)
144 (search-forward "***END***")
148 (cond ((zerop i) (insert " { "))
149 ((zerop (mod i 16)) (insert ",\n "))
150 ((zerop (mod i 4)) (insert ", "))
152 (insert (format "%2d" (aref rv i))))
156 /* ***BEGIN ctz32tab*** */
157 { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26,
158 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 };
161 /* Sneaky trick. Two's complement negation (which you effectively get
162 * using C unsigned arithmetic, whether you like it or not) complements all
163 * of the bits of an operand more significant than the least significant
164 * set bit. Therefore, this bit is the only one set in both %$x$% and
165 * %$-x$%, so @x&-x@ will isolate it for us.
167 * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every
168 * group of five consecutive bits is distinct, including groups which `wrap
169 * around', including some low bits and some high bits. Multiplying this
170 * number by a power of two is equivalent to a left shift; and, because the
171 * top five bits are all zero, the most significant five bits of the
172 * product are the same as if we'd applied a rotation. The result is that
173 * we end up with a distinctive pattern in those bits which perfectly
174 * diagnose each shift from 0 up to 31, which we can decode using a table.
176 * David Seal described a similar trick -- using the six-bit pattern
177 * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994;
178 * this constant was particularly convenient to multiply by on early ARM
179 * processors. The use of a de Bruijn number is described in Henry
180 * Warren's %%\emph{Hacker's Delight}%%.
182 return (tab[((x&-x)*0x04653adf >> 27)&0x1f]);
187 /* --- @shl@, @shr@ --- *
189 * Arguments: @uint32 *z@ = destination vector
190 * @const uint32 *x@ = source vector
191 * @size_t sz@ = size of source vector, in elements
192 * @unsigned n@ = number of bits to shift by; must be less than
195 * Returns: The bits shifted off the end of the vector.
197 * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@)
198 * by some number of bit positions. These functions work
199 * correctly if @z@ and @x@ are the same pointer, but not if
200 * they otherwise overlap.
203 static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n)
210 for (i = 0; i < sz; i++) z[i] = x[i];
214 for (t = 0, i = sz; i--; )
215 { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; }
220 static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n)
227 for (i = 0; i < sz; i++) z[i] = x[i];
231 for (t = 0, i = 0; i < sz; i++)
232 { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; }
237 /* --- @sigbits@ --- *
239 * Arguments: @const struct floatbits *x@ = decoded floating-point number
241 * Returns: The number of significant digits in @x@'s fraction. This
242 * will be zero if @x@ is zero or infinite.
245 static unsigned sigbits(const struct floatbits *x)
250 if (x->f&(FLTF_ZERO | FLTF_INF)) return (0);
254 w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w));
258 /* --- @ms_set_bit@ --- *
260 * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the
262 * @unsigned from, to@ = lower (inclusive) and upper (exclusive)
263 * bounds on the region of bits to inspect
265 * Returns: Index of the most significant set bit, or @ALLCLEAR@.
267 * Use: For the (rather unusual) purposes of this function, the bits
268 * of the input are numbered from zero, being the least
269 * significant bit of @x[-1]@, upwards through more significant
270 * bits of @x[-1]@, and then through @x[-2]@ and so on.
272 * If all of the bits in the half-open region are clear then
273 * @ALLCLEAR@ is returned; otherwise, the return value is the
274 * index of the most significant set bit in the region. Note
275 * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the
276 * largest possible value of @to@, and the upper bound is
277 * exclusive, this cannot be the index of a bit in the region.
280 #define ALLCLEAR UINT_MAX
281 static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to)
283 unsigned n0, n, b, base;
286 /* <--- increasing indices <---
288 * ---+-------+-------+-------+-------+-------+---
289 * ...S |///| | | | | |//| S...
290 * ---+-------+-------+-------+-------+-------+---
293 /* If the region is empty then it's technically true that all of the bits
294 * are zero. It's important to be able to do answer the case where
295 * %$\id{from} = \id{to} = 0$% without accessing memory.
297 assert(to >= from); if (to == from) return (ALLCLEAR);
299 /* This is distressingly complicated. Here's how it's going to work.
301 * There's at least one bit to check, or we'd have returned already --
302 * specifically, we must check the bit with index @from@. But that's at
303 * the wrong end. Instead, we start at the most significant end, with the
304 * word containing the bit one short of the @to@ position. Even though
305 * it's actually one off, because we use a half-open interval, we'll call
306 * that the `@to@ bit'.
308 * We start by loading the word containing the @to@ bit, and start @base@
309 * off as the bit index of the least significant bit of this word. We mask
310 * off the high bits (if any), leaving only the @to@ bit and the less
311 * significant ones. We %%\emph{don't}%% check the remaining bits yet.
313 * We then start an offset loop. In each iteration, we check the word
314 * we're currently holding: if it's not zero, then we return @base@ plus
315 * the position of the most-significant set bit, using @clz32@. Otherwise,
316 * we load the next (less significant) word, and drop @base@ by 32, but
317 * don't check it yet. We end this loop when the word we're holding
318 * contains the @from@ bit. It's possible that we didn't do any iterations
319 * of the loop, in which case we're still holding the word containing the
320 * @to@ bit at this point.
322 * Finally, we mask off the bits below the @from@ bit, and check that what
323 * we have left is zero. If it isn't, we return @base@ plus the position
324 * of the most significant bit; if it is, we return @ALLCEAR@.
327 /* The first job is to find the word containing the @to@ bit and mask off
328 * any higher bits that we don't care about.
330 * Recall that the bit's index is @to - 1@, but this must be a valid index
331 * because there is at least one bit in the region. But we start out
332 * pointing beyond the vector, so we must add an extra 32 bits.
334 n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++;
335 b = to%32; if (b) w &= M32(b);
337 /* Before we start the loop, it'd be useful to know how many iterations we
338 * need. This is going to be the offset from the word containing the @to@
339 * bit to the word containing the @from@ bit. Again, we're off by one
340 * because that's how our initial indexing is set up.
342 n = n0 - from/32 - 1;
344 /* Now it's time to do the loop. This is the easy bit. */
346 if (w) return (base + 31 - clz32(w));
347 w = *x++&MASK32; base -= 32;
350 /* We're now holding the final word -- the one containing the @from@ bit.
351 * We need to mask off any low bits that we don't care about.
353 m = M32(from%32); w &= MASK32&~m;
356 if (w) return (base + 31 - clz32(w));
357 else return (ALLCLEAR);
360 /*----- General floating-point hacking ------------------------------------*/
362 /* --- @fltfmt_initbits@ --- *
364 * Arguments: @struct floatbits *x@ = pointer to structure to initialize
368 * Use: Dynamically initialize @x@ to (positive) zero so that it can
369 * be used as the destination operand by other operations. This
370 * doesn't allocate resources and cannot fail. The
371 * @FLOATBITS_INIT@ macro is a suitable static initializer for
372 * performing the same task.
375 void fltfmt_initbits(struct floatbits *x)
379 x->frac = 0; x->n = x->fracsz = 0;
382 /* --- @fltfmt_freebits@ --- *
384 * Arguments: @struct floatbits *x@ = pointer to structure to free
388 * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid
389 * (positive) zero, but can safely be discarded.
392 void fltfmt_freebits(struct floatbits *x)
394 if (x->frac) x_free(x->a, x->frac);
396 x->frac = 0; x->n = x->fracsz = 0;
399 /* --- @fltfmt_allocfrac@ --- *
401 * Arguments: @struct floatbits *x@ = structure to adjust
402 * @unsigned n@ = number of words required
406 * Use: Reallocate the @frac@ vector so that it has space for at
407 * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the
408 * current size is already @n@ or greater, then just update the
409 * active length @n@ and return; otherwise, any existing vector
410 * is discarded and a fresh, larger one allocated.
413 void fltfmt_allocfrac(struct floatbits *x, unsigned n)
414 { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; }
416 /* --- @fltfmt_copybits@ --- *
418 * Arguments: @struct floatbits *z_out@ = where to leave the result
419 * @const struct floatbits *x@ = source to copy
423 * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object
424 * as @x@ then do nothing.
427 void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x)
431 if (z_out == x) return;
433 if (!FINITEP(x)) z_out->exp = 0;
434 else z_out->exp = x->exp;
435 if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n)
436 { z_out->n = 0; z_out->frac = 0; }
438 fltfmt_allocfrac(z_out, x->n);
439 for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i];
443 /* --- @fltfmt_round@ --- *
445 * Arguments: @struct floatbits *z_out@ = destination (may equal source)
446 * @const struct floatbits *x@ = source
447 * @unsigned r@ = rounding mode (@FLTRND_...@ code)
448 * @unsigned n@ = nonzero number of bits to leave
450 * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
451 * rounding discarded some nonzero value bits, or @FLTERR_OK@ if
452 * rounding was unnecessary.
454 * Use: Rounds a floating-point value to a given number of
455 * significant bits, using the given rounding rule.
458 unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x,
459 unsigned r, unsigned n)
461 unsigned rf, i, uw, ub, hw, hb, rc = 0;
465 /* Check that this is likely to work. We must have at least one bit
466 * remaining, so that we can inspect the last-place unit bit. And we
467 * mustn't round up if the current value is already exact, because that
468 * would be nonsensical (and inconvenient).
470 assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF)));
472 /* Eliminate trivial cases. There's nothing to do if the value is infinite
473 * or zero, or if we don't have enough precision already.
475 * The caller will have set the rounding mode and length suitably for a
478 if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n)
479 { fltfmt_copybits(z_out, x); return (FLTERR_OK); }
481 /* Determine various indices.
483 * The quantities @uw@ and @ub@ are the word and bit number which will hold
484 * the unit bit when we've finished; @hw@ and @hb@ similarly index the
485 * `half' bit, which is the next less significant bit.
487 uw = (n - 1)/32; ub = -n&31;
488 if (!ub) { hw = uw + 1; hb = 31; }
489 else { hw = uw; hb = ub - 1; }
491 /* Determine the necessary predicates for the rounding decision. */
493 if (x->f&FLTF_NEG) rf |= FRPF_NEG;
494 um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD;
495 hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF;
496 if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW;
497 for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW;
498 if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT;
500 /* Allocate space for the result. */
501 fltfmt_allocfrac(z_out, uw + 1);
503 /* We start looking at the least significant word of the result. Clear the
506 i = uw; exp = x->exp; w = x->frac[i]&~(um - 1);
508 /* If the rounding function is true then we need to add one to the
509 * truncated fraction and propagate carries.
515 w = (x->frac[--i] + 1)&MASK32;
517 if (!w) { w = B31; exp++; }
520 /* Store, and copy the remaining words. */
528 z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK);
529 if (x->f&FLTF_NANMASK) z_out->exp = 0;
530 else z_out->exp = exp;
534 /*----- IEEE formats ------------------------------------------------------*/
536 /* IEEE (and related) format descriptions. */
537 const struct fltfmt_ieeefmt
538 fltfmt_mini = { FLTIF_HIDDEN, 4, 4 },
539 fltfmt_bf16 = { FLTIF_HIDDEN, 8, 8 },
540 fltfmt_f16 = { FLTIF_HIDDEN, 5, 11 },
541 fltfmt_f32 = { FLTIF_HIDDEN, 8, 24 },
542 fltfmt_f64 = { FLTIF_HIDDEN, 11, 53 },
543 fltfmt_f128 = { FLTIF_HIDDEN, 15, 113 },
544 fltfmt_idblext80 = { 0, 15, 64 };
546 /* --- @fltfmt_encieee@ ---
548 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
549 * @uint32 *z@ = output vector
550 * @const struct floatbits *x@ = value to encode
551 * @unsigned r@ = rounding mode
552 * @unsigned errmask@ = error mask
554 * Returns: Error flags (@FLTERR_...@).
556 * Use: Encode a floating-point value in an IEEE format. This is the
557 * machinery shared by the @fltfmt_enc...@ functions for
558 * encoding IEEE-format values. Most of the arguments and
559 * behaviour are as described for those functions.
561 * The encoded value is right-aligned and big-endian; i.e., the
562 * sign bit ends up in @z[0]@, and the least significant bit of
563 * the significand ends up in the least significant bit of
567 unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt,
568 uint32 *z, const struct floatbits *x,
569 unsigned r, unsigned errmask)
572 unsigned sigwd, fracwd, err = 0, f = x->f, rf;
573 unsigned i, j, n, nb, nw, mb, mw, esh, sh;
574 int exp, minexp, maxexp;
577 #define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0)
579 /* The following code assumes that the sign, biased exponent, unit, and
580 * quiet/signalling bits can all fit into the most significant 32 bits of
583 assert(fmt->expwd + 3 <= 32);
584 esh = 31 - fmt->expwd;
586 /* Determine the output size. */
587 nb = fmt->prec + fmt->expwd + 1;
588 if (fmt->f&FLTIF_HIDDEN) nb--;
591 /* Determine the top bits. */
593 if (x->f&FLTF_NEG) z0 |= B31;
595 /* And now for the main case analysis. */
598 /* Zero. There's very little to do here. */
600 } else if (f&FLTF_INF) {
601 /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit
605 z0 |= M32(fmt->expwd) << esh;
606 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
608 } else if (f&FLTF_NANMASK) {
611 * We must check that we won't lose significant bits. We need a bit for
612 * the quiet/signalling flag, and enough space for the significant
613 * payload bits. The unit bit is not in play here, so the available
614 * space is always one less than the advertised precision. To put it
615 * another way, we need space for the payload, a bit for the
616 * quiet/signalling flag, and a bit for the unit place.
620 if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT);
624 * If the payload is all-zero and we're meant to set a signalling NaN
625 * then report an exactness failure and set the low bit.
627 mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32;
628 for (i = 0; i < nw - mw; i++) z[i] = 0;
629 n = x->n; if (n > mw) n = nw;
630 t = shr(z + i, x->frac, n, sh); i += n;
631 if (i < nw) z[i++] = t;
632 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
633 if (f&FLTF_QNAN) z0 |= B32(sh);
634 else if (!fracwd) { ERR(FLTERR_INEXACT); z[nw - 1] |= 1; }
636 /* Set the exponent and, for non-hidden-bit formats, the unit bit. */
637 z0 |= M32(fmt->expwd) << esh;
638 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
643 * Adjust the exponent by one place to compensate for the difference in
644 * significant conventions. Our significand lies between zero (in fact,
645 * a half, because we require normalization) and one, while an IEEE
646 * significand lies between zero (in fact, one) and two. Our exponent is
647 * therefore one larger than the IEEE exponent will be.
650 /* Determine the maximum true (unbiased) exponent. As noted above, this
654 maxexp = (1 << (fmt->expwd - 1)) - 1;
657 if (exp <= minexp - (int)fmt->prec) {
658 /* If the exponent is very small then we underflow. We have %$p - 1$%
659 * bits available to represent a subnormal significand, and therefore
660 * can represent at least one bit of a value as small as
661 * %$2^{e_{\text{min}}-p+1}$%.
663 * If the exponent is one short of the threshold, then we check to see
664 * whether the value will round up.
667 if ((minexp - exp == fmt->prec) &&
669 (sigbits(x) > 1 ? FRPF_LOW : 0) |
670 (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) {
672 for (i = 0; i < nw - 1; i++) z[i] = 0;
675 ERR(FLTERR_UFLOW | FLTERR_INEXACT);
676 /* Return (signed) zero. */
680 /* We can at least try to store some bits. */
682 /* Let's see how many we need to deal with and how much space we have.
683 * We might as well set the biased exponent here while we're at it.
685 * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of
686 * significand. Otherwise, we must make a subnormal and we can only
687 * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le
688 * e_{\text{min}} - p$% then we can store zero bits or fewer and have
689 * underflowed to zero, which matches the previous case.) In the
690 * subnormal case, we also `correct' the exponent so that we store the
691 * correct sentinel value later.
694 if (exp >= minexp) sigwd = fmt->prec;
695 else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; }
696 mw = (sigwd + 31)/32; sh = -sigwd%32;
698 /* If we don't have enough significand bits then we must round. This
699 * might increase the exponent, so we must reload.
701 if (fracwd > sigwd) {
703 y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd);
704 x = &y; exp = y.exp - 1; fracwd = sigwd;
708 /* If the exponent is too large, then we overflow. If the error is
709 * masked, then we must produce a default value, choosing between
710 * infinity and the largest representable finite value according to
714 ERR(FLTERR_OFLOW | FLTERR_INEXACT);
715 rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
716 if (f&FLTF_NEG) rf |= FRPF_NEG;
718 z0 |= M32(fmt->expwd) << esh;
720 z0 |= (B32(fmt->expwd) - 2) << esh;
721 mb = fmt->prec; if (fmt->f&FLTIF_HIDDEN) mb--;
725 while (i < nw) z[i++] = MASK32;
729 /* The exponent is in range. Everything is ready. */
731 /* Store the significand. */
732 n = (fracwd + 31)/32; i = nw - mw;
733 t = shr(z + i, x->frac, n, sh); i += n;
734 if (i < nw) z[i++] = t;
736 /* Fill in the top end. */
737 for (j = nw - mw; j--; ) z[j] = 0;
739 /* Set the biased exponent. */
740 z0 |= (exp + maxexp) << esh;
742 /* Clear the unit bit if we're suppose to use a hidden-bit convention. */
743 if (fmt->f&FLTIF_HIDDEN) {
744 mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
745 z[nw - mw] &= ~B32(mb);
751 /* Clear the significand bits that we haven't set explicitly yet. */
752 while (i < nw) z[i++] = 0;
754 /* All that remains is to insert the top bits @z0@ in the right place.
755 * This will set the exponent, and the unit and quiet bits.
759 if (sh && nb >= 32) z[1] |= z0 << (32 - sh);
767 /* --- @fltfmt_encTY@ --- *
769 * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
770 * @kludge64 *z_out@ = where to put the encoded value
771 * @uint16 *se_out@, @kludge64 *m_out@ = where to put the
772 * encoded sign-and-exponent and significand
773 * @const struct floatbits *x@ = value to encode
774 * @unsigned r@ = rounding mode
775 * @unsigned errmask@ = error mask
777 * Returns: Error flags (@FLTERR_...@).
779 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
782 * If an error is encountered during the encoding, and the
783 * corresponding bit of @errmask@ is clear, then processing
784 * stops immediately and the error is returned; if the bit is
785 * set, then processing continues as described below.
789 * * @mini@ for the 8-bit `1.4.3 minifloat' format, with
790 * four-bit exponent and four-bit significand, represented
793 * * @bf16@ for the Google `bfloat16' format, with eight-bit
794 * exponent and eight-bit significand, represented as a
797 * * @f16@ for the IEEE `binary16' format, with five-bit
798 * exponent and eleven-bit significand, represented as a
801 * * @f32@ for the IEEE `binary32' format, with eight-bit
802 * exponent and 24-bit significand, represented as a
805 * * @f64@ for the IEEE `binary64' format, with eleven-bit
806 * exponent and 53-bit significand, represented as a
809 * * @f128@ for the IEEE `binary128' format, with fifteen-bit
810 * exponent and 113-bit significand, represented as four
811 * @uint32@ limbs, most significant first; or
813 * * @idblext80@ for the Intel 80-bit `double extended'
814 * format, with fifteen-bit exponent and 64-bit significand
815 * with no hidden bit, represented as a @uint16 se@
816 * holding the sign and exponent, and a @kludge64 m@
817 * holding the significand.
819 * Positive and negative zero and infinity are representable
822 * Following IEEE recommendations (and most implementations),
823 * the most significant fraction bit of a quiet NaN is set; this
824 * bit is clear in a signalling NaN. The most significant
825 * payload bits of a NaN, held in the top bits of @x->frac[0]@,
826 * are encoded in the output significand following the `quiet'
827 * bit. If the chosen format's significand field is too small
828 * to accommodate all of the set payload bits then the
829 * @FLTERR_INEXACT@ error bit is set and, if masked, the
830 * excess payload bits are discarded. No rounding of NaN
831 * payloads is performed.
833 * Otherwise, the input value is finite and nonzero. If the
834 * significand cannot be represented exactly then the
835 * @FLTERR_INEXACT@ error bit is set, and, if masked, the value
836 * will be rounded (internally -- the input @x@ is not changed).
837 * If the (rounded) value's exponent is too large to represent,
838 * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
839 * set and, if masked, the result is either the (absolute)
840 * largest representable finite value or infinity, with the
841 * appropriate sign, chosen according to the rounding mode. If
842 * the exponent is too small to represent, then the
843 * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
844 * if masked, the result is either the (absolute) smallest
845 * nonzero value or zero, with the appropriate sign, chosen
846 * according to the rounding mode.
849 unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x,
850 unsigned r, unsigned errmask)
855 rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask);
856 if (!(rc&~errmask)) *z_out = t[0];
860 unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x,
861 unsigned r, unsigned errmask)
866 rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask);
867 if (!(rc&~errmask)) *z_out = t[0];
871 unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x,
872 unsigned r, unsigned errmask)
877 rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask);
878 if (!(rc&~errmask)) *z_out = t[0];
882 unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x,
883 unsigned r, unsigned errmask)
884 { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); }
886 unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x,
887 unsigned r, unsigned errmask)
892 rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask);
893 if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]);
897 unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x,
898 unsigned r, unsigned errmask)
899 { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); }
901 unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out,
902 const struct floatbits *x,
903 unsigned r, unsigned errmask)
908 rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask);
909 if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); }
913 /* --- @fltfmt_decieee@ --- *
915 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
916 * @struct floatbits *z_out@ = output decoded representation
917 * @const uint32 *x@ = input encoding
919 * Returns: Error flags (@FLTERR_...@).
921 * Use: Decode a floating-point value in an IEEE format. This is the
922 * machinery shared by the @fltfmt_dec...@ functions for
923 * deccoding IEEE-format values. Most of the arguments and
924 * behaviour are as described for those functions.
926 * The encoded value should be right-aligned and big-endian;
927 * i.e., the sign bit ends up in @z[0]@, and the least
928 * significant bit of the significand ends up in the least
929 * significant bit of @z[n - 1]@.
932 unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt,
933 struct floatbits *z_out, const uint32 *x)
935 unsigned sigwd, err = 0, f = 0;
936 unsigned i, nb, nw, mw, esh, sh;
937 int exp, minexp, maxexp;
938 uint32 x0, t, u, emask;
940 /* The following code assumes that the sign, biased exponent, unit, and
941 * quiet/signalling bits can all fit into the most significant 32 bits of
944 assert(fmt->expwd + 3 <= 32);
945 esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
946 sigwd = fmt->prec; if (fmt->f&FLTIF_HIDDEN) sigwd--;
948 /* Determine the input size. */
949 nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
951 /* Extract the sign, exponent, and top of the significand. */
954 if (sh && nb >= 32) x0 |= x[1] >> (32 - sh);
955 if (x0&B31) f |= FLTF_NEG;
956 t = (x0 >> esh)&emask;
958 /* Time for a case analysis. */
963 * Note that we don't include the quiet bit in our decoded payload.
966 if (!(fmt->f&FLTIF_HIDDEN)) {
967 /* No hidden bit, so we expect the unit bit to be set. If it isn't,
968 * that's technically invalid, and its absence won't survive a round
969 * trip, since the bit isn't considered part of a NaN payload -- or
970 * even to distinguish a NaN from an infinity. In any event, reduce
971 * the notional significand size to exclude this bit from further
975 if (!(x0&B32(esh - 1))) err = FLTERR_INVAL;
979 if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
982 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
983 if (x0&B32(sh)) f |= FLTF_QNAN;
985 sigwd--; mw = (sigwd + 31)/32;
986 fltfmt_allocfrac(z_out, mw);
987 shl(z_out->frac, x + nw - mw, mw, -sigwd%32);
992 /* Determine the exponent bounds. */
993 maxexp = (1 << (fmt->expwd - 1)) - 1;
996 /* Dispatch. If there's a hidden bit then everything is well defined.
997 * Otherwise, we'll normalize the incoming value regardless, but report
998 * settings of the unit bit which are inconsistent with the exponent.
1000 if (fmt->f&FLTIF_HIDDEN) {
1001 if (!t) { exp = minexp; goto normalize; }
1002 else { exp = t - maxexp; goto hidden; }
1004 u = x0&B32(esh - 1);
1005 if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; }
1006 else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; }
1011 /* We have a normal real number with a hidden bit. */
1013 mw = (sigwd + 31)/32;
1016 /* The bits we have occupy a whole number of words, but we need to shift
1017 * to make space for the unit bit.
1020 fltfmt_allocfrac(z_out, mw + 1);
1021 z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1);
1023 fltfmt_allocfrac(z_out, mw);
1024 shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32);
1026 z_out->frac[0] |= B31;
1027 z_out->exp = exp + 1;
1031 /* We have, at least potentially, a subnormal number, with no hidden
1035 i = ms_set_bit(x + nw, 0, sigwd);
1036 if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; }
1037 mw = i/32 + 1; sh = 32*mw - i - 1;
1038 fltfmt_allocfrac(z_out, mw);
1039 shl(z_out->frac, x + nw - mw, mw, sh);
1040 z_out->exp = exp - fmt->prec + 2 + i;
1045 z_out->f = f; return (err);
1048 /* --- @fltfmt_decTY@ --- *
1050 * Arguments: @const struct floatbits *z_out@ = storage for the result
1051 * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
1053 * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
1056 * Returns: Error flags (@FLTERR_...@).
1058 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
1061 * The options for @TY@ are as documented for the encoding
1064 * In formats without a hidden bit -- currently only @idblext80@
1065 * -- not all bit patterns are valid encodings. If the explicit
1066 * unit bit is set when the exponent field is all-bits-zero, or
1067 * clear when the exponent field is not all-bits-zero, then the
1068 * @FLTERR_INVAL@ error bit is set. If the exponent is all-
1069 * bits-set, denoting infinity or a NaN, then the unit bit is
1070 * otherwise ignored -- in particular, it does not affect the
1071 * NaN payload, or even whether the input encodes a NaN or
1072 * infinity. Otherwise, the unit bit is considered significant,
1073 * and the result is normalized as one would expect.
1074 * Consequently, biased exponent values 0 and 1 are distinct
1075 * only with respect to which bit patterns are considered valid,
1076 * and not with respect to the set of values denoted.
1079 unsigned fltfmt_decmini(struct floatbits *z_out, octet x)
1080 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); }
1082 unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x)
1083 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); }
1085 unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x)
1086 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); }
1088 unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x)
1089 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); }
1091 unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x)
1095 t[0] = HI64(x); t[1] = LO64(x);
1096 return (fltfmt_decieee(&fltfmt_f64, z_out, t));
1099 unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x)
1100 { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); }
1102 unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m)
1106 t[0] = se; t[1] = HI64(m); t[2] = LO64(m);
1107 return (fltfmt_decieee(&fltfmt_idblext80, z_out, t));
1110 /*----- Native formats ----------------------------------------------------*/
1112 /* If the floating-point radix is a power of two, determine how many bits
1113 * there are in each digit. This isn't exhaustive, but it covers most of the
1114 * bases, so to speak.
1117 # define DIGIT_BITS 1
1118 #elif FLT_RADIX == 4
1119 # define DIGIT_BITS 2
1120 #elif FLT_RADIX == 8
1121 # define DIGIT_BITS 3
1122 #elif FLT_RADIX == 16
1123 # define DIGIT_BITS 4
1126 /* --- @ENCFLT@ --- *
1128 * Arguments: @ty@ = the C type to encode
1129 * @TY@ = the uppercase prefix for macros
1130 * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a
1132 * @unsigned &rc@ = error code to set
1133 * @ty *z_out@ = storage for the result
1134 * @const struct floatbits *x@ = value to convert
1135 * @unsigned r@ = rounding mode
1139 * Use: Encode a floating-point value @x@ as a native C object of
1140 * type @ty@. This is the machinery shared by the
1141 * @fltfmt_enc...@ functions for enccoding native-format values.
1142 * Most of the behaviour is as described for those functions.
1145 /* Utilities based on conditional compilation, because we can't use
1146 * %|#ifdef|% directly in macros.
1150 /* The C implementation acknowledges the existence of (quiet) NaN values,
1151 * but will neither let us set the payload in a useful way, nor
1152 * acknowledge the existence of signalling NaNs. We have no good way to
1153 * determine which NaN the @NAN@ macro produces, so report this conversion
1157 # define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0)
1159 /* This C implementation doesn't recognize NaNs. This value is totally
1160 * unrepresentable, so just report the error. (Maybe it's C89 and would
1161 * actually do the right thing with @0/0@. I'm not sure the requisite
1162 * compile-time configuration machinery is worth the effort.)
1165 # define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0)
1169 /* The C implementation supports infinities. This is a simple win. */
1171 # define SETINF(TY, rc, z) \
1172 do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0)
1174 /* The C implementation doesn't support infinities. Return the maximum
1175 * value and report it as an overflow; I think this is more useful than
1176 * reporting a complete representation failure. (Maybe it's C89 and would
1177 * actually do the right thing with @1/0@. Again, I'm not sure the
1178 * requisite compile-time configuration machinery is worth the effort.)
1181 # define SETINF(TY, rc, z) \
1182 do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0)
1186 /* The floating point formats use a power-of-two radix. This means that
1187 * we can determine the correctly rounded value before we start building
1188 * the native floating-point value.
1191 # define ENC_ROUND_DECLS struct floatbits _y;
1192 # define ENC_ROUND(TY, rc, x, r) do { \
1193 (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \
1197 /* The floating point formats use a non-power-of-two radix. This means
1198 * that conversion is inherently inexact.
1201 # define ENC_ROUND_DECLS
1202 # define ENC_ROUND(TY, rc, x, r) \
1203 do (rc) |= FLTERR_INEXACT; while (0)
1204 # define ENC_FIXUP(...)
1208 #define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \
1211 /* See if the native format is one that we recognize. */ \
1212 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1214 case FLTFMT_IEEE_F32: { \
1216 unsigned char *_z = (unsigned char *)(z_out); \
1218 (rc) = fltfmt_encieee(&fltfmt_f32, _t, (x), (r), FLTERR_ALLERRS); \
1219 FLTFMT__FROB_NAN_F32(_t, _rc); \
1220 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1221 case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \
1222 case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \
1223 default: assert(!"unimplemented byte order"); break; \
1227 case FLTFMT_IEEE_F64: { \
1229 unsigned char *_z = (unsigned char *)(z_out); \
1230 (rc) = fltfmt_encieee(&fltfmt_f64, _t, (x), (r), FLTERR_ALLERRS); \
1231 FLTFMT__FROB_NAN_F64(_t, _rc); \
1232 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1234 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1237 STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \
1240 STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \
1242 default: assert(!"unimplemented byte order"); break; \
1246 case FLTFMT_IEEE_F128: { \
1248 unsigned char *_z = (unsigned char *)(z_out); \
1250 FLTFMT__FROB_NAN_F128(_t, _rc); \
1251 (rc) = fltfmt_encieee(&fltfmt_f128, _t, (x), (r), FLTERR_ALLERRS); \
1252 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1254 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1255 STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \
1258 STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \
1259 STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \
1261 default: assert(!"unimplemented byte order"); break; \
1265 case FLTFMT_INTEL_F80: { \
1267 unsigned char *_z = (unsigned char *)(z_out); \
1269 (rc) = fltfmt_encieee(&fltfmt_idblext80, _t, (x), (r), FLTERR_ALLERRS); \
1270 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1271 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1273 STORE16_B(_z + 0, _t[0]); \
1274 STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \
1277 STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \
1278 STORE16_L(_z + 8, _t[0]); \
1280 default: assert(!"unimplemented byte order"); break; \
1285 /* We must do this the hard way. */ \
1287 const struct floatbits *_x = (x); \
1292 /* Case analysis... */ \
1293 if (_x->f&FLTF_NANMASK) { \
1294 /* A NaN. Use the macro above. */ \
1297 if (x->f&FLTF_NEG) _z = -_z; \
1298 } else if (_x->f&FLTF_INF) { \
1299 /* Infinity. Use the macro. */ \
1301 SETINF(TY, _rc, _z); \
1302 if (_x->f&FLTF_NEG) _z = -_z; \
1303 } else if (_x->f&FLTF_ZERO) { \
1304 /* Zero. If we're asked for a negative zero then check that we \
1305 * produced one: if not, then report an exactness failure. \
1309 if (_x->f&FLTF_NEG) \
1310 { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \
1312 /* A finite value. */ \
1314 /* If the radix is a power of two, we can round to the correct \
1315 * precision, which will save inexactness later. \
1317 ENC_ROUND(TY, _rc, _x, (r)); \
1319 /* Insert the 32-bit pieces of the fraction one at a time, \
1320 * starting from the least-significant end. This minimizes the \
1321 * inaccuracy from the overall approach, but it's imperfect \
1322 * unless the value has already been rounded correctly. \
1325 for (_i = _x->n, _z = 0.0; _i--; ) \
1326 _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \
1328 /* Negate the value if we need to. */ \
1329 if (_x->f&FLTF_NEG) _z = -_z; \
1337 /* Set the error code. */ \
1341 /* --- @fltfmt_encTY@ --- *
1343 * Arguments: @ty *z_out@ = storage for the result
1344 * @const struct floatbits *x@ = value to encode
1345 * @unsigned r@ = rounding mode
1347 * Returns: Error flags (@FLTERR_...@).
1349 * Use: Encode the floating-point value @x@ as a native C object and
1350 * store the result in @z_out@.
1352 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1353 * @double@, or (on C99 implementations) @ldbl@ to encode a
1356 * In detail, conversion is performed as follows.
1358 * * If a non-finite value cannot be represented by the
1359 * implementation then the @FLTERR_REPR@ error bit is set
1360 * and @*z_out@ is set to zero if @x@ is a NaN, or the
1361 * (absolute) largest representable value, with appropriate
1362 * sign, if @x@ is an infinity.
1364 * * If the implementation can represent NaNs, but cannot set
1365 * NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
1366 * and @*z_out@ is set to an arbitrary (quiet) NaN value.
1368 * * If @x@ is negative zero, but the implementation does not
1369 * distinguish negative and positive zero, then the
1370 * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
1373 * * If the implementation's floating-point radix is not a
1374 * power of two, and @x@ is a nonzero finite value, then
1375 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1376 * the value is rounded by the implementation using its
1377 * prevailing rounding policy. If the radix is a power of
1378 * two, then the @FLTERR_INEXACT@ error bit is set only if
1379 * rounding is necessary, and rounding is performed using
1380 * the rounding mode @r@.
1383 unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r)
1387 ENCFLT(double, FLT, ldexp, rc, z_out, x, r);
1391 unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r)
1395 ENCFLT(double, DBL, ldexp, rc, z_out, x, r);
1399 #if __STDC_VERSION__ >= 199001
1400 unsigned fltfmt_encldbl(long double *z_out,
1401 const struct floatbits *x, unsigned r)
1405 ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r);
1410 /* --- @DECFLT@ --- *
1412 * Arguments: @ty@ = the C type to encode
1413 * @TY@ = the uppercase prefix for macros
1414 * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value
1415 * into a binary exponent and normalized fraction
1416 * @unsigned &rc@ = error code to set
1417 * @struct floatbits *z_out@ = storage for the result
1418 * @ty x@ = value to convert
1419 * @unsigned r@ = rounding mode
1423 * Use: Decode a C native floating-point object. This is the
1424 * machinery shared by the @fltfmt_dec...@ functions for
1425 * decoding native-format values. Most of the behaviour is as
1426 * described for those functions.
1429 /* Define some utilities for decoding native floating-point formats.
1431 * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native
1434 * * @CONVFIX@ is a final fixup applied to the decoded value.
1437 # define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32)
1438 # define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0)
1440 # define NFRAC(TY) \
1441 (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1)
1442 # define CONVFIX(ty, rc, z, x, n, f, r) do { \
1444 struct floatbits *_z_ = (z); \
1446 unsigned _i_, _n_ = (n), _f_; \
1448 /* Round the result according to the remainder left in %$x$%. */ \
1449 _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \
1450 if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \
1451 if (_w_&1) _f_ |= FRPF_ODD; \
1452 if (_y_ >= 0.5) _f_ |= FRPF_HALF; \
1453 if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \
1454 if (((r) >> _f_)&1) { \
1456 _w_ = (_w_ + 1)&MASK32; \
1457 if (_w_ || !_i_) break; \
1458 _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \
1460 if (!_w_) { _z_->exp++; _w_ = B31; } \
1461 _z_->frac[_i_] = w; \
1464 /* The result is not exact. */ \
1465 (rc) |= FLTERR_INEXACT; \
1469 #define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \
1472 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1474 case FLTFMT_IEEE_F32: { \
1476 unsigned char *_x = (unsigned char *)&(x); \
1478 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1479 case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \
1480 case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \
1481 default: assert(!"unimplemented byte order"); break; \
1483 FLTFMT__FROB_NAN_F32(_t, _rc); \
1484 _rc |= fltfmt_decieee(&fltfmt_f32, (z_out), _t); \
1487 case FLTFMT_IEEE_F64: { \
1489 unsigned char *_x = (unsigned char *)&(x); \
1491 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1493 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1496 _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \
1499 _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1501 default: assert(!"unimplemented byte order"); break; \
1503 FLTFMT__FROB_NAN_F64(_t, _rc); \
1504 _rc |= fltfmt_decieee(&fltfmt_f64, (z_out), _t); \
1507 case FLTFMT_IEEE_F128: { \
1509 unsigned char *_x = (unsigned char *)&(x); \
1511 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1513 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1514 _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \
1517 _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \
1518 _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \
1520 default: assert(!"unimplemented byte order"); break; \
1522 FLTFMT__FROB_NAN_F128(_t, _rc); \
1523 _rc |= fltfmt_decieee(&fltfmt_f128, (z_out), _t); \
1526 case FLTFMT_INTEL_F80: { \
1528 unsigned char *_x = (unsigned char *)&(x); \
1530 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1532 _t[0] = LOAD16_B(_x + 0); \
1533 _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \
1536 _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1537 _t[0] = LOAD16_L(_x + 8); \
1539 default: assert(!"unimplemented byte order"); break; \
1541 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1542 _rc |= fltfmt_decieee(&fltfmt_idblext80, (z_out), _t); \
1546 struct floatbits *_z = (z_out); \
1548 unsigned _i, _n, _f = 0; \
1551 /* If the value looks negative then negate it and set the sign \
1554 if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \
1556 /* Now for the case analysis. Infinities and zero are \
1557 * unproblematic. NaNs can't be decoded exactly using the \
1558 * portable machinery. \
1560 if (INFP(_x)) _f |= FLTF_INF; \
1561 else if (_x == 0.0) _f |= FLTF_ZERO; \
1562 else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \
1564 /* A finite value. Determine the number of fraction limbs \
1565 * we'll need based on the precision and radix and pull out \
1566 * 32-bit chunks one at a time. This will be unproblematic \
1567 * for power-of-two radices, requiring at most shifting the \
1568 * significand left by a few bits, but inherently inexact (for \
1569 * the most part) for others. \
1572 _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \
1573 _y = frexp(_x, &_z->exp); \
1574 for (_i = 0; _i < _n; _i++) \
1575 { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \
1576 CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \
1584 /* Set the error code. */ \
1588 /* --- @fltfmt_decTY@ --- *
1590 * Arguments: @struct floatbits *z_out@ = storage for the result
1591 * @ty x@ = value to decode
1592 * @unsigned r@ = rounding mode
1594 * Returns: Error flags (@FLTERR_...@).
1596 * Use: Decode the native C floatingpoint value @x@ and store the
1597 * result in @z_out@.
1599 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1600 * @double@, or (on C99 implementations) @ldbl@ to encode a
1603 * In detail, conversion is performed as follows.
1605 * * If the implementation supports negative zeros and/or
1606 * infinity, then these are recognized and decoded.
1608 * * If the input as a NaN, but the implementation cannot
1609 * usefully report NaN payloads, then the @FLTERR_INEXACT@
1610 * error bit is set and the decoded payload is left empty.
1612 * * If the implementation's floating-point radix is not a
1613 * power of two, and @x@ is a nonzero finite value, then
1614 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1615 * the rounded value (according to the rounding mode @r@) is
1616 * stored in as many fraction words as necessary to identify
1617 * the original value uniquely. If the radix is a power of
1618 * two, then the value is represented exactly.
1621 unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r)
1625 DECFLT(double, FLT, frexp, rc, z_out, x, r);
1629 unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r)
1633 DECFLT(double, DBL, frexp, rc, z_out, x, r);
1637 #if __STDC_VERSION__ >= 199001
1638 unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r)
1642 DECFLT(long double, LDBL, frexpl, rc, z_out, x, r);
1647 /*----- That's all, folks -------------------------------------------------*/