chiark / gitweb /
@@@ fltfmt wip
[mLib] / utils / fltfmt.c
CommitLineData
b1a20bee
MW
1/* -*-c-*-
2 *
3 * Floating-point format conversions
4 *
5 * (c) 2024 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <assert.h>
33#include <float.h>
34#include <limits.h>
35#include <math.h>
36
37#include "alloc.h"
38#include "arena.h"
39#include "bits.h"
40#include "fltfmt.h"
41#include "growbuf.h"
b1a20bee
MW
42#include "maths.h"
43
dc6eea4e
MW
44/*----- Preliminary hacking -----------------------------------------------*/
45
46/* The native-format conversions are -- at least if the format is
47 * unrecognized -- dependent on the implementation's rounding. Our own
48 * rounding mode specifications don't fit into the framework very well, but I
49 * still want to respect the prevailing rounding mode.
50 *
51 * The `proper' way to do this is with %|#pragma STDC FENV_ACCESS|%. But
52 * that doesn't actually work on GCC, or on Clang from not too long ago. So
53 * use compiler-specific hacking to support this.
54 */
b1a20bee
MW
55#if GCC_VERSION_P(4, 4)
56# pragma GCC optimize "-frounding-math"
57#elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0)
58# pragma clang optimize "-frounding-math"
59#elif GCC_VERSION_P(0, 0) || \
60 (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0))
61 /* We just lose. */
62#elif __STDC_VERSION__ >= 199001
63# include <fenv.h>
64# pragma STDC FENV_ACCESS ON
65#endif
66
67/*----- Some useful constants ---------------------------------------------*/
68
289651a7
MW
69#define B31 0x80000000u /* just bit 31 */
70#define B30 0x40000000u /* just bit 30 */
b1a20bee
MW
71
72#define SH32 4294967296.0 /* 2^32 as floating-point */
73
74/*----- Useful macros -----------------------------------------------------*/
75
76#define B32(k) ((uint32)1 << (k))
77#define M32(k) (B32(k) - 1)
78
79#define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO)))
80
81/*----- Utility functions -------------------------------------------------*/
82
83/* --- @clz32@ --- *
84 *
85 * Arguments: @uint32 x@ = a 32-bit value
86 *
87 * Returns: The number of leading zeros in @x@, i.e., the nonnegative
88 * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%.
89 * Returns a nonsensical value if %$x = 0$%.
90 */
91
92static unsigned clz32(uint32 x)
93{
94 unsigned n = 0;
95
96 /* Divide and conquer. If the top half of the bits are clear, then there
97 * must be at least 16 leading zero bits, so accumulate and shift. Repeat
98 * for smaller powers of two.
99 *
100 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
101 */
102 if (!(x&0xffff0000)) { x <<= 16; n += 16; }
103 if (!(x&0xff000000)) { x <<= 8; n += 8; }
104 if (!(x&0xf0000000)) { x <<= 4; n += 4; }
105 if (!(x&0xc0000000)) { x <<= 2; n += 2; }
106 if (!(x&0x80000000)) { n += 1; }
107 return (n);
108}
109
110/* --- @ctz32@ --- *
111 *
112 * Arguments: @uint32 x@ = a 32-bit value
113 *
114 * Returns: The number of trailing zeros in @x@, i.e., the nonnegative
115 * integer %$n$% such that %$x/2^n$% is an odd integer.
116 * Returns a nonsensical value if %$x = 0$%.
117 */
118
119static unsigned ctz32(uint32 x)
120{
121#ifdef CTZ_TRADITIONAL
122
123 unsigned n = 0;
124
125 /* Divide and conquer. If the bottom half of the bits are clear, then
126 * there must be at least 16 trailing zero bits, so accumulate and shift.
127 * Repeat for smaller powers of two.
128 *
129 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
130 */
131 if (!(x&0x0000ffff)) { x >>= 16; n += 16; }
132 if (!(x&0x000000ff)) { x >>= 8; n += 8; }
133 if (!(x&0x0000000f)) { x >>= 4; n += 4; }
134 if (!(x&0x00000003)) { x >>= 2; n += 2; }
135 if (!(x&0x00000001)) { n += 1; }
136 return (n);
137
138#else
139
140 static unsigned char tab[] =
141 /*
142 ;;; Compute the decoding table for the de Bruijn sequence trick below.
143
144 (let ((db #x04653adf)
145 (rv (make-vector 32 nil)))
146 (dotimes (i 32)
147 (aset rv (logand (ash db (- i 27)) #x1f) i))
148 (save-excursion
149 (goto-char (point-min))
150 (search-forward (concat "***" "BEGIN ctz32tab" "***"))
151 (beginning-of-line 2)
152 (delete-region (point)
153 (progn
154 (search-forward "***END***")
155 (beginning-of-line)
156 (point)))
157 (dotimes (i 32)
158 (cond ((zerop i) (insert " { "))
159 ((zerop (mod i 16)) (insert ",\n "))
160 ((zerop (mod i 4)) (insert ", "))
161 (t (insert ", ")))
162 (insert (format "%2d" (aref rv i))))
163 (insert " };\n")))
164 */
165
166 /* ***BEGIN ctz32tab*** */
167 { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26,
168 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 };
169 /* ***END*** */
170
171 /* Sneaky trick. Two's complement negation (which you effectively get
172 * using C unsigned arithmetic, whether you like it or not) complements all
173 * of the bits of an operand more significant than the least significant
174 * set bit. Therefore, this bit is the only one set in both %$x$% and
175 * %$-x$%, so @x&-x@ will isolate it for us.
176 *
177 * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every
178 * group of five consecutive bits is distinct, including groups which `wrap
179 * around', including some low bits and some high bits. Multiplying this
180 * number by a power of two is equivalent to a left shift; and, because the
181 * top five bits are all zero, the most significant five bits of the
182 * product are the same as if we'd applied a rotation. The result is that
183 * we end up with a distinctive pattern in those bits which perfectly
184 * diagnose each shift from 0 up to 31, which we can decode using a table.
185 *
186 * David Seal described a similar trick -- using the six-bit pattern
187 * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994;
188 * this constant was particularly convenient to multiply by on early ARM
189 * processors. The use of a de Bruijn number is described in Henry
190 * Warren's %%\emph{Hacker's Delight}%%.
191 */
192 return (tab[((x&-x)*0x04653adf >> 27)&0x1f]);
193
194#endif
195}
196
197/* --- @shl@, @shr@ --- *
198 *
199 * Arguments: @uint32 *z@ = destination vector
200 * @const uint32 *x@ = source vector
201 * @size_t sz@ = size of source vector, in elements
202 * @unsigned n@ = number of bits to shift by; must be less than
203 * 32
204 *
205 * Returns: The bits shifted off the end of the vector.
206 *
207 * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@)
208 * by some number of bit positions. These functions work
209 * correctly if @z@ and @x@ are the same pointer, but not if
210 * they otherwise overlap.
211 */
212
213static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n)
214{
215 size_t i;
216 uint32 t, u;
217 unsigned r;
218
219 if (!n) {
220 for (i = 0; i < sz; i++) z[i] = x[i];
221 t = 0;
222 } else {
223 r = 32 - n;
224 for (t = 0, i = sz; i--; )
225 { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; }
226 }
227 return (t);
228}
229
230static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n)
231{
232 size_t i;
233 uint32 t, u;
234 unsigned r;
235
236 if (!n) {
237 for (i = 0; i < sz; i++) z[i] = x[i];
238 t = 0;
239 } else {
240 r = 32 - n;
241 for (t = 0, i = 0; i < sz; i++)
242 { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; }
243 }
244 return (t);
245}
246
247/* --- @sigbits@ --- *
248 *
249 * Arguments: @const struct floatbits *x@ = decoded floating-point number
250 *
251 * Returns: The number of significant digits in @x@'s fraction. This
252 * will be zero if @x@ is zero or infinite.
253 */
254
255static unsigned sigbits(const struct floatbits *x)
256{
257 unsigned i;
258 uint32 w;
259
260 if (x->f&(FLTF_ZERO | FLTF_INF)) return (0);
261 i = x->n;
262 for (;;) {
263 if (!i) return (0);
264 w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w));
265 }
266}
267
268/* --- @ms_set_bit@ --- *
269 *
270 * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the
271 * buffer
272 * @unsigned from, to@ = lower (inclusive) and upper (exclusive)
273 * bounds on the region of bits to inspect
274 *
275 * Returns: Index of the most significant set bit, or @ALLCLEAR@.
276 *
277 * Use: For the (rather unusual) purposes of this function, the bits
278 * of the input are numbered from zero, being the least
279 * significant bit of @x[-1]@, upwards through more significant
280 * bits of @x[-1]@, and then through @x[-2]@ and so on.
281 *
282 * If all of the bits in the half-open region are clear then
283 * @ALLCLEAR@ is returned; otherwise, the return value is the
284 * index of the most significant set bit in the region. Note
285 * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the
286 * largest possible value of @to@, and the upper bound is
287 * exclusive, this cannot be the index of a bit in the region.
288 */
289
290#define ALLCLEAR UINT_MAX
291static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to)
292{
293 unsigned n0, n, b, base;
294 uint32 m, w;
295
296 /* <--- increasing indices <---
297 *
298 * ---+-------+-------+-------+-------+-------+---
299 * ...S |///| | | | | |//| S...
300 * ---+-------+-------+-------+-------+-------+---
301 */
302
303 /* If the region is empty then it's technically true that all of the bits
304 * are zero. It's important to be able to do answer the case where
305 * %$\id{from} = \id{to} = 0$% without accessing memory.
306 */
307 assert(to >= from); if (to == from) return (ALLCLEAR);
308
309 /* This is distressingly complicated. Here's how it's going to work.
310 *
311 * There's at least one bit to check, or we'd have returned already --
312 * specifically, we must check the bit with index @from@. But that's at
313 * the wrong end. Instead, we start at the most significant end, with the
314 * word containing the bit one short of the @to@ position. Even though
315 * it's actually one off, because we use a half-open interval, we'll call
316 * that the `@to@ bit'.
317 *
318 * We start by loading the word containing the @to@ bit, and start @base@
319 * off as the bit index of the least significant bit of this word. We mask
320 * off the high bits (if any), leaving only the @to@ bit and the less
321 * significant ones. We %%\emph{don't}%% check the remaining bits yet.
322 *
323 * We then start an offset loop. In each iteration, we check the word
324 * we're currently holding: if it's not zero, then we return @base@ plus
325 * the position of the most-significant set bit, using @clz32@. Otherwise,
326 * we load the next (less significant) word, and drop @base@ by 32, but
327 * don't check it yet. We end this loop when the word we're holding
328 * contains the @from@ bit. It's possible that we didn't do any iterations
329 * of the loop, in which case we're still holding the word containing the
330 * @to@ bit at this point.
331 *
332 * Finally, we mask off the bits below the @from@ bit, and check that what
333 * we have left is zero. If it isn't, we return @base@ plus the position
334 * of the most significant bit; if it is, we return @ALLCEAR@.
335 */
336
337 /* The first job is to find the word containing the @to@ bit and mask off
338 * any higher bits that we don't care about.
339 *
340 * Recall that the bit's index is @to - 1@, but this must be a valid index
341 * because there is at least one bit in the region. But we start out
342 * pointing beyond the vector, so we must add an extra 32 bits.
343 */
344 n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++;
345 b = to%32; if (b) w &= M32(b);
346
347 /* Before we start the loop, it'd be useful to know how many iterations we
348 * need. This is going to be the offset from the word containing the @to@
349 * bit to the word containing the @from@ bit. Again, we're off by one
350 * because that's how our initial indexing is set up.
351 */
352 n = n0 - from/32 - 1;
353
354 /* Now it's time to do the loop. This is the easy bit. */
355 while (n--) {
356 if (w) return (base + 31 - clz32(w));
357 w = *x++&MASK32; base -= 32;
358 }
359
360 /* We're now holding the final word -- the one containing the @from@ bit.
361 * We need to mask off any low bits that we don't care about.
362 */
363 m = M32(from%32); w &= MASK32&~m;
364
365 /* Final check. */
366 if (w) return (base + 31 - clz32(w));
367 else return (ALLCLEAR);
368}
369
370/*----- General floating-point hacking ------------------------------------*/
371
372/* --- @fltfmt_initbits@ --- *
373 *
374 * Arguments: @struct floatbits *x@ = pointer to structure to initialize
375 *
376 * Returns: ---
377 *
378 * Use: Dynamically initialize @x@ to (positive) zero so that it can
379 * be used as the destination operand by other operations. This
380 * doesn't allocate resources and cannot fail. The
381 * @FLOATBITS_INIT@ macro is a suitable static initializer for
382 * performing the same task.
383 */
384
385void fltfmt_initbits(struct floatbits *x)
386{
387 x->f = FLTF_ZERO;
388 x->a = arena_global;
389 x->frac = 0; x->n = x->fracsz = 0;
390}
391
392/* --- @fltfmt_freebits@ --- *
393 *
394 * Arguments: @struct floatbits *x@ = pointer to structure to free
395 *
396 * Returns: ---
397 *
398 * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid
399 * (positive) zero, but can safely be discarded.
400 */
401
402void fltfmt_freebits(struct floatbits *x)
403{
404 if (x->frac) x_free(x->a, x->frac);
405 x->f = FLTF_ZERO;
406 x->frac = 0; x->n = x->fracsz = 0;
407}
408
409/* --- @fltfmt_allocfrac@ --- *
410 *
411 * Arguments: @struct floatbits *x@ = structure to adjust
412 * @unsigned n@ = number of words required
413 *
414 * Returns: ---
415 *
416 * Use: Reallocate the @frac@ vector so that it has space for at
417 * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the
418 * current size is already @n@ or greater, then just update the
419 * active length @n@ and return; otherwise, any existing vector
420 * is discarded and a fresh, larger one allocated.
421 */
422
423void fltfmt_allocfrac(struct floatbits *x, unsigned n)
424 { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; }
425
426/* --- @fltfmt_copybits@ --- *
427 *
428 * Arguments: @struct floatbits *z_out@ = where to leave the result
429 * @const struct floatbits *x@ = source to copy
430 *
431 * Returns: ---
432 *
433 * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object
434 * as @x@ then do nothing.
435 */
436
437void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x)
438{
439 unsigned i;
440
441 if (z_out == x) return;
442 z_out->f = x->f;
443 if (!FINITEP(x)) z_out->exp = 0;
444 else z_out->exp = x->exp;
445 if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n)
446 { z_out->n = 0; z_out->frac = 0; }
447 else {
448 fltfmt_allocfrac(z_out, x->n);
449 for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i];
450 }
451}
452
453/* --- @fltfmt_round@ --- *
454 *
455 * Arguments: @struct floatbits *z_out@ = destination (may equal source)
456 * @const struct floatbits *x@ = source
457 * @unsigned r@ = rounding mode (@FLTRND_...@ code)
458 * @unsigned n@ = nonzero number of bits to leave
459 *
460 * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
461 * rounding discarded some nonzero value bits, or @FLTERR_OK@ if
462 * rounding was unnecessary.
463 *
464 * Use: Rounds a floating-point value to a given number of
465 * significant bits, using the given rounding rule.
466 */
467
468unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x,
469 unsigned r, unsigned n)
470{
471 unsigned rf, i, uw, ub, hw, hb, rc = 0;
472 uint32 um, hm, w;
473 int exp;
474
475 /* Check that this is likely to work. We must have at least one bit
476 * remaining, so that we can inspect the last-place unit bit. And we
477 * mustn't round up if the current value is already exact, because that
478 * would be nonsensical (and inconvenient).
479 */
480 assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF)));
481
482 /* Eliminate trivial cases. There's nothing to do if the value is infinite
483 * or zero, or if we don't have enough precision already.
484 *
485 * The caller will have set the rounding mode and length suitably for a
486 * NaN.
487 */
488 if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n)
489 { fltfmt_copybits(z_out, x); return (FLTERR_OK); }
490
491 /* Determine various indices.
492 *
493 * The quantities @uw@ and @ub@ are the word and bit number which will hold
494 * the unit bit when we've finished; @hw@ and @hb@ similarly index the
495 * `half' bit, which is the next less significant bit.
496 */
497 uw = (n - 1)/32; ub = -n&31;
498 if (!ub) { hw = uw + 1; hb = 31; }
499 else { hw = uw; hb = ub - 1; }
500
501 /* Determine the necessary predicates for the rounding decision. */
502 rf = 0;
503 if (x->f&FLTF_NEG) rf |= FRPF_NEG;
504 um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD;
505 hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF;
506 if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW;
507 for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW;
508 if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT;
509
510 /* Allocate space for the result. */
511 fltfmt_allocfrac(z_out, uw + 1);
512
513 /* We start looking at the least significant word of the result. Clear the
514 * low bits.
515 */
516 i = uw; exp = x->exp; w = x->frac[i]&~(um - 1);
517
518 /* If the rounding function is true then we need to add one to the
519 * truncated fraction and propagate carries.
520 */
521 if ((r >> rf)&1) {
522 w = (w + um)&MASK32;
523 while (i && !w) {
524 z_out->frac[i] = w;
525 w = (x->frac[--i] + 1)&MASK32;
526 }
527 if (!w) { w = B31; exp++; }
528 }
529
530 /* Store, and copy the remaining words. */
531 for (;;) {
532 z_out->frac[i] = w;
533 if (!i) break;
534 w = x->frac[--i];
535 }
536
537 /* Done. */
538 z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK);
539 if (x->f&FLTF_NANMASK) z_out->exp = 0;
540 else z_out->exp = exp;
541 return (rc);
542}
543
dc6eea4e 544/*----- IEEE and related formats ------------------------------------------*/
b1a20bee
MW
545
546/* IEEE (and related) format descriptions. */
547const struct fltfmt_ieeefmt
c752173d
MW
548 fltfmt_mini = { FLTIF_HIDDEN, 4, 4 },
549 fltfmt_bf16 = { FLTIF_HIDDEN, 8, 8 },
550 fltfmt_f16 = { FLTIF_HIDDEN, 5, 11 },
551 fltfmt_f32 = { FLTIF_HIDDEN, 8, 24 },
552 fltfmt_f64 = { FLTIF_HIDDEN, 11, 53 },
553 fltfmt_f128 = { FLTIF_HIDDEN, 15, 113 },
b1a20bee
MW
554 fltfmt_idblext80 = { 0, 15, 64 };
555
556/* --- @fltfmt_encieee@ ---
557 *
558 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
559 * @uint32 *z@ = output vector
560 * @const struct floatbits *x@ = value to encode
561 * @unsigned r@ = rounding mode
562 * @unsigned errmask@ = error mask
563 *
564 * Returns: Error flags (@FLTERR_...@).
565 *
566 * Use: Encode a floating-point value in an IEEE format. This is the
567 * machinery shared by the @fltfmt_enc...@ functions for
568 * encoding IEEE-format values. Most of the arguments and
569 * behaviour are as described for those functions.
570 *
571 * The encoded value is right-aligned and big-endian; i.e., the
572 * sign bit ends up in @z[0]@, and the least significant bit of
573 * the significand ends up in the least significant bit of
574 * @z[n - 1]@.
575 */
576
577unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt,
578 uint32 *z, const struct floatbits *x,
579 unsigned r, unsigned errmask)
580{
581 struct floatbits y;
582 unsigned sigwd, fracwd, err = 0, f = x->f, rf;
583 unsigned i, j, n, nb, nw, mb, mw, esh, sh;
584 int exp, minexp, maxexp;
585 uint32 z0, t;
586
587#define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0)
588
589 /* The following code assumes that the sign, biased exponent, unit, and
590 * quiet/signalling bits can all fit into the most significant 32 bits of
591 * the result.
592 */
593 assert(fmt->expwd + 3 <= 32);
594 esh = 31 - fmt->expwd;
595
596 /* Determine the output size. */
597 nb = fmt->prec + fmt->expwd + 1;
c752173d 598 if (fmt->f&FLTIF_HIDDEN) nb--;
b1a20bee
MW
599 nw = (nb + 31)/32;
600
601 /* Determine the top bits. */
602 z0 = 0; i = 0;
603 if (x->f&FLTF_NEG) z0 |= B31;
604
605 /* And now for the main case analysis. */
606
607 if (f&FLTF_ZERO) {
608 /* Zero. There's very little to do here. */
609
610 } else if (f&FLTF_INF) {
611 /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit
612 * bit.
613 */
614
615 z0 |= M32(fmt->expwd) << esh;
c752173d 616 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
b1a20bee
MW
617
618 } else if (f&FLTF_NANMASK) {
619 /* Not-a-number.
620 *
621 * We must check that we won't lose significant bits. We need a bit for
622 * the quiet/signalling flag, and enough space for the significant
623 * payload bits. The unit bit is not in play here, so the available
624 * space is always one less than the advertised precision. To put it
625 * another way, we need space for the payload, a bit for the
626 * quiet/signalling flag, and a bit for the unit place.
627 */
628
629 fracwd = sigbits(x);
630 if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT);
631
632 /* Copy the payload.
633 *
634 * If the payload is all-zero and we're meant to set a signalling NaN
dc6eea4e 635 * then report an exactness failure and set the least-significant bit.
b1a20bee
MW
636 */
637 mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32;
dc6eea4e
MW
638 n = x->n;
639 if (n < mw) j = 0;
640 else { n = mw; j = sh; }
641 if ((f&FLTF_SNAN) && ms_set_bit(x->frac + n, j, 32*n) == ALLCLEAR) {
642 ERR(FLTERR_INEXACT);
643 n = nw - 1; for (i = 0; i < n; i++) z[i] = 0;
644 z[i++] = 1;
645 } else {
646 for (i = 0; i < nw - mw; i++) z[i] = 0;
647 n = x->n; if (n > mw) n = mw;
648 t = shr(z + i, x->frac, n, sh); i += n;
649 if (i < nw) z[i++] = t;
650 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
651 if (f&FLTF_QNAN) z0 |= B32(sh);
652 }
b1a20bee
MW
653
654 /* Set the exponent and, for non-hidden-bit formats, the unit bit. */
655 z0 |= M32(fmt->expwd) << esh;
c752173d 656 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
b1a20bee
MW
657
658 } else {
659 /* A finite value.
660 *
661 * Adjust the exponent by one place to compensate for the difference in
662 * significant conventions. Our significand lies between zero (in fact,
663 * a half, because we require normalization) and one, while an IEEE
664 * significand lies between zero (in fact, one) and two. Our exponent is
665 * therefore one larger than the IEEE exponent will be.
666 */
667
668 /* Determine the maximum true (unbiased) exponent. As noted above, this
669 * is also the bias.
670 */
671 exp = x->exp - 1;
672 maxexp = (1 << (fmt->expwd - 1)) - 1;
673 minexp = 1 - maxexp;
674
675 if (exp <= minexp - (int)fmt->prec) {
676 /* If the exponent is very small then we underflow. We have %$p - 1$%
677 * bits available to represent a subnormal significand, and therefore
678 * can represent at least one bit of a value as small as
679 * %$2^{e_{\text{min}}-p+1}$%.
680 *
681 * If the exponent is one short of the threshold, then we check to see
682 * whether the value will round up.
683 */
684
685 if ((minexp - exp == fmt->prec) &&
686 ((r >> (FRPF_HALF |
687 (sigbits(x) > 1 ? FRPF_LOW : 0) |
688 (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) {
689 ERR(FLTERR_INEXACT);
690 for (i = 0; i < nw - 1; i++) z[i] = 0;
691 z[i++] = 1;
692 } else {
693 ERR(FLTERR_UFLOW | FLTERR_INEXACT);
694 /* Return (signed) zero. */
695 }
696
697 } else {
698 /* We can at least try to store some bits. */
699
700 /* Let's see how many we need to deal with and how much space we have.
701 * We might as well set the biased exponent here while we're at it.
702 *
703 * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of
704 * significand. Otherwise, we must make a subnormal and we can only
705 * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le
706 * e_{\text{min}} - p$% then we can store zero bits or fewer and have
707 * underflowed to zero, which matches the previous case.) In the
708 * subnormal case, we also `correct' the exponent so that we store the
709 * correct sentinel value later.
710 */
711 fracwd = sigbits(x);
712 if (exp >= minexp) sigwd = fmt->prec;
713 else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; }
714 mw = (sigwd + 31)/32; sh = -sigwd%32;
715
716 /* If we don't have enough significand bits then we must round. This
717 * might increase the exponent, so we must reload.
718 */
719 if (fracwd > sigwd) {
720 ERR(FLTERR_INEXACT);
721 y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd);
722 x = &y; exp = y.exp - 1; fracwd = sigwd;
723 }
724
725 if (exp > maxexp) {
726 /* If the exponent is too large, then we overflow. If the error is
727 * masked, then we must produce a default value, choosing between
728 * infinity and the largest representable finite value according to
729 * the rounding mode.
730 */
731
c752173d 732 ERR(FLTERR_OFLOW | FLTERR_INEXACT);
b1a20bee
MW
733 rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
734 if (f&FLTF_NEG) rf |= FRPF_NEG;
c752173d 735 if ((r >> rf)&1)
b1a20bee 736 z0 |= M32(fmt->expwd) << esh;
c752173d 737 else {
b1a20bee 738 z0 |= (B32(fmt->expwd) - 2) << esh;
c752173d 739 mb = fmt->prec; if (fmt->f&FLTIF_HIDDEN) mb--;
b1a20bee
MW
740 mw = (mb + 31)/32;
741 i = nw - mw;
742 z[i++] = M32(mb%32);
743 while (i < nw) z[i++] = MASK32;
744 }
745
746 } else {
747 /* The exponent is in range. Everything is ready. */
748
749 /* Store the significand. */
750 n = (fracwd + 31)/32; i = nw - mw;
751 t = shr(z + i, x->frac, n, sh); i += n;
752 if (i < nw) z[i++] = t;
753
754 /* Fill in the top end. */
755 for (j = nw - mw; j--; ) z[j] = 0;
756
757 /* Set the biased exponent. */
758 z0 |= (exp + maxexp) << esh;
759
760 /* Clear the unit bit if we're suppose to use a hidden-bit convention. */
c752173d 761 if (fmt->f&FLTIF_HIDDEN) {
b1a20bee
MW
762 mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
763 z[nw - mw] &= ~B32(mb);
764 }
765 }
766 }
767 }
768
769 /* Clear the significand bits that we haven't set explicitly yet. */
770 while (i < nw) z[i++] = 0;
771
772 /* All that remains is to insert the top bits @z0@ in the right place.
773 * This will set the exponent, and the unit and quiet bits.
774 */
775 sh = -nb%32;
776 z[0] |= z0 >> sh;
777 if (sh && nb >= 32) z[1] |= z0 << (32 - sh);
778
779end:
780 return (err);
781
782#undef ERR
783}
784
785/* --- @fltfmt_encTY@ --- *
786 *
787 * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
788 * @kludge64 *z_out@ = where to put the encoded value
789 * @uint16 *se_out@, @kludge64 *m_out@ = where to put the
790 * encoded sign-and-exponent and significand
791 * @const struct floatbits *x@ = value to encode
792 * @unsigned r@ = rounding mode
793 * @unsigned errmask@ = error mask
794 *
795 * Returns: Error flags (@FLTERR_...@).
796 *
797 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
798 * format.
799 *
800 * If an error is encountered during the encoding, and the
801 * corresponding bit of @errmask@ is clear, then processing
802 * stops immediately and the error is returned; if the bit is
803 * set, then processing continues as described below.
804 *
805 * The @TY@ may be
806 *
807 * * @mini@ for the 8-bit `1.4.3 minifloat' format, with
808 * four-bit exponent and four-bit significand, represented
809 * as a single octet;
810 *
811 * * @bf16@ for the Google `bfloat16' format, with eight-bit
812 * exponent and eight-bit significand, represented as a
813 * @uint16@;
814 *
815 * * @f16@ for the IEEE `binary16' format, with five-bit
816 * exponent and eleven-bit significand, represented as a
817 * @uint16@;
818 *
819 * * @f32@ for the IEEE `binary32' format, with eight-bit
820 * exponent and 24-bit significand, represented as a
821 * @uint32@;
822 *
823 * * @f64@ for the IEEE `binary64' format, with eleven-bit
824 * exponent and 53-bit significand, represented as a
825 * @kludge64@;
826 *
827 * * @f128@ for the IEEE `binary128' format, with fifteen-bit
828 * exponent and 113-bit significand, represented as four
829 * @uint32@ limbs, most significant first; or
830 *
831 * * @idblext80@ for the Intel 80-bit `double extended'
832 * format, with fifteen-bit exponent and 64-bit significand
833 * with no hidden bit, represented as a @uint16 se@
834 * holding the sign and exponent, and a @kludge64 m@
835 * holding the significand.
836 *
837 * Positive and negative zero and infinity are representable
838 * exactly.
839 *
840 * Following IEEE recommendations (and most implementations),
841 * the most significant fraction bit of a quiet NaN is set; this
842 * bit is clear in a signalling NaN. The most significant
843 * payload bits of a NaN, held in the top bits of @x->frac[0]@,
844 * are encoded in the output significand following the `quiet'
845 * bit. If the chosen format's significand field is too small
846 * to accommodate all of the set payload bits then the
847 * @FLTERR_INEXACT@ error bit is set and, if masked, the
848 * excess payload bits are discarded. No rounding of NaN
849 * payloads is performed.
850 *
851 * Otherwise, the input value is finite and nonzero. If the
852 * significand cannot be represented exactly then the
853 * @FLTERR_INEXACT@ error bit is set, and, if masked, the value
854 * will be rounded (internally -- the input @x@ is not changed).
855 * If the (rounded) value's exponent is too large to represent,
856 * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
857 * set and, if masked, the result is either the (absolute)
858 * largest representable finite value or infinity, with the
859 * appropriate sign, chosen according to the rounding mode. If
860 * the exponent is too small to represent, then the
861 * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
862 * if masked, the result is either the (absolute) smallest
863 * nonzero value or zero, with the appropriate sign, chosen
864 * according to the rounding mode.
865 */
866
867unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x,
868 unsigned r, unsigned errmask)
869{
870 uint32 t[1];
871 unsigned rc;
872
873 rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask);
874 if (!(rc&~errmask)) *z_out = t[0];
875 return (rc);
876}
877
878unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x,
879 unsigned r, unsigned errmask)
880{
881 uint32 t[1];
882 unsigned rc;
883
884 rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask);
885 if (!(rc&~errmask)) *z_out = t[0];
886 return (rc);
887}
888
889unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x,
890 unsigned r, unsigned errmask)
891{
892 uint32 t[1];
893 unsigned rc;
894
895 rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask);
896 if (!(rc&~errmask)) *z_out = t[0];
897 return (rc);
898}
899
900unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x,
901 unsigned r, unsigned errmask)
902 { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); }
903
904unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x,
905 unsigned r, unsigned errmask)
906{
907 uint32 t[2];
908 unsigned rc;
909
910 rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask);
911 if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]);
912 return (rc);
913}
914
915unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x,
916 unsigned r, unsigned errmask)
917 { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); }
918
919unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out,
920 const struct floatbits *x,
921 unsigned r, unsigned errmask)
922{
923 uint32 t[3];
924 unsigned rc;
925
926 rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask);
927 if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); }
928 return (rc);
929}
930
931/* --- @fltfmt_decieee@ --- *
932 *
933 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
934 * @struct floatbits *z_out@ = output decoded representation
935 * @const uint32 *x@ = input encoding
936 *
937 * Returns: Error flags (@FLTERR_...@).
938 *
939 * Use: Decode a floating-point value in an IEEE format. This is the
940 * machinery shared by the @fltfmt_dec...@ functions for
941 * deccoding IEEE-format values. Most of the arguments and
942 * behaviour are as described for those functions.
943 *
944 * The encoded value should be right-aligned and big-endian;
945 * i.e., the sign bit ends up in @z[0]@, and the least
946 * significant bit of the significand ends up in the least
947 * significant bit of @z[n - 1]@.
948 */
949
950unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt,
951 struct floatbits *z_out, const uint32 *x)
952{
953 unsigned sigwd, err = 0, f = 0;
954 unsigned i, nb, nw, mw, esh, sh;
955 int exp, minexp, maxexp;
956 uint32 x0, t, u, emask;
957
958 /* The following code assumes that the sign, biased exponent, unit, and
959 * quiet/signalling bits can all fit into the most significant 32 bits of
960 * the result.
961 */
962 assert(fmt->expwd + 3 <= 32);
963 esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
c752173d 964 sigwd = fmt->prec; if (fmt->f&FLTIF_HIDDEN) sigwd--;
b1a20bee
MW
965
966 /* Determine the input size. */
967 nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
968
969 /* Extract the sign, exponent, and top of the significand. */
970 sh = -nb%32;
971 x0 = x[0] << sh;
972 if (sh && nb >= 32) x0 |= x[1] >> (32 - sh);
973 if (x0&B31) f |= FLTF_NEG;
974 t = (x0 >> esh)&emask;
975
976 /* Time for a case analysis. */
977
978 if (t == emask) {
979 /* Infinity or NaN.
980 *
981 * Note that we don't include the quiet bit in our decoded payload.
982 */
983
c752173d 984 if (!(fmt->f&FLTIF_HIDDEN)) {
b1a20bee
MW
985 /* No hidden bit, so we expect the unit bit to be set. If it isn't,
986 * that's technically invalid, and its absence won't survive a round
987 * trip, since the bit isn't considered part of a NaN payload -- or
988 * even to distinguish a NaN from an infinity. In any event, reduce
989 * the notional significand size to exclude this bit from further
990 * consideration.
991 */
992
993 if (!(x0&B32(esh - 1))) err = FLTERR_INVAL;
994 sigwd--;
995 }
996
997 if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
998 f |= FLTF_INF;
999 else {
c752173d 1000 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
b1a20bee
MW
1001 if (x0&B32(sh)) f |= FLTF_QNAN;
1002 else f |= FLTF_SNAN;
1003 sigwd--; mw = (sigwd + 31)/32;
1004 fltfmt_allocfrac(z_out, mw);
1005 shl(z_out->frac, x + nw - mw, mw, -sigwd%32);
1006 }
1007 goto end;
1008 }
1009
1010 /* Determine the exponent bounds. */
1011 maxexp = (1 << (fmt->expwd - 1)) - 1;
1012 minexp = 1 - maxexp;
1013
1014 /* Dispatch. If there's a hidden bit then everything is well defined.
1015 * Otherwise, we'll normalize the incoming value regardless, but report
1016 * settings of the unit bit which are inconsistent with the exponent.
1017 */
c752173d 1018 if (fmt->f&FLTIF_HIDDEN) {
b1a20bee
MW
1019 if (!t) { exp = minexp; goto normalize; }
1020 else { exp = t - maxexp; goto hidden; }
1021 } else {
1022 u = x0&B32(esh - 1);
1023 if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; }
1024 else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; }
1025 goto normalize;
1026 }
1027
1028hidden:
1029 /* We have a normal real number with a hidden bit. */
1030
1031 mw = (sigwd + 31)/32;
1032
1033 if (!(sigwd%32)) {
1034 /* The bits we have occupy a whole number of words, but we need to shift
1035 * to make space for the unit bit.
1036 */
1037
1038 fltfmt_allocfrac(z_out, mw + 1);
1039 z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1);
1040 } else {
1041 fltfmt_allocfrac(z_out, mw);
1042 shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32);
1043 }
1044 z_out->frac[0] |= B31;
1045 z_out->exp = exp + 1;
1046 goto end;
1047
1048normalize:
1049 /* We have, at least potentially, a subnormal number, with no hidden
1050 * bit.
1051 */
1052
1053 i = ms_set_bit(x + nw, 0, sigwd);
1054 if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; }
1055 mw = i/32 + 1; sh = 32*mw - i - 1;
1056 fltfmt_allocfrac(z_out, mw);
1057 shl(z_out->frac, x + nw - mw, mw, sh);
1058 z_out->exp = exp - fmt->prec + 2 + i;
1059 goto end;
1060
1061end:
1062 /* All done. */
1063 z_out->f = f; return (err);
1064}
1065
1066/* --- @fltfmt_decTY@ --- *
1067 *
1068 * Arguments: @const struct floatbits *z_out@ = storage for the result
1069 * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
1070 * encoded input
1071 * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
1072 * significand
1073 *
1074 * Returns: Error flags (@FLTERR_...@).
1075 *
1076 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
1077 * format.
1078 *
1079 * The options for @TY@ are as documented for the encoding
1080 * functions above.
1081 *
1082 * In formats without a hidden bit -- currently only @idblext80@
1083 * -- not all bit patterns are valid encodings. If the explicit
1084 * unit bit is set when the exponent field is all-bits-zero, or
1085 * clear when the exponent field is not all-bits-zero, then the
1086 * @FLTERR_INVAL@ error bit is set. If the exponent is all-
1087 * bits-set, denoting infinity or a NaN, then the unit bit is
1088 * otherwise ignored -- in particular, it does not affect the
1089 * NaN payload, or even whether the input encodes a NaN or
1090 * infinity. Otherwise, the unit bit is considered significant,
1091 * and the result is normalized as one would expect.
1092 * Consequently, biased exponent values 0 and 1 are distinct
1093 * only with respect to which bit patterns are considered valid,
1094 * and not with respect to the set of values denoted.
1095 */
1096
1097unsigned fltfmt_decmini(struct floatbits *z_out, octet x)
1098 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); }
1099
1100unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x)
1101 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); }
1102
1103unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x)
1104 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); }
1105
1106unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x)
1107 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); }
1108
1109unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x)
1110{
1111 uint32 t[2];
1112
1113 t[0] = HI64(x); t[1] = LO64(x);
1114 return (fltfmt_decieee(&fltfmt_f64, z_out, t));
1115}
1116
1117unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x)
1118 { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); }
1119
1120unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m)
1121{
1122 uint32 t[3];
1123
1124 t[0] = se; t[1] = HI64(m); t[2] = LO64(m);
1125 return (fltfmt_decieee(&fltfmt_idblext80, z_out, t));
1126}
1127
1128/*----- Native formats ----------------------------------------------------*/
1129
1130/* If the floating-point radix is a power of two, determine how many bits
1131 * there are in each digit. This isn't exhaustive, but it covers most of the
1132 * bases, so to speak.
1133 */
1134#if FLT_RADIX == 2
1135# define DIGIT_BITS 1
1136#elif FLT_RADIX == 4
1137# define DIGIT_BITS 2
1138#elif FLT_RADIX == 8
1139# define DIGIT_BITS 3
1140#elif FLT_RADIX == 16
1141# define DIGIT_BITS 4
1142#endif
1143
dc6eea4e
MW
1144/* Take note if we need to cope with the revered quiet/signalling convention
1145 * used by HP-PA and older MIPS processors.
1146 */
1147#if defined(__hppa__) || (defined(__mips__) && !defined(__mips_nan2008))
1148# define FROB_NANS
1149#endif
1150
b1a20bee
MW
1151/* --- @ENCFLT@ --- *
1152 *
1153 * Arguments: @ty@ = the C type to encode
1154 * @TY@ = the uppercase prefix for macros
1155 * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a
1156 * power of two
1157 * @unsigned &rc@ = error code to set
1158 * @ty *z_out@ = storage for the result
1159 * @const struct floatbits *x@ = value to convert
1160 * @unsigned r@ = rounding mode
1161 *
1162 * Returns: ---
1163 *
1164 * Use: Encode a floating-point value @x@ as a native C object of
1165 * type @ty@. This is the machinery shared by the
1166 * @fltfmt_enc...@ functions for enccoding native-format values.
1167 * Most of the behaviour is as described for those functions.
1168 */
1169
1170/* Utilities based on conditional compilation, because we can't use
1171 * %|#ifdef|% directly in macros.
1172 */
1173
1174#ifdef NAN
1175 /* The C implementation acknowledges the existence of (quiet) NaN values,
1176 * but will neither let us set the payload in a useful way, nor
1177 * acknowledge the existence of signalling NaNs. We have no good way to
1178 * determine which NaN the @NAN@ macro produces, so report this conversion
1179 * as inexact.
1180 */
1181
1182# define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0)
1183#else
1184 /* This C implementation doesn't recognize NaNs. This value is totally
1185 * unrepresentable, so just report the error. (Maybe it's C89 and would
1186 * actually do the right thing with @0/0@. I'm not sure the requisite
1187 * compile-time configuration machinery is worth the effort.)
1188 */
1189
1190# define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0)
1191#endif
1192
1193#ifdef INFINITY
1194 /* The C implementation supports infinities. This is a simple win. */
1195
1196# define SETINF(TY, rc, z) \
1197 do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0)
1198#else
1199 /* The C implementation doesn't support infinities. Return the maximum
1200 * value and report it as an overflow; I think this is more useful than
1201 * reporting a complete representation failure. (Maybe it's C89 and would
1202 * actually do the right thing with @1/0@. Again, I'm not sure the
1203 * requisite compile-time configuration machinery is worth the effort.)
1204 */
1205
1206# define SETINF(TY, rc, z) \
1207 do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0)
1208#endif
1209
1210#ifdef DIGIT_BITS
1211 /* The floating point formats use a power-of-two radix. This means that
1212 * we can determine the correctly rounded value before we start building
1213 * the native floating-point value.
1214 */
1215
1216# define ENC_ROUND_DECLS struct floatbits _y;
1217# define ENC_ROUND(TY, rc, x, r) do { \
1218 (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \
1219 (x) = &_y; \
1220 } while (0)
1221#else
1222 /* The floating point formats use a non-power-of-two radix. This means
1223 * that conversion is inherently inexact.
1224 */
1225
1226# define ENC_ROUND_DECLS
1227# define ENC_ROUND(TY, rc, x, r) \
1228 do (rc) |= FLTERR_INEXACT; while (0)
1229# define ENC_FIXUP(...)
1230
1231#endif
1232
dc6eea4e
MW
1233#ifdef FROB_NANS
1234# define FROBNAN_ENCDECLS struct floatbits _y
1235# define FROBNAN_ENC do { \
1236 if (_x->f&FLTF_NANMASK) { \
1237 _y.f = _x->f ^ FLTF_NANMASK; _y.frac = _x->frac; _y.n = _x->n; \
1238 _x = &_y; \
1239 } \
1240 } while (0)
1241#else
1242# define FROBNAN_ENCDECLS
1243# define FROBNAN_ENC do ; while (0)
1244#endif
1245
b1a20bee 1246#define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \
dc6eea4e 1247 const struct floatbits *_x = (x); \
b1a20bee 1248 unsigned _rc = 0; \
dc6eea4e 1249 FROBNAN_ENCDECLS; \
b1a20bee
MW
1250 \
1251 /* See if the native format is one that we recognize. */ \
1252 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1253 \
1254 case FLTFMT_IEEE_F32: { \
1255 uint32 _t[1]; \
1256 unsigned char *_z = (unsigned char *)(z_out); \
1257 \
dc6eea4e
MW
1258 FROBNAN_ENC; \
1259 (rc) = fltfmt_encieee(&fltfmt_f32, _t, _x, (r), FLTERR_ALLERRS); \
b1a20bee
MW
1260 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1261 case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \
1262 case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \
1263 default: assert(!"unimplemented byte order"); break; \
1264 } \
1265 } break; \
1266 \
1267 case FLTFMT_IEEE_F64: { \
1268 uint32 _t[2]; \
1269 unsigned char *_z = (unsigned char *)(z_out); \
dc6eea4e
MW
1270 \
1271 FROBNAN_ENC; \
1272 (rc) = fltfmt_encieee(&fltfmt_f64, _t, _x, (r), FLTERR_ALLERRS); \
b1a20bee
MW
1273 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1274 case FLTFMT_BE: \
1275 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1276 break; \
1277 case FLTFMT_LE: \
1278 STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \
1279 break; \
1280 case FLTFMT_ARME: \
1281 STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \
1282 break; \
1283 default: assert(!"unimplemented byte order"); break; \
1284 } \
1285 } break; \
1286 \
1287 case FLTFMT_IEEE_F128: { \
1288 uint32 _t[4]; \
1289 unsigned char *_z = (unsigned char *)(z_out); \
1290 \
dc6eea4e
MW
1291 FROBNAN_ENC; \
1292 (rc) = fltfmt_encieee(&fltfmt_f128, _t, _x, (r), FLTERR_ALLERRS); \
b1a20bee
MW
1293 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1294 case FLTFMT_BE: \
1295 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1296 STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \
1297 break; \
1298 case FLTFMT_LE: \
1299 STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \
1300 STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \
1301 break; \
1302 default: assert(!"unimplemented byte order"); break; \
1303 } \
1304 } break; \
1305 \
1306 case FLTFMT_INTEL_F80: { \
1307 uint32 _t[3]; \
1308 unsigned char *_z = (unsigned char *)(z_out); \
1309 \
dc6eea4e
MW
1310 FROBNAN_ENC; \
1311 (rc) = fltfmt_encieee(&fltfmt_idblext80, \
1312 _t, _x, (r), FLTERR_ALLERRS); \
b1a20bee
MW
1313 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1314 case FLTFMT_BE: \
1315 STORE16_B(_z + 0, _t[0]); \
1316 STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \
1317 break; \
1318 case FLTFMT_LE: \
1319 STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \
1320 STORE16_L(_z + 8, _t[0]); \
1321 break; \
1322 default: assert(!"unimplemented byte order"); break; \
1323 } \
1324 } break; \
1325 \
1326 default: { \
1327 /* We must do this the hard way. */ \
1328 \
b1a20bee
MW
1329 ty _z; \
1330 unsigned _i; \
1331 ENC_ROUND_DECLS; \
1332 \
1333 /* Case analysis... */ \
1334 if (_x->f&FLTF_NANMASK) { \
1335 /* A NaN. Use the macro above. */ \
1336 \
1337 SETNAN(_rc, _z); \
1338 if (x->f&FLTF_NEG) _z = -_z; \
1339 } else if (_x->f&FLTF_INF) { \
1340 /* Infinity. Use the macro. */ \
1341 \
1342 SETINF(TY, _rc, _z); \
1343 if (_x->f&FLTF_NEG) _z = -_z; \
1344 } else if (_x->f&FLTF_ZERO) { \
1345 /* Zero. If we're asked for a negative zero then check that we \
1346 * produced one: if not, then report an exactness failure. \
1347 */ \
1348 \
1349 _z = 0.0; \
1350 if (_x->f&FLTF_NEG) \
1351 { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \
1352 } else { \
1353 /* A finite value. */ \
1354 \
1355 /* If the radix is a power of two, we can round to the correct \
1356 * precision, which will save inexactness later. \
1357 */ \
1358 ENC_ROUND(TY, _rc, _x, (r)); \
1359 \
1360 /* Insert the 32-bit pieces of the fraction one at a time, \
1361 * starting from the least-significant end. This minimizes the \
1362 * inaccuracy from the overall approach, but it's imperfect \
1363 * unless the value has already been rounded correctly. \
1364 */ \
1365 _z = 0.0; \
1366 for (_i = _x->n, _z = 0.0; _i--; ) \
1367 _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \
1368 \
1369 /* Negate the value if we need to. */ \
1370 if (_x->f&FLTF_NEG) _z = -_z; \
1371 } \
1372 \
1373 /* All done. */ \
1374 *(z_out) = _z; \
1375 } break; \
1376 } \
1377 \
1378 /* Set the error code. */ \
1379 (rc) = _rc; \
1380} while (0)
1381
1382/* --- @fltfmt_encTY@ --- *
1383 *
1384 * Arguments: @ty *z_out@ = storage for the result
1385 * @const struct floatbits *x@ = value to encode
1386 * @unsigned r@ = rounding mode
1387 *
1388 * Returns: Error flags (@FLTERR_...@).
1389 *
1390 * Use: Encode the floating-point value @x@ as a native C object and
1391 * store the result in @z_out@.
1392 *
1393 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1394 * @double@, or (on C99 implementations) @ldbl@ to encode a
1395 * @long double@.
1396 *
1397 * In detail, conversion is performed as follows.
1398 *
1399 * * If a non-finite value cannot be represented by the
1400 * implementation then the @FLTERR_REPR@ error bit is set
1401 * and @*z_out@ is set to zero if @x@ is a NaN, or the
1402 * (absolute) largest representable value, with appropriate
1403 * sign, if @x@ is an infinity.
1404 *
1405 * * If the implementation can represent NaNs, but cannot set
1406 * NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
1407 * and @*z_out@ is set to an arbitrary (quiet) NaN value.
1408 *
1409 * * If @x@ is negative zero, but the implementation does not
1410 * distinguish negative and positive zero, then the
1411 * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
1412 * zero.
1413 *
1414 * * If the implementation's floating-point radix is not a
1415 * power of two, and @x@ is a nonzero finite value, then
1416 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1417 * the value is rounded by the implementation using its
1418 * prevailing rounding policy. If the radix is a power of
1419 * two, then the @FLTERR_INEXACT@ error bit is set only if
1420 * rounding is necessary, and rounding is performed using
1421 * the rounding mode @r@.
1422 */
1423
1424unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r)
1425{
1426 unsigned rc;
1427
1428 ENCFLT(double, FLT, ldexp, rc, z_out, x, r);
1429 return (rc);
1430}
1431
1432unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r)
1433{
1434 unsigned rc;
1435
1436 ENCFLT(double, DBL, ldexp, rc, z_out, x, r);
1437 return (rc);
1438}
1439
1440#if __STDC_VERSION__ >= 199001
1441unsigned fltfmt_encldbl(long double *z_out,
1442 const struct floatbits *x, unsigned r)
1443{
1444 unsigned rc;
1445
1446 ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r);
1447 return (rc);
1448}
1449#endif
1450
1451/* --- @DECFLT@ --- *
1452 *
1453 * Arguments: @ty@ = the C type to encode
1454 * @TY@ = the uppercase prefix for macros
1455 * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value
1456 * into a binary exponent and normalized fraction
1457 * @unsigned &rc@ = error code to set
1458 * @struct floatbits *z_out@ = storage for the result
1459 * @ty x@ = value to convert
1460 * @unsigned r@ = rounding mode
1461 *
1462 * Returns: ---
1463 *
1464 * Use: Decode a C native floating-point object. This is the
1465 * machinery shared by the @fltfmt_dec...@ functions for
1466 * decoding native-format values. Most of the behaviour is as
1467 * described for those functions.
1468 */
1469
1470/* Define some utilities for decoding native floating-point formats.
1471 *
1472 * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native
1473 * digits.
1474 *
1475 * * @CONVFIX@ is a final fixup applied to the decoded value.
1476 */
1477#ifdef DIGIT_BITS
1478# define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32)
1479# define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0)
1480#else
1481# define NFRAC(TY) \
1482 (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1)
1483# define CONVFIX(ty, rc, z, x, n, f, r) do { \
1484 ty _x_ = (x); \
1485 struct floatbits *_z_ = (z); \
1486 uint32 _w_; \
1487 unsigned _i_, _n_ = (n), _f_; \
1488 \
1489 /* Round the result according to the remainder left in %$x$%. */ \
1490 _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \
1491 if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \
1492 if (_w_&1) _f_ |= FRPF_ODD; \
1493 if (_y_ >= 0.5) _f_ |= FRPF_HALF; \
1494 if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \
1495 if (((r) >> _f_)&1) { \
1496 for (;;) { \
1497 _w_ = (_w_ + 1)&MASK32; \
1498 if (_w_ || !_i_) break; \
1499 _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \
1500 } \
1501 if (!_w_) { _z_->exp++; _w_ = B31; } \
1502 _z_->frac[_i_] = w; \
1503 } \
1504 \
1505 /* The result is not exact. */ \
1506 (rc) |= FLTERR_INEXACT; \
1507 } while (0)
1508#endif
1509
dc6eea4e
MW
1510#ifdef FROB_NANS
1511# define FROBNAN_DEC do { \
1512 if (_z->f&FLTF_NANMASK) _z->f ^= FLTF_NANMASK; \
1513 } while (0)
1514#else
1515# define FROBNAN_DEC do ; while (0)
1516#endif
1517
b1a20bee 1518#define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \
dc6eea4e 1519 struct floatbits *_z = (z_out); \
b1a20bee
MW
1520 unsigned _rc = 0; \
1521 \
1522 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1523 \
1524 case FLTFMT_IEEE_F32: { \
1525 unsigned _t[1]; \
1526 unsigned char *_x = (unsigned char *)&(x); \
1527 \
1528 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1529 case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \
1530 case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \
1531 default: assert(!"unimplemented byte order"); break; \
1532 } \
dc6eea4e 1533 _rc |= fltfmt_decieee(&fltfmt_f32, _z, _t); FROBNAN_DEC; \
b1a20bee
MW
1534 } break; \
1535 \
1536 case FLTFMT_IEEE_F64: { \
1537 unsigned _t[2]; \
1538 unsigned char *_x = (unsigned char *)&(x); \
1539 \
1540 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1541 case FLTFMT_BE: \
1542 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1543 break; \
1544 case FLTFMT_LE: \
1545 _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \
1546 break; \
1547 case FLTFMT_ARME: \
1548 _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1549 break; \
1550 default: assert(!"unimplemented byte order"); break; \
1551 } \
dc6eea4e 1552 _rc |= fltfmt_decieee(&fltfmt_f64, _z, _t); FROBNAN_DEC; \
b1a20bee
MW
1553 } break; \
1554 \
1555 case FLTFMT_IEEE_F128: { \
1556 unsigned _t[4]; \
1557 unsigned char *_x = (unsigned char *)&(x); \
1558 \
1559 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1560 case FLTFMT_BE: \
1561 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1562 _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \
1563 break; \
1564 case FLTFMT_LE: \
1565 _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \
1566 _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \
1567 break; \
1568 default: assert(!"unimplemented byte order"); break; \
1569 } \
dc6eea4e 1570 _rc |= fltfmt_decieee(&fltfmt_f128, _z, _t); FROBNAN_DEC; \
b1a20bee
MW
1571 } break; \
1572 \
1573 case FLTFMT_INTEL_F80: { \
1574 unsigned _t[3]; \
1575 unsigned char *_x = (unsigned char *)&(x); \
1576 \
1577 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1578 case FLTFMT_BE: \
1579 _t[0] = LOAD16_B(_x + 0); \
1580 _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \
1581 break; \
1582 case FLTFMT_LE: \
1583 _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1584 _t[0] = LOAD16_L(_x + 8); \
1585 break; \
1586 default: assert(!"unimplemented byte order"); break; \
1587 } \
dc6eea4e 1588 _rc |= fltfmt_decieee(&fltfmt_idblext80, _z, _t); FROBNAN_DEC; \
b1a20bee
MW
1589 } break; \
1590 \
1591 default: { \
b1a20bee
MW
1592 ty _x = (x), _y; \
1593 unsigned _i, _n, _f = 0; \
1594 uint32 _t; \
1595 \
1596 /* If the value looks negative then negate it and set the sign \
1597 * flag. \
1598 */ \
1599 if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \
1600 \
1601 /* Now for the case analysis. Infinities and zero are \
1602 * unproblematic. NaNs can't be decoded exactly using the \
1603 * portable machinery. \
1604 */ \
1605 if (INFP(_x)) _f |= FLTF_INF; \
1606 else if (_x == 0.0) _f |= FLTF_ZERO; \
1607 else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \
1608 else { \
1609 /* A finite value. Determine the number of fraction limbs \
1610 * we'll need based on the precision and radix and pull out \
1611 * 32-bit chunks one at a time. This will be unproblematic \
1612 * for power-of-two radices, requiring at most shifting the \
1613 * significand left by a few bits, but inherently inexact (for \
1614 * the most part) for others. \
1615 */ \
1616 \
1617 _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \
1618 _y = frexp(_x, &_z->exp); \
1619 for (_i = 0; _i < _n; _i++) \
1620 { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \
1621 CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \
1622 } \
1623 \
1624 /* Done. */ \
1625 _z->f = _f; \
1626 } break; \
1627 } \
1628 \
1629 /* Set the error code. */ \
1630 (rc) = _rc; \
1631} while (0)
1632
1633/* --- @fltfmt_decTY@ --- *
1634 *
1635 * Arguments: @struct floatbits *z_out@ = storage for the result
1636 * @ty x@ = value to decode
1637 * @unsigned r@ = rounding mode
1638 *
1639 * Returns: Error flags (@FLTERR_...@).
1640 *
1641 * Use: Decode the native C floatingpoint value @x@ and store the
1642 * result in @z_out@.
1643 *
1644 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1645 * @double@, or (on C99 implementations) @ldbl@ to encode a
1646 * @long double@.
1647 *
1648 * In detail, conversion is performed as follows.
1649 *
1650 * * If the implementation supports negative zeros and/or
1651 * infinity, then these are recognized and decoded.
1652 *
1653 * * If the input as a NaN, but the implementation cannot
1654 * usefully report NaN payloads, then the @FLTERR_INEXACT@
1655 * error bit is set and the decoded payload is left empty.
1656 *
1657 * * If the implementation's floating-point radix is not a
1658 * power of two, and @x@ is a nonzero finite value, then
1659 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1660 * the rounded value (according to the rounding mode @r@) is
1661 * stored in as many fraction words as necessary to identify
1662 * the original value uniquely. If the radix is a power of
1663 * two, then the value is represented exactly.
1664 */
1665
1666unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r)
1667{
1668 unsigned rc;
1669
1670 DECFLT(double, FLT, frexp, rc, z_out, x, r);
1671 return (rc);
1672}
1673
1674unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r)
1675{
1676 unsigned rc;
1677
1678 DECFLT(double, DBL, frexp, rc, z_out, x, r);
1679 return (rc);
1680}
1681
1682#if __STDC_VERSION__ >= 199001
1683unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r)
1684{
1685 unsigned rc;
1686
1687 DECFLT(long double, LDBL, frexpl, rc, z_out, x, r);
1688 return (rc);
1689}
1690#endif
1691
1692/*----- That's all, folks -------------------------------------------------*/