Commit | Line | Data |
---|---|---|
b1a20bee MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Floating-point format conversions | |
4 | * | |
5 | * (c) 2024 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
32 | #include <assert.h> | |
33 | #include <float.h> | |
34 | #include <limits.h> | |
35 | #include <math.h> | |
36 | ||
37 | #include "alloc.h" | |
38 | #include "arena.h" | |
39 | #include "bits.h" | |
40 | #include "fltfmt.h" | |
41 | #include "growbuf.h" | |
b1a20bee MW |
42 | #include "maths.h" |
43 | ||
dc6eea4e MW |
44 | /*----- Preliminary hacking -----------------------------------------------*/ |
45 | ||
46 | /* The native-format conversions are -- at least if the format is | |
47 | * unrecognized -- dependent on the implementation's rounding. Our own | |
48 | * rounding mode specifications don't fit into the framework very well, but I | |
49 | * still want to respect the prevailing rounding mode. | |
50 | * | |
51 | * The `proper' way to do this is with %|#pragma STDC FENV_ACCESS|%. But | |
52 | * that doesn't actually work on GCC, or on Clang from not too long ago. So | |
53 | * use compiler-specific hacking to support this. | |
54 | */ | |
b1a20bee MW |
55 | #if GCC_VERSION_P(4, 4) |
56 | # pragma GCC optimize "-frounding-math" | |
57 | #elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0) | |
58 | # pragma clang optimize "-frounding-math" | |
59 | #elif GCC_VERSION_P(0, 0) || \ | |
60 | (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0)) | |
61 | /* We just lose. */ | |
62 | #elif __STDC_VERSION__ >= 199001 | |
63 | # include <fenv.h> | |
64 | # pragma STDC FENV_ACCESS ON | |
65 | #endif | |
66 | ||
67 | /*----- Some useful constants ---------------------------------------------*/ | |
68 | ||
289651a7 MW |
69 | #define B31 0x80000000u /* just bit 31 */ |
70 | #define B30 0x40000000u /* just bit 30 */ | |
b1a20bee MW |
71 | |
72 | #define SH32 4294967296.0 /* 2^32 as floating-point */ | |
73 | ||
74 | /*----- Useful macros -----------------------------------------------------*/ | |
75 | ||
76 | #define B32(k) ((uint32)1 << (k)) | |
77 | #define M32(k) (B32(k) - 1) | |
78 | ||
79 | #define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO))) | |
80 | ||
81 | /*----- Utility functions -------------------------------------------------*/ | |
82 | ||
83 | /* --- @clz32@ --- * | |
84 | * | |
85 | * Arguments: @uint32 x@ = a 32-bit value | |
86 | * | |
87 | * Returns: The number of leading zeros in @x@, i.e., the nonnegative | |
88 | * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%. | |
89 | * Returns a nonsensical value if %$x = 0$%. | |
90 | */ | |
91 | ||
92 | static unsigned clz32(uint32 x) | |
93 | { | |
94 | unsigned n = 0; | |
95 | ||
96 | /* Divide and conquer. If the top half of the bits are clear, then there | |
97 | * must be at least 16 leading zero bits, so accumulate and shift. Repeat | |
98 | * for smaller powers of two. | |
99 | * | |
100 | * This ends up returning 31 if %$x = 0$%, but don't rely on this. | |
101 | */ | |
102 | if (!(x&0xffff0000)) { x <<= 16; n += 16; } | |
103 | if (!(x&0xff000000)) { x <<= 8; n += 8; } | |
104 | if (!(x&0xf0000000)) { x <<= 4; n += 4; } | |
105 | if (!(x&0xc0000000)) { x <<= 2; n += 2; } | |
106 | if (!(x&0x80000000)) { n += 1; } | |
107 | return (n); | |
108 | } | |
109 | ||
110 | /* --- @ctz32@ --- * | |
111 | * | |
112 | * Arguments: @uint32 x@ = a 32-bit value | |
113 | * | |
114 | * Returns: The number of trailing zeros in @x@, i.e., the nonnegative | |
115 | * integer %$n$% such that %$x/2^n$% is an odd integer. | |
116 | * Returns a nonsensical value if %$x = 0$%. | |
117 | */ | |
118 | ||
119 | static unsigned ctz32(uint32 x) | |
120 | { | |
121 | #ifdef CTZ_TRADITIONAL | |
122 | ||
123 | unsigned n = 0; | |
124 | ||
125 | /* Divide and conquer. If the bottom half of the bits are clear, then | |
126 | * there must be at least 16 trailing zero bits, so accumulate and shift. | |
127 | * Repeat for smaller powers of two. | |
128 | * | |
129 | * This ends up returning 31 if %$x = 0$%, but don't rely on this. | |
130 | */ | |
131 | if (!(x&0x0000ffff)) { x >>= 16; n += 16; } | |
132 | if (!(x&0x000000ff)) { x >>= 8; n += 8; } | |
133 | if (!(x&0x0000000f)) { x >>= 4; n += 4; } | |
134 | if (!(x&0x00000003)) { x >>= 2; n += 2; } | |
135 | if (!(x&0x00000001)) { n += 1; } | |
136 | return (n); | |
137 | ||
138 | #else | |
139 | ||
140 | static unsigned char tab[] = | |
141 | /* | |
142 | ;;; Compute the decoding table for the de Bruijn sequence trick below. | |
143 | ||
144 | (let ((db #x04653adf) | |
145 | (rv (make-vector 32 nil))) | |
146 | (dotimes (i 32) | |
147 | (aset rv (logand (ash db (- i 27)) #x1f) i)) | |
148 | (save-excursion | |
149 | (goto-char (point-min)) | |
150 | (search-forward (concat "***" "BEGIN ctz32tab" "***")) | |
151 | (beginning-of-line 2) | |
152 | (delete-region (point) | |
153 | (progn | |
154 | (search-forward "***END***") | |
155 | (beginning-of-line) | |
156 | (point))) | |
157 | (dotimes (i 32) | |
158 | (cond ((zerop i) (insert " { ")) | |
159 | ((zerop (mod i 16)) (insert ",\n ")) | |
160 | ((zerop (mod i 4)) (insert ", ")) | |
161 | (t (insert ", "))) | |
162 | (insert (format "%2d" (aref rv i)))) | |
163 | (insert " };\n"))) | |
164 | */ | |
165 | ||
166 | /* ***BEGIN ctz32tab*** */ | |
167 | { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26, | |
168 | 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 }; | |
169 | /* ***END*** */ | |
170 | ||
171 | /* Sneaky trick. Two's complement negation (which you effectively get | |
172 | * using C unsigned arithmetic, whether you like it or not) complements all | |
173 | * of the bits of an operand more significant than the least significant | |
174 | * set bit. Therefore, this bit is the only one set in both %$x$% and | |
175 | * %$-x$%, so @x&-x@ will isolate it for us. | |
176 | * | |
177 | * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every | |
178 | * group of five consecutive bits is distinct, including groups which `wrap | |
179 | * around', including some low bits and some high bits. Multiplying this | |
180 | * number by a power of two is equivalent to a left shift; and, because the | |
181 | * top five bits are all zero, the most significant five bits of the | |
182 | * product are the same as if we'd applied a rotation. The result is that | |
183 | * we end up with a distinctive pattern in those bits which perfectly | |
184 | * diagnose each shift from 0 up to 31, which we can decode using a table. | |
185 | * | |
186 | * David Seal described a similar trick -- using the six-bit pattern | |
187 | * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994; | |
188 | * this constant was particularly convenient to multiply by on early ARM | |
189 | * processors. The use of a de Bruijn number is described in Henry | |
190 | * Warren's %%\emph{Hacker's Delight}%%. | |
191 | */ | |
192 | return (tab[((x&-x)*0x04653adf >> 27)&0x1f]); | |
193 | ||
194 | #endif | |
195 | } | |
196 | ||
197 | /* --- @shl@, @shr@ --- * | |
198 | * | |
199 | * Arguments: @uint32 *z@ = destination vector | |
200 | * @const uint32 *x@ = source vector | |
201 | * @size_t sz@ = size of source vector, in elements | |
202 | * @unsigned n@ = number of bits to shift by; must be less than | |
203 | * 32 | |
204 | * | |
205 | * Returns: The bits shifted off the end of the vector. | |
206 | * | |
207 | * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@) | |
208 | * by some number of bit positions. These functions work | |
209 | * correctly if @z@ and @x@ are the same pointer, but not if | |
210 | * they otherwise overlap. | |
211 | */ | |
212 | ||
213 | static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n) | |
214 | { | |
215 | size_t i; | |
216 | uint32 t, u; | |
217 | unsigned r; | |
218 | ||
219 | if (!n) { | |
220 | for (i = 0; i < sz; i++) z[i] = x[i]; | |
221 | t = 0; | |
222 | } else { | |
223 | r = 32 - n; | |
224 | for (t = 0, i = sz; i--; ) | |
225 | { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; } | |
226 | } | |
227 | return (t); | |
228 | } | |
229 | ||
230 | static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n) | |
231 | { | |
232 | size_t i; | |
233 | uint32 t, u; | |
234 | unsigned r; | |
235 | ||
236 | if (!n) { | |
237 | for (i = 0; i < sz; i++) z[i] = x[i]; | |
238 | t = 0; | |
239 | } else { | |
240 | r = 32 - n; | |
241 | for (t = 0, i = 0; i < sz; i++) | |
242 | { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; } | |
243 | } | |
244 | return (t); | |
245 | } | |
246 | ||
247 | /* --- @sigbits@ --- * | |
248 | * | |
249 | * Arguments: @const struct floatbits *x@ = decoded floating-point number | |
250 | * | |
251 | * Returns: The number of significant digits in @x@'s fraction. This | |
252 | * will be zero if @x@ is zero or infinite. | |
253 | */ | |
254 | ||
255 | static unsigned sigbits(const struct floatbits *x) | |
256 | { | |
257 | unsigned i; | |
258 | uint32 w; | |
259 | ||
260 | if (x->f&(FLTF_ZERO | FLTF_INF)) return (0); | |
261 | i = x->n; | |
262 | for (;;) { | |
263 | if (!i) return (0); | |
264 | w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w)); | |
265 | } | |
266 | } | |
267 | ||
268 | /* --- @ms_set_bit@ --- * | |
269 | * | |
270 | * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the | |
271 | * buffer | |
272 | * @unsigned from, to@ = lower (inclusive) and upper (exclusive) | |
273 | * bounds on the region of bits to inspect | |
274 | * | |
275 | * Returns: Index of the most significant set bit, or @ALLCLEAR@. | |
276 | * | |
277 | * Use: For the (rather unusual) purposes of this function, the bits | |
278 | * of the input are numbered from zero, being the least | |
279 | * significant bit of @x[-1]@, upwards through more significant | |
280 | * bits of @x[-1]@, and then through @x[-2]@ and so on. | |
281 | * | |
282 | * If all of the bits in the half-open region are clear then | |
283 | * @ALLCLEAR@ is returned; otherwise, the return value is the | |
284 | * index of the most significant set bit in the region. Note | |
285 | * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the | |
286 | * largest possible value of @to@, and the upper bound is | |
287 | * exclusive, this cannot be the index of a bit in the region. | |
288 | */ | |
289 | ||
290 | #define ALLCLEAR UINT_MAX | |
291 | static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to) | |
292 | { | |
293 | unsigned n0, n, b, base; | |
294 | uint32 m, w; | |
295 | ||
296 | /* <--- increasing indices <--- | |
297 | * | |
298 | * ---+-------+-------+-------+-------+-------+--- | |
299 | * ...S |///| | | | | |//| S... | |
300 | * ---+-------+-------+-------+-------+-------+--- | |
301 | */ | |
302 | ||
303 | /* If the region is empty then it's technically true that all of the bits | |
304 | * are zero. It's important to be able to do answer the case where | |
305 | * %$\id{from} = \id{to} = 0$% without accessing memory. | |
306 | */ | |
307 | assert(to >= from); if (to == from) return (ALLCLEAR); | |
308 | ||
309 | /* This is distressingly complicated. Here's how it's going to work. | |
310 | * | |
311 | * There's at least one bit to check, or we'd have returned already -- | |
312 | * specifically, we must check the bit with index @from@. But that's at | |
313 | * the wrong end. Instead, we start at the most significant end, with the | |
314 | * word containing the bit one short of the @to@ position. Even though | |
315 | * it's actually one off, because we use a half-open interval, we'll call | |
316 | * that the `@to@ bit'. | |
317 | * | |
318 | * We start by loading the word containing the @to@ bit, and start @base@ | |
319 | * off as the bit index of the least significant bit of this word. We mask | |
320 | * off the high bits (if any), leaving only the @to@ bit and the less | |
321 | * significant ones. We %%\emph{don't}%% check the remaining bits yet. | |
322 | * | |
323 | * We then start an offset loop. In each iteration, we check the word | |
324 | * we're currently holding: if it's not zero, then we return @base@ plus | |
325 | * the position of the most-significant set bit, using @clz32@. Otherwise, | |
326 | * we load the next (less significant) word, and drop @base@ by 32, but | |
327 | * don't check it yet. We end this loop when the word we're holding | |
328 | * contains the @from@ bit. It's possible that we didn't do any iterations | |
329 | * of the loop, in which case we're still holding the word containing the | |
330 | * @to@ bit at this point. | |
331 | * | |
332 | * Finally, we mask off the bits below the @from@ bit, and check that what | |
333 | * we have left is zero. If it isn't, we return @base@ plus the position | |
334 | * of the most significant bit; if it is, we return @ALLCEAR@. | |
335 | */ | |
336 | ||
337 | /* The first job is to find the word containing the @to@ bit and mask off | |
338 | * any higher bits that we don't care about. | |
339 | * | |
340 | * Recall that the bit's index is @to - 1@, but this must be a valid index | |
341 | * because there is at least one bit in the region. But we start out | |
342 | * pointing beyond the vector, so we must add an extra 32 bits. | |
343 | */ | |
344 | n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++; | |
345 | b = to%32; if (b) w &= M32(b); | |
346 | ||
347 | /* Before we start the loop, it'd be useful to know how many iterations we | |
348 | * need. This is going to be the offset from the word containing the @to@ | |
349 | * bit to the word containing the @from@ bit. Again, we're off by one | |
350 | * because that's how our initial indexing is set up. | |
351 | */ | |
352 | n = n0 - from/32 - 1; | |
353 | ||
354 | /* Now it's time to do the loop. This is the easy bit. */ | |
355 | while (n--) { | |
356 | if (w) return (base + 31 - clz32(w)); | |
357 | w = *x++&MASK32; base -= 32; | |
358 | } | |
359 | ||
360 | /* We're now holding the final word -- the one containing the @from@ bit. | |
361 | * We need to mask off any low bits that we don't care about. | |
362 | */ | |
363 | m = M32(from%32); w &= MASK32&~m; | |
364 | ||
365 | /* Final check. */ | |
366 | if (w) return (base + 31 - clz32(w)); | |
367 | else return (ALLCLEAR); | |
368 | } | |
369 | ||
370 | /*----- General floating-point hacking ------------------------------------*/ | |
371 | ||
372 | /* --- @fltfmt_initbits@ --- * | |
373 | * | |
374 | * Arguments: @struct floatbits *x@ = pointer to structure to initialize | |
375 | * | |
376 | * Returns: --- | |
377 | * | |
378 | * Use: Dynamically initialize @x@ to (positive) zero so that it can | |
379 | * be used as the destination operand by other operations. This | |
380 | * doesn't allocate resources and cannot fail. The | |
381 | * @FLOATBITS_INIT@ macro is a suitable static initializer for | |
382 | * performing the same task. | |
383 | */ | |
384 | ||
385 | void fltfmt_initbits(struct floatbits *x) | |
386 | { | |
387 | x->f = FLTF_ZERO; | |
388 | x->a = arena_global; | |
389 | x->frac = 0; x->n = x->fracsz = 0; | |
390 | } | |
391 | ||
392 | /* --- @fltfmt_freebits@ --- * | |
393 | * | |
394 | * Arguments: @struct floatbits *x@ = pointer to structure to free | |
395 | * | |
396 | * Returns: --- | |
397 | * | |
398 | * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid | |
399 | * (positive) zero, but can safely be discarded. | |
400 | */ | |
401 | ||
402 | void fltfmt_freebits(struct floatbits *x) | |
403 | { | |
404 | if (x->frac) x_free(x->a, x->frac); | |
405 | x->f = FLTF_ZERO; | |
406 | x->frac = 0; x->n = x->fracsz = 0; | |
407 | } | |
408 | ||
409 | /* --- @fltfmt_allocfrac@ --- * | |
410 | * | |
411 | * Arguments: @struct floatbits *x@ = structure to adjust | |
412 | * @unsigned n@ = number of words required | |
413 | * | |
414 | * Returns: --- | |
415 | * | |
416 | * Use: Reallocate the @frac@ vector so that it has space for at | |
417 | * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the | |
418 | * current size is already @n@ or greater, then just update the | |
419 | * active length @n@ and return; otherwise, any existing vector | |
420 | * is discarded and a fresh, larger one allocated. | |
421 | */ | |
422 | ||
423 | void fltfmt_allocfrac(struct floatbits *x, unsigned n) | |
424 | { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; } | |
425 | ||
426 | /* --- @fltfmt_copybits@ --- * | |
427 | * | |
428 | * Arguments: @struct floatbits *z_out@ = where to leave the result | |
429 | * @const struct floatbits *x@ = source to copy | |
430 | * | |
431 | * Returns: --- | |
432 | * | |
433 | * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object | |
434 | * as @x@ then do nothing. | |
435 | */ | |
436 | ||
437 | void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x) | |
438 | { | |
439 | unsigned i; | |
440 | ||
441 | if (z_out == x) return; | |
442 | z_out->f = x->f; | |
443 | if (!FINITEP(x)) z_out->exp = 0; | |
444 | else z_out->exp = x->exp; | |
445 | if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n) | |
446 | { z_out->n = 0; z_out->frac = 0; } | |
447 | else { | |
448 | fltfmt_allocfrac(z_out, x->n); | |
449 | for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i]; | |
450 | } | |
451 | } | |
452 | ||
453 | /* --- @fltfmt_round@ --- * | |
454 | * | |
455 | * Arguments: @struct floatbits *z_out@ = destination (may equal source) | |
456 | * @const struct floatbits *x@ = source | |
457 | * @unsigned r@ = rounding mode (@FLTRND_...@ code) | |
458 | * @unsigned n@ = nonzero number of bits to leave | |
459 | * | |
460 | * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if | |
461 | * rounding discarded some nonzero value bits, or @FLTERR_OK@ if | |
462 | * rounding was unnecessary. | |
463 | * | |
464 | * Use: Rounds a floating-point value to a given number of | |
465 | * significant bits, using the given rounding rule. | |
466 | */ | |
467 | ||
468 | unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x, | |
469 | unsigned r, unsigned n) | |
470 | { | |
471 | unsigned rf, i, uw, ub, hw, hb, rc = 0; | |
472 | uint32 um, hm, w; | |
473 | int exp; | |
474 | ||
475 | /* Check that this is likely to work. We must have at least one bit | |
476 | * remaining, so that we can inspect the last-place unit bit. And we | |
477 | * mustn't round up if the current value is already exact, because that | |
478 | * would be nonsensical (and inconvenient). | |
479 | */ | |
480 | assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF))); | |
481 | ||
482 | /* Eliminate trivial cases. There's nothing to do if the value is infinite | |
483 | * or zero, or if we don't have enough precision already. | |
484 | * | |
485 | * The caller will have set the rounding mode and length suitably for a | |
486 | * NaN. | |
487 | */ | |
488 | if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n) | |
489 | { fltfmt_copybits(z_out, x); return (FLTERR_OK); } | |
490 | ||
491 | /* Determine various indices. | |
492 | * | |
493 | * The quantities @uw@ and @ub@ are the word and bit number which will hold | |
494 | * the unit bit when we've finished; @hw@ and @hb@ similarly index the | |
495 | * `half' bit, which is the next less significant bit. | |
496 | */ | |
497 | uw = (n - 1)/32; ub = -n&31; | |
498 | if (!ub) { hw = uw + 1; hb = 31; } | |
499 | else { hw = uw; hb = ub - 1; } | |
500 | ||
501 | /* Determine the necessary predicates for the rounding decision. */ | |
502 | rf = 0; | |
503 | if (x->f&FLTF_NEG) rf |= FRPF_NEG; | |
504 | um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD; | |
505 | hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF; | |
506 | if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW; | |
507 | for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW; | |
508 | if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT; | |
509 | ||
510 | /* Allocate space for the result. */ | |
511 | fltfmt_allocfrac(z_out, uw + 1); | |
512 | ||
513 | /* We start looking at the least significant word of the result. Clear the | |
514 | * low bits. | |
515 | */ | |
516 | i = uw; exp = x->exp; w = x->frac[i]&~(um - 1); | |
517 | ||
518 | /* If the rounding function is true then we need to add one to the | |
519 | * truncated fraction and propagate carries. | |
520 | */ | |
521 | if ((r >> rf)&1) { | |
522 | w = (w + um)&MASK32; | |
523 | while (i && !w) { | |
524 | z_out->frac[i] = w; | |
525 | w = (x->frac[--i] + 1)&MASK32; | |
526 | } | |
527 | if (!w) { w = B31; exp++; } | |
528 | } | |
529 | ||
530 | /* Store, and copy the remaining words. */ | |
531 | for (;;) { | |
532 | z_out->frac[i] = w; | |
533 | if (!i) break; | |
534 | w = x->frac[--i]; | |
535 | } | |
536 | ||
537 | /* Done. */ | |
538 | z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK); | |
539 | if (x->f&FLTF_NANMASK) z_out->exp = 0; | |
540 | else z_out->exp = exp; | |
541 | return (rc); | |
542 | } | |
543 | ||
dc6eea4e | 544 | /*----- IEEE and related formats ------------------------------------------*/ |
b1a20bee MW |
545 | |
546 | /* IEEE (and related) format descriptions. */ | |
547 | const struct fltfmt_ieeefmt | |
c752173d MW |
548 | fltfmt_mini = { FLTIF_HIDDEN, 4, 4 }, |
549 | fltfmt_bf16 = { FLTIF_HIDDEN, 8, 8 }, | |
550 | fltfmt_f16 = { FLTIF_HIDDEN, 5, 11 }, | |
551 | fltfmt_f32 = { FLTIF_HIDDEN, 8, 24 }, | |
552 | fltfmt_f64 = { FLTIF_HIDDEN, 11, 53 }, | |
553 | fltfmt_f128 = { FLTIF_HIDDEN, 15, 113 }, | |
b1a20bee MW |
554 | fltfmt_idblext80 = { 0, 15, 64 }; |
555 | ||
556 | /* --- @fltfmt_encieee@ --- | |
557 | * | |
558 | * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description | |
559 | * @uint32 *z@ = output vector | |
560 | * @const struct floatbits *x@ = value to encode | |
561 | * @unsigned r@ = rounding mode | |
562 | * @unsigned errmask@ = error mask | |
563 | * | |
564 | * Returns: Error flags (@FLTERR_...@). | |
565 | * | |
566 | * Use: Encode a floating-point value in an IEEE format. This is the | |
567 | * machinery shared by the @fltfmt_enc...@ functions for | |
568 | * encoding IEEE-format values. Most of the arguments and | |
569 | * behaviour are as described for those functions. | |
570 | * | |
571 | * The encoded value is right-aligned and big-endian; i.e., the | |
572 | * sign bit ends up in @z[0]@, and the least significant bit of | |
573 | * the significand ends up in the least significant bit of | |
574 | * @z[n - 1]@. | |
575 | */ | |
576 | ||
577 | unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt, | |
578 | uint32 *z, const struct floatbits *x, | |
579 | unsigned r, unsigned errmask) | |
580 | { | |
581 | struct floatbits y; | |
582 | unsigned sigwd, fracwd, err = 0, f = x->f, rf; | |
583 | unsigned i, j, n, nb, nw, mb, mw, esh, sh; | |
584 | int exp, minexp, maxexp; | |
585 | uint32 z0, t; | |
586 | ||
587 | #define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0) | |
588 | ||
589 | /* The following code assumes that the sign, biased exponent, unit, and | |
590 | * quiet/signalling bits can all fit into the most significant 32 bits of | |
591 | * the result. | |
592 | */ | |
593 | assert(fmt->expwd + 3 <= 32); | |
594 | esh = 31 - fmt->expwd; | |
595 | ||
596 | /* Determine the output size. */ | |
597 | nb = fmt->prec + fmt->expwd + 1; | |
c752173d | 598 | if (fmt->f&FLTIF_HIDDEN) nb--; |
b1a20bee MW |
599 | nw = (nb + 31)/32; |
600 | ||
601 | /* Determine the top bits. */ | |
602 | z0 = 0; i = 0; | |
603 | if (x->f&FLTF_NEG) z0 |= B31; | |
604 | ||
605 | /* And now for the main case analysis. */ | |
606 | ||
607 | if (f&FLTF_ZERO) { | |
608 | /* Zero. There's very little to do here. */ | |
609 | ||
610 | } else if (f&FLTF_INF) { | |
611 | /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit | |
612 | * bit. | |
613 | */ | |
614 | ||
615 | z0 |= M32(fmt->expwd) << esh; | |
c752173d | 616 | if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1); |
b1a20bee MW |
617 | |
618 | } else if (f&FLTF_NANMASK) { | |
619 | /* Not-a-number. | |
620 | * | |
621 | * We must check that we won't lose significant bits. We need a bit for | |
622 | * the quiet/signalling flag, and enough space for the significant | |
623 | * payload bits. The unit bit is not in play here, so the available | |
624 | * space is always one less than the advertised precision. To put it | |
625 | * another way, we need space for the payload, a bit for the | |
626 | * quiet/signalling flag, and a bit for the unit place. | |
627 | */ | |
628 | ||
629 | fracwd = sigbits(x); | |
630 | if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT); | |
631 | ||
632 | /* Copy the payload. | |
633 | * | |
634 | * If the payload is all-zero and we're meant to set a signalling NaN | |
dc6eea4e | 635 | * then report an exactness failure and set the least-significant bit. |
b1a20bee MW |
636 | */ |
637 | mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32; | |
dc6eea4e MW |
638 | n = x->n; |
639 | if (n < mw) j = 0; | |
640 | else { n = mw; j = sh; } | |
641 | if ((f&FLTF_SNAN) && ms_set_bit(x->frac + n, j, 32*n) == ALLCLEAR) { | |
642 | ERR(FLTERR_INEXACT); | |
643 | n = nw - 1; for (i = 0; i < n; i++) z[i] = 0; | |
644 | z[i++] = 1; | |
645 | } else { | |
646 | for (i = 0; i < nw - mw; i++) z[i] = 0; | |
647 | n = x->n; if (n > mw) n = mw; | |
648 | t = shr(z + i, x->frac, n, sh); i += n; | |
649 | if (i < nw) z[i++] = t; | |
650 | sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++; | |
651 | if (f&FLTF_QNAN) z0 |= B32(sh); | |
652 | } | |
b1a20bee MW |
653 | |
654 | /* Set the exponent and, for non-hidden-bit formats, the unit bit. */ | |
655 | z0 |= M32(fmt->expwd) << esh; | |
c752173d | 656 | if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1); |
b1a20bee MW |
657 | |
658 | } else { | |
659 | /* A finite value. | |
660 | * | |
661 | * Adjust the exponent by one place to compensate for the difference in | |
662 | * significant conventions. Our significand lies between zero (in fact, | |
663 | * a half, because we require normalization) and one, while an IEEE | |
664 | * significand lies between zero (in fact, one) and two. Our exponent is | |
665 | * therefore one larger than the IEEE exponent will be. | |
666 | */ | |
667 | ||
668 | /* Determine the maximum true (unbiased) exponent. As noted above, this | |
669 | * is also the bias. | |
670 | */ | |
671 | exp = x->exp - 1; | |
672 | maxexp = (1 << (fmt->expwd - 1)) - 1; | |
673 | minexp = 1 - maxexp; | |
674 | ||
675 | if (exp <= minexp - (int)fmt->prec) { | |
676 | /* If the exponent is very small then we underflow. We have %$p - 1$% | |
677 | * bits available to represent a subnormal significand, and therefore | |
678 | * can represent at least one bit of a value as small as | |
679 | * %$2^{e_{\text{min}}-p+1}$%. | |
680 | * | |
681 | * If the exponent is one short of the threshold, then we check to see | |
682 | * whether the value will round up. | |
683 | */ | |
684 | ||
685 | if ((minexp - exp == fmt->prec) && | |
686 | ((r >> (FRPF_HALF | | |
687 | (sigbits(x) > 1 ? FRPF_LOW : 0) | | |
688 | (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) { | |
689 | ERR(FLTERR_INEXACT); | |
690 | for (i = 0; i < nw - 1; i++) z[i] = 0; | |
691 | z[i++] = 1; | |
692 | } else { | |
693 | ERR(FLTERR_UFLOW | FLTERR_INEXACT); | |
694 | /* Return (signed) zero. */ | |
695 | } | |
696 | ||
697 | } else { | |
698 | /* We can at least try to store some bits. */ | |
699 | ||
700 | /* Let's see how many we need to deal with and how much space we have. | |
701 | * We might as well set the biased exponent here while we're at it. | |
702 | * | |
703 | * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of | |
704 | * significand. Otherwise, we must make a subnormal and we can only | |
705 | * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le | |
706 | * e_{\text{min}} - p$% then we can store zero bits or fewer and have | |
707 | * underflowed to zero, which matches the previous case.) In the | |
708 | * subnormal case, we also `correct' the exponent so that we store the | |
709 | * correct sentinel value later. | |
710 | */ | |
711 | fracwd = sigbits(x); | |
712 | if (exp >= minexp) sigwd = fmt->prec; | |
713 | else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; } | |
714 | mw = (sigwd + 31)/32; sh = -sigwd%32; | |
715 | ||
716 | /* If we don't have enough significand bits then we must round. This | |
717 | * might increase the exponent, so we must reload. | |
718 | */ | |
719 | if (fracwd > sigwd) { | |
720 | ERR(FLTERR_INEXACT); | |
721 | y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd); | |
722 | x = &y; exp = y.exp - 1; fracwd = sigwd; | |
723 | } | |
724 | ||
725 | if (exp > maxexp) { | |
726 | /* If the exponent is too large, then we overflow. If the error is | |
727 | * masked, then we must produce a default value, choosing between | |
728 | * infinity and the largest representable finite value according to | |
729 | * the rounding mode. | |
730 | */ | |
731 | ||
c752173d | 732 | ERR(FLTERR_OFLOW | FLTERR_INEXACT); |
b1a20bee MW |
733 | rf = FRPF_ODD | FRPF_HALF | FRPF_LOW; |
734 | if (f&FLTF_NEG) rf |= FRPF_NEG; | |
c752173d | 735 | if ((r >> rf)&1) |
b1a20bee | 736 | z0 |= M32(fmt->expwd) << esh; |
c752173d | 737 | else { |
b1a20bee | 738 | z0 |= (B32(fmt->expwd) - 2) << esh; |
c752173d | 739 | mb = fmt->prec; if (fmt->f&FLTIF_HIDDEN) mb--; |
b1a20bee MW |
740 | mw = (mb + 31)/32; |
741 | i = nw - mw; | |
742 | z[i++] = M32(mb%32); | |
743 | while (i < nw) z[i++] = MASK32; | |
744 | } | |
745 | ||
746 | } else { | |
747 | /* The exponent is in range. Everything is ready. */ | |
748 | ||
749 | /* Store the significand. */ | |
750 | n = (fracwd + 31)/32; i = nw - mw; | |
751 | t = shr(z + i, x->frac, n, sh); i += n; | |
752 | if (i < nw) z[i++] = t; | |
753 | ||
754 | /* Fill in the top end. */ | |
755 | for (j = nw - mw; j--; ) z[j] = 0; | |
756 | ||
757 | /* Set the biased exponent. */ | |
758 | z0 |= (exp + maxexp) << esh; | |
759 | ||
760 | /* Clear the unit bit if we're suppose to use a hidden-bit convention. */ | |
c752173d | 761 | if (fmt->f&FLTIF_HIDDEN) { |
b1a20bee MW |
762 | mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32; |
763 | z[nw - mw] &= ~B32(mb); | |
764 | } | |
765 | } | |
766 | } | |
767 | } | |
768 | ||
769 | /* Clear the significand bits that we haven't set explicitly yet. */ | |
770 | while (i < nw) z[i++] = 0; | |
771 | ||
772 | /* All that remains is to insert the top bits @z0@ in the right place. | |
773 | * This will set the exponent, and the unit and quiet bits. | |
774 | */ | |
775 | sh = -nb%32; | |
776 | z[0] |= z0 >> sh; | |
777 | if (sh && nb >= 32) z[1] |= z0 << (32 - sh); | |
778 | ||
779 | end: | |
780 | return (err); | |
781 | ||
782 | #undef ERR | |
783 | } | |
784 | ||
785 | /* --- @fltfmt_encTY@ --- * | |
786 | * | |
787 | * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@, | |
788 | * @kludge64 *z_out@ = where to put the encoded value | |
789 | * @uint16 *se_out@, @kludge64 *m_out@ = where to put the | |
790 | * encoded sign-and-exponent and significand | |
791 | * @const struct floatbits *x@ = value to encode | |
792 | * @unsigned r@ = rounding mode | |
793 | * @unsigned errmask@ = error mask | |
794 | * | |
795 | * Returns: Error flags (@FLTERR_...@). | |
796 | * | |
797 | * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent) | |
798 | * format. | |
799 | * | |
800 | * If an error is encountered during the encoding, and the | |
801 | * corresponding bit of @errmask@ is clear, then processing | |
802 | * stops immediately and the error is returned; if the bit is | |
803 | * set, then processing continues as described below. | |
804 | * | |
805 | * The @TY@ may be | |
806 | * | |
807 | * * @mini@ for the 8-bit `1.4.3 minifloat' format, with | |
808 | * four-bit exponent and four-bit significand, represented | |
809 | * as a single octet; | |
810 | * | |
811 | * * @bf16@ for the Google `bfloat16' format, with eight-bit | |
812 | * exponent and eight-bit significand, represented as a | |
813 | * @uint16@; | |
814 | * | |
815 | * * @f16@ for the IEEE `binary16' format, with five-bit | |
816 | * exponent and eleven-bit significand, represented as a | |
817 | * @uint16@; | |
818 | * | |
819 | * * @f32@ for the IEEE `binary32' format, with eight-bit | |
820 | * exponent and 24-bit significand, represented as a | |
821 | * @uint32@; | |
822 | * | |
823 | * * @f64@ for the IEEE `binary64' format, with eleven-bit | |
824 | * exponent and 53-bit significand, represented as a | |
825 | * @kludge64@; | |
826 | * | |
827 | * * @f128@ for the IEEE `binary128' format, with fifteen-bit | |
828 | * exponent and 113-bit significand, represented as four | |
829 | * @uint32@ limbs, most significant first; or | |
830 | * | |
831 | * * @idblext80@ for the Intel 80-bit `double extended' | |
832 | * format, with fifteen-bit exponent and 64-bit significand | |
833 | * with no hidden bit, represented as a @uint16 se@ | |
834 | * holding the sign and exponent, and a @kludge64 m@ | |
835 | * holding the significand. | |
836 | * | |
837 | * Positive and negative zero and infinity are representable | |
838 | * exactly. | |
839 | * | |
840 | * Following IEEE recommendations (and most implementations), | |
841 | * the most significant fraction bit of a quiet NaN is set; this | |
842 | * bit is clear in a signalling NaN. The most significant | |
843 | * payload bits of a NaN, held in the top bits of @x->frac[0]@, | |
844 | * are encoded in the output significand following the `quiet' | |
845 | * bit. If the chosen format's significand field is too small | |
846 | * to accommodate all of the set payload bits then the | |
847 | * @FLTERR_INEXACT@ error bit is set and, if masked, the | |
848 | * excess payload bits are discarded. No rounding of NaN | |
849 | * payloads is performed. | |
850 | * | |
851 | * Otherwise, the input value is finite and nonzero. If the | |
852 | * significand cannot be represented exactly then the | |
853 | * @FLTERR_INEXACT@ error bit is set, and, if masked, the value | |
854 | * will be rounded (internally -- the input @x@ is not changed). | |
855 | * If the (rounded) value's exponent is too large to represent, | |
856 | * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are | |
857 | * set and, if masked, the result is either the (absolute) | |
858 | * largest representable finite value or infinity, with the | |
859 | * appropriate sign, chosen according to the rounding mode. If | |
860 | * the exponent is too small to represent, then the | |
861 | * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and, | |
862 | * if masked, the result is either the (absolute) smallest | |
863 | * nonzero value or zero, with the appropriate sign, chosen | |
864 | * according to the rounding mode. | |
865 | */ | |
866 | ||
867 | unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x, | |
868 | unsigned r, unsigned errmask) | |
869 | { | |
870 | uint32 t[1]; | |
871 | unsigned rc; | |
872 | ||
873 | rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask); | |
874 | if (!(rc&~errmask)) *z_out = t[0]; | |
875 | return (rc); | |
876 | } | |
877 | ||
878 | unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x, | |
879 | unsigned r, unsigned errmask) | |
880 | { | |
881 | uint32 t[1]; | |
882 | unsigned rc; | |
883 | ||
884 | rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask); | |
885 | if (!(rc&~errmask)) *z_out = t[0]; | |
886 | return (rc); | |
887 | } | |
888 | ||
889 | unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x, | |
890 | unsigned r, unsigned errmask) | |
891 | { | |
892 | uint32 t[1]; | |
893 | unsigned rc; | |
894 | ||
895 | rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask); | |
896 | if (!(rc&~errmask)) *z_out = t[0]; | |
897 | return (rc); | |
898 | } | |
899 | ||
900 | unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x, | |
901 | unsigned r, unsigned errmask) | |
902 | { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); } | |
903 | ||
904 | unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x, | |
905 | unsigned r, unsigned errmask) | |
906 | { | |
907 | uint32 t[2]; | |
908 | unsigned rc; | |
909 | ||
910 | rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask); | |
911 | if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]); | |
912 | return (rc); | |
913 | } | |
914 | ||
915 | unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x, | |
916 | unsigned r, unsigned errmask) | |
917 | { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); } | |
918 | ||
919 | unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out, | |
920 | const struct floatbits *x, | |
921 | unsigned r, unsigned errmask) | |
922 | { | |
923 | uint32 t[3]; | |
924 | unsigned rc; | |
925 | ||
926 | rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask); | |
927 | if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); } | |
928 | return (rc); | |
929 | } | |
930 | ||
931 | /* --- @fltfmt_decieee@ --- * | |
932 | * | |
933 | * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description | |
934 | * @struct floatbits *z_out@ = output decoded representation | |
935 | * @const uint32 *x@ = input encoding | |
936 | * | |
937 | * Returns: Error flags (@FLTERR_...@). | |
938 | * | |
939 | * Use: Decode a floating-point value in an IEEE format. This is the | |
940 | * machinery shared by the @fltfmt_dec...@ functions for | |
941 | * deccoding IEEE-format values. Most of the arguments and | |
942 | * behaviour are as described for those functions. | |
943 | * | |
944 | * The encoded value should be right-aligned and big-endian; | |
945 | * i.e., the sign bit ends up in @z[0]@, and the least | |
946 | * significant bit of the significand ends up in the least | |
947 | * significant bit of @z[n - 1]@. | |
948 | */ | |
949 | ||
950 | unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt, | |
951 | struct floatbits *z_out, const uint32 *x) | |
952 | { | |
953 | unsigned sigwd, err = 0, f = 0; | |
954 | unsigned i, nb, nw, mw, esh, sh; | |
955 | int exp, minexp, maxexp; | |
956 | uint32 x0, t, u, emask; | |
957 | ||
958 | /* The following code assumes that the sign, biased exponent, unit, and | |
959 | * quiet/signalling bits can all fit into the most significant 32 bits of | |
960 | * the result. | |
961 | */ | |
962 | assert(fmt->expwd + 3 <= 32); | |
963 | esh = 31 - fmt->expwd; emask = M32(fmt->expwd); | |
c752173d | 964 | sigwd = fmt->prec; if (fmt->f&FLTIF_HIDDEN) sigwd--; |
b1a20bee MW |
965 | |
966 | /* Determine the input size. */ | |
967 | nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32; | |
968 | ||
969 | /* Extract the sign, exponent, and top of the significand. */ | |
970 | sh = -nb%32; | |
971 | x0 = x[0] << sh; | |
972 | if (sh && nb >= 32) x0 |= x[1] >> (32 - sh); | |
973 | if (x0&B31) f |= FLTF_NEG; | |
974 | t = (x0 >> esh)&emask; | |
975 | ||
976 | /* Time for a case analysis. */ | |
977 | ||
978 | if (t == emask) { | |
979 | /* Infinity or NaN. | |
980 | * | |
981 | * Note that we don't include the quiet bit in our decoded payload. | |
982 | */ | |
983 | ||
c752173d | 984 | if (!(fmt->f&FLTIF_HIDDEN)) { |
b1a20bee MW |
985 | /* No hidden bit, so we expect the unit bit to be set. If it isn't, |
986 | * that's technically invalid, and its absence won't survive a round | |
987 | * trip, since the bit isn't considered part of a NaN payload -- or | |
988 | * even to distinguish a NaN from an infinity. In any event, reduce | |
989 | * the notional significand size to exclude this bit from further | |
990 | * consideration. | |
991 | */ | |
992 | ||
993 | if (!(x0&B32(esh - 1))) err = FLTERR_INVAL; | |
994 | sigwd--; | |
995 | } | |
996 | ||
997 | if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR) | |
998 | f |= FLTF_INF; | |
999 | else { | |
c752173d | 1000 | sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++; |
b1a20bee MW |
1001 | if (x0&B32(sh)) f |= FLTF_QNAN; |
1002 | else f |= FLTF_SNAN; | |
1003 | sigwd--; mw = (sigwd + 31)/32; | |
1004 | fltfmt_allocfrac(z_out, mw); | |
1005 | shl(z_out->frac, x + nw - mw, mw, -sigwd%32); | |
1006 | } | |
1007 | goto end; | |
1008 | } | |
1009 | ||
1010 | /* Determine the exponent bounds. */ | |
1011 | maxexp = (1 << (fmt->expwd - 1)) - 1; | |
1012 | minexp = 1 - maxexp; | |
1013 | ||
1014 | /* Dispatch. If there's a hidden bit then everything is well defined. | |
1015 | * Otherwise, we'll normalize the incoming value regardless, but report | |
1016 | * settings of the unit bit which are inconsistent with the exponent. | |
1017 | */ | |
c752173d | 1018 | if (fmt->f&FLTIF_HIDDEN) { |
b1a20bee MW |
1019 | if (!t) { exp = minexp; goto normalize; } |
1020 | else { exp = t - maxexp; goto hidden; } | |
1021 | } else { | |
1022 | u = x0&B32(esh - 1); | |
1023 | if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; } | |
1024 | else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; } | |
1025 | goto normalize; | |
1026 | } | |
1027 | ||
1028 | hidden: | |
1029 | /* We have a normal real number with a hidden bit. */ | |
1030 | ||
1031 | mw = (sigwd + 31)/32; | |
1032 | ||
1033 | if (!(sigwd%32)) { | |
1034 | /* The bits we have occupy a whole number of words, but we need to shift | |
1035 | * to make space for the unit bit. | |
1036 | */ | |
1037 | ||
1038 | fltfmt_allocfrac(z_out, mw + 1); | |
1039 | z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1); | |
1040 | } else { | |
1041 | fltfmt_allocfrac(z_out, mw); | |
1042 | shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32); | |
1043 | } | |
1044 | z_out->frac[0] |= B31; | |
1045 | z_out->exp = exp + 1; | |
1046 | goto end; | |
1047 | ||
1048 | normalize: | |
1049 | /* We have, at least potentially, a subnormal number, with no hidden | |
1050 | * bit. | |
1051 | */ | |
1052 | ||
1053 | i = ms_set_bit(x + nw, 0, sigwd); | |
1054 | if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; } | |
1055 | mw = i/32 + 1; sh = 32*mw - i - 1; | |
1056 | fltfmt_allocfrac(z_out, mw); | |
1057 | shl(z_out->frac, x + nw - mw, mw, sh); | |
1058 | z_out->exp = exp - fmt->prec + 2 + i; | |
1059 | goto end; | |
1060 | ||
1061 | end: | |
1062 | /* All done. */ | |
1063 | z_out->f = f; return (err); | |
1064 | } | |
1065 | ||
1066 | /* --- @fltfmt_decTY@ --- * | |
1067 | * | |
1068 | * Arguments: @const struct floatbits *z_out@ = storage for the result | |
1069 | * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ = | |
1070 | * encoded input | |
1071 | * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and | |
1072 | * significand | |
1073 | * | |
1074 | * Returns: Error flags (@FLTERR_...@). | |
1075 | * | |
1076 | * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent) | |
1077 | * format. | |
1078 | * | |
1079 | * The options for @TY@ are as documented for the encoding | |
1080 | * functions above. | |
1081 | * | |
1082 | * In formats without a hidden bit -- currently only @idblext80@ | |
1083 | * -- not all bit patterns are valid encodings. If the explicit | |
1084 | * unit bit is set when the exponent field is all-bits-zero, or | |
1085 | * clear when the exponent field is not all-bits-zero, then the | |
1086 | * @FLTERR_INVAL@ error bit is set. If the exponent is all- | |
1087 | * bits-set, denoting infinity or a NaN, then the unit bit is | |
1088 | * otherwise ignored -- in particular, it does not affect the | |
1089 | * NaN payload, or even whether the input encodes a NaN or | |
1090 | * infinity. Otherwise, the unit bit is considered significant, | |
1091 | * and the result is normalized as one would expect. | |
1092 | * Consequently, biased exponent values 0 and 1 are distinct | |
1093 | * only with respect to which bit patterns are considered valid, | |
1094 | * and not with respect to the set of values denoted. | |
1095 | */ | |
1096 | ||
1097 | unsigned fltfmt_decmini(struct floatbits *z_out, octet x) | |
1098 | { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); } | |
1099 | ||
1100 | unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x) | |
1101 | { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); } | |
1102 | ||
1103 | unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x) | |
1104 | { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); } | |
1105 | ||
1106 | unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x) | |
1107 | { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); } | |
1108 | ||
1109 | unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x) | |
1110 | { | |
1111 | uint32 t[2]; | |
1112 | ||
1113 | t[0] = HI64(x); t[1] = LO64(x); | |
1114 | return (fltfmt_decieee(&fltfmt_f64, z_out, t)); | |
1115 | } | |
1116 | ||
1117 | unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x) | |
1118 | { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); } | |
1119 | ||
1120 | unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m) | |
1121 | { | |
1122 | uint32 t[3]; | |
1123 | ||
1124 | t[0] = se; t[1] = HI64(m); t[2] = LO64(m); | |
1125 | return (fltfmt_decieee(&fltfmt_idblext80, z_out, t)); | |
1126 | } | |
1127 | ||
1128 | /*----- Native formats ----------------------------------------------------*/ | |
1129 | ||
1130 | /* If the floating-point radix is a power of two, determine how many bits | |
1131 | * there are in each digit. This isn't exhaustive, but it covers most of the | |
1132 | * bases, so to speak. | |
1133 | */ | |
1134 | #if FLT_RADIX == 2 | |
1135 | # define DIGIT_BITS 1 | |
1136 | #elif FLT_RADIX == 4 | |
1137 | # define DIGIT_BITS 2 | |
1138 | #elif FLT_RADIX == 8 | |
1139 | # define DIGIT_BITS 3 | |
1140 | #elif FLT_RADIX == 16 | |
1141 | # define DIGIT_BITS 4 | |
1142 | #endif | |
1143 | ||
dc6eea4e MW |
1144 | /* Take note if we need to cope with the revered quiet/signalling convention |
1145 | * used by HP-PA and older MIPS processors. | |
1146 | */ | |
1147 | #if defined(__hppa__) || (defined(__mips__) && !defined(__mips_nan2008)) | |
1148 | # define FROB_NANS | |
1149 | #endif | |
1150 | ||
b1a20bee MW |
1151 | /* --- @ENCFLT@ --- * |
1152 | * | |
1153 | * Arguments: @ty@ = the C type to encode | |
1154 | * @TY@ = the uppercase prefix for macros | |
1155 | * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a | |
1156 | * power of two | |
1157 | * @unsigned &rc@ = error code to set | |
1158 | * @ty *z_out@ = storage for the result | |
1159 | * @const struct floatbits *x@ = value to convert | |
1160 | * @unsigned r@ = rounding mode | |
1161 | * | |
1162 | * Returns: --- | |
1163 | * | |
1164 | * Use: Encode a floating-point value @x@ as a native C object of | |
1165 | * type @ty@. This is the machinery shared by the | |
1166 | * @fltfmt_enc...@ functions for enccoding native-format values. | |
1167 | * Most of the behaviour is as described for those functions. | |
1168 | */ | |
1169 | ||
1170 | /* Utilities based on conditional compilation, because we can't use | |
1171 | * %|#ifdef|% directly in macros. | |
1172 | */ | |
1173 | ||
1174 | #ifdef NAN | |
1175 | /* The C implementation acknowledges the existence of (quiet) NaN values, | |
1176 | * but will neither let us set the payload in a useful way, nor | |
1177 | * acknowledge the existence of signalling NaNs. We have no good way to | |
1178 | * determine which NaN the @NAN@ macro produces, so report this conversion | |
1179 | * as inexact. | |
1180 | */ | |
1181 | ||
1182 | # define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0) | |
1183 | #else | |
1184 | /* This C implementation doesn't recognize NaNs. This value is totally | |
1185 | * unrepresentable, so just report the error. (Maybe it's C89 and would | |
1186 | * actually do the right thing with @0/0@. I'm not sure the requisite | |
1187 | * compile-time configuration machinery is worth the effort.) | |
1188 | */ | |
1189 | ||
1190 | # define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0) | |
1191 | #endif | |
1192 | ||
1193 | #ifdef INFINITY | |
1194 | /* The C implementation supports infinities. This is a simple win. */ | |
1195 | ||
1196 | # define SETINF(TY, rc, z) \ | |
1197 | do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0) | |
1198 | #else | |
1199 | /* The C implementation doesn't support infinities. Return the maximum | |
1200 | * value and report it as an overflow; I think this is more useful than | |
1201 | * reporting a complete representation failure. (Maybe it's C89 and would | |
1202 | * actually do the right thing with @1/0@. Again, I'm not sure the | |
1203 | * requisite compile-time configuration machinery is worth the effort.) | |
1204 | */ | |
1205 | ||
1206 | # define SETINF(TY, rc, z) \ | |
1207 | do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0) | |
1208 | #endif | |
1209 | ||
1210 | #ifdef DIGIT_BITS | |
1211 | /* The floating point formats use a power-of-two radix. This means that | |
1212 | * we can determine the correctly rounded value before we start building | |
1213 | * the native floating-point value. | |
1214 | */ | |
1215 | ||
1216 | # define ENC_ROUND_DECLS struct floatbits _y; | |
1217 | # define ENC_ROUND(TY, rc, x, r) do { \ | |
1218 | (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \ | |
1219 | (x) = &_y; \ | |
1220 | } while (0) | |
1221 | #else | |
1222 | /* The floating point formats use a non-power-of-two radix. This means | |
1223 | * that conversion is inherently inexact. | |
1224 | */ | |
1225 | ||
1226 | # define ENC_ROUND_DECLS | |
1227 | # define ENC_ROUND(TY, rc, x, r) \ | |
1228 | do (rc) |= FLTERR_INEXACT; while (0) | |
1229 | # define ENC_FIXUP(...) | |
1230 | ||
1231 | #endif | |
1232 | ||
dc6eea4e MW |
1233 | #ifdef FROB_NANS |
1234 | # define FROBNAN_ENCDECLS struct floatbits _y | |
1235 | # define FROBNAN_ENC do { \ | |
1236 | if (_x->f&FLTF_NANMASK) { \ | |
1237 | _y.f = _x->f ^ FLTF_NANMASK; _y.frac = _x->frac; _y.n = _x->n; \ | |
1238 | _x = &_y; \ | |
1239 | } \ | |
1240 | } while (0) | |
1241 | #else | |
1242 | # define FROBNAN_ENCDECLS | |
1243 | # define FROBNAN_ENC do ; while (0) | |
1244 | #endif | |
1245 | ||
b1a20bee | 1246 | #define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \ |
dc6eea4e | 1247 | const struct floatbits *_x = (x); \ |
b1a20bee | 1248 | unsigned _rc = 0; \ |
dc6eea4e | 1249 | FROBNAN_ENCDECLS; \ |
b1a20bee MW |
1250 | \ |
1251 | /* See if the native format is one that we recognize. */ \ | |
1252 | switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \ | |
1253 | \ | |
1254 | case FLTFMT_IEEE_F32: { \ | |
1255 | uint32 _t[1]; \ | |
1256 | unsigned char *_z = (unsigned char *)(z_out); \ | |
1257 | \ | |
dc6eea4e MW |
1258 | FROBNAN_ENC; \ |
1259 | (rc) = fltfmt_encieee(&fltfmt_f32, _t, _x, (r), FLTERR_ALLERRS); \ | |
b1a20bee MW |
1260 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ |
1261 | case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \ | |
1262 | case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \ | |
1263 | default: assert(!"unimplemented byte order"); break; \ | |
1264 | } \ | |
1265 | } break; \ | |
1266 | \ | |
1267 | case FLTFMT_IEEE_F64: { \ | |
1268 | uint32 _t[2]; \ | |
1269 | unsigned char *_z = (unsigned char *)(z_out); \ | |
dc6eea4e MW |
1270 | \ |
1271 | FROBNAN_ENC; \ | |
1272 | (rc) = fltfmt_encieee(&fltfmt_f64, _t, _x, (r), FLTERR_ALLERRS); \ | |
b1a20bee MW |
1273 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ |
1274 | case FLTFMT_BE: \ | |
1275 | STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \ | |
1276 | break; \ | |
1277 | case FLTFMT_LE: \ | |
1278 | STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \ | |
1279 | break; \ | |
1280 | case FLTFMT_ARME: \ | |
1281 | STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \ | |
1282 | break; \ | |
1283 | default: assert(!"unimplemented byte order"); break; \ | |
1284 | } \ | |
1285 | } break; \ | |
1286 | \ | |
1287 | case FLTFMT_IEEE_F128: { \ | |
1288 | uint32 _t[4]; \ | |
1289 | unsigned char *_z = (unsigned char *)(z_out); \ | |
1290 | \ | |
dc6eea4e MW |
1291 | FROBNAN_ENC; \ |
1292 | (rc) = fltfmt_encieee(&fltfmt_f128, _t, _x, (r), FLTERR_ALLERRS); \ | |
b1a20bee MW |
1293 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ |
1294 | case FLTFMT_BE: \ | |
1295 | STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \ | |
1296 | STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \ | |
1297 | break; \ | |
1298 | case FLTFMT_LE: \ | |
1299 | STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \ | |
1300 | STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \ | |
1301 | break; \ | |
1302 | default: assert(!"unimplemented byte order"); break; \ | |
1303 | } \ | |
1304 | } break; \ | |
1305 | \ | |
1306 | case FLTFMT_INTEL_F80: { \ | |
1307 | uint32 _t[3]; \ | |
1308 | unsigned char *_z = (unsigned char *)(z_out); \ | |
1309 | \ | |
dc6eea4e MW |
1310 | FROBNAN_ENC; \ |
1311 | (rc) = fltfmt_encieee(&fltfmt_idblext80, \ | |
1312 | _t, _x, (r), FLTERR_ALLERRS); \ | |
b1a20bee MW |
1313 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ |
1314 | case FLTFMT_BE: \ | |
1315 | STORE16_B(_z + 0, _t[0]); \ | |
1316 | STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \ | |
1317 | break; \ | |
1318 | case FLTFMT_LE: \ | |
1319 | STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \ | |
1320 | STORE16_L(_z + 8, _t[0]); \ | |
1321 | break; \ | |
1322 | default: assert(!"unimplemented byte order"); break; \ | |
1323 | } \ | |
1324 | } break; \ | |
1325 | \ | |
1326 | default: { \ | |
1327 | /* We must do this the hard way. */ \ | |
1328 | \ | |
b1a20bee MW |
1329 | ty _z; \ |
1330 | unsigned _i; \ | |
1331 | ENC_ROUND_DECLS; \ | |
1332 | \ | |
1333 | /* Case analysis... */ \ | |
1334 | if (_x->f&FLTF_NANMASK) { \ | |
1335 | /* A NaN. Use the macro above. */ \ | |
1336 | \ | |
1337 | SETNAN(_rc, _z); \ | |
1338 | if (x->f&FLTF_NEG) _z = -_z; \ | |
1339 | } else if (_x->f&FLTF_INF) { \ | |
1340 | /* Infinity. Use the macro. */ \ | |
1341 | \ | |
1342 | SETINF(TY, _rc, _z); \ | |
1343 | if (_x->f&FLTF_NEG) _z = -_z; \ | |
1344 | } else if (_x->f&FLTF_ZERO) { \ | |
1345 | /* Zero. If we're asked for a negative zero then check that we \ | |
1346 | * produced one: if not, then report an exactness failure. \ | |
1347 | */ \ | |
1348 | \ | |
1349 | _z = 0.0; \ | |
1350 | if (_x->f&FLTF_NEG) \ | |
1351 | { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \ | |
1352 | } else { \ | |
1353 | /* A finite value. */ \ | |
1354 | \ | |
1355 | /* If the radix is a power of two, we can round to the correct \ | |
1356 | * precision, which will save inexactness later. \ | |
1357 | */ \ | |
1358 | ENC_ROUND(TY, _rc, _x, (r)); \ | |
1359 | \ | |
1360 | /* Insert the 32-bit pieces of the fraction one at a time, \ | |
1361 | * starting from the least-significant end. This minimizes the \ | |
1362 | * inaccuracy from the overall approach, but it's imperfect \ | |
1363 | * unless the value has already been rounded correctly. \ | |
1364 | */ \ | |
1365 | _z = 0.0; \ | |
1366 | for (_i = _x->n, _z = 0.0; _i--; ) \ | |
1367 | _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \ | |
1368 | \ | |
1369 | /* Negate the value if we need to. */ \ | |
1370 | if (_x->f&FLTF_NEG) _z = -_z; \ | |
1371 | } \ | |
1372 | \ | |
1373 | /* All done. */ \ | |
1374 | *(z_out) = _z; \ | |
1375 | } break; \ | |
1376 | } \ | |
1377 | \ | |
1378 | /* Set the error code. */ \ | |
1379 | (rc) = _rc; \ | |
1380 | } while (0) | |
1381 | ||
1382 | /* --- @fltfmt_encTY@ --- * | |
1383 | * | |
1384 | * Arguments: @ty *z_out@ = storage for the result | |
1385 | * @const struct floatbits *x@ = value to encode | |
1386 | * @unsigned r@ = rounding mode | |
1387 | * | |
1388 | * Returns: Error flags (@FLTERR_...@). | |
1389 | * | |
1390 | * Use: Encode the floating-point value @x@ as a native C object and | |
1391 | * store the result in @z_out@. | |
1392 | * | |
1393 | * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a | |
1394 | * @double@, or (on C99 implementations) @ldbl@ to encode a | |
1395 | * @long double@. | |
1396 | * | |
1397 | * In detail, conversion is performed as follows. | |
1398 | * | |
1399 | * * If a non-finite value cannot be represented by the | |
1400 | * implementation then the @FLTERR_REPR@ error bit is set | |
1401 | * and @*z_out@ is set to zero if @x@ is a NaN, or the | |
1402 | * (absolute) largest representable value, with appropriate | |
1403 | * sign, if @x@ is an infinity. | |
1404 | * | |
1405 | * * If the implementation can represent NaNs, but cannot set | |
1406 | * NaN payloads, then the @FLTERR_INEXACT@ error bit is set, | |
1407 | * and @*z_out@ is set to an arbitrary (quiet) NaN value. | |
1408 | * | |
1409 | * * If @x@ is negative zero, but the implementation does not | |
1410 | * distinguish negative and positive zero, then the | |
1411 | * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to | |
1412 | * zero. | |
1413 | * | |
1414 | * * If the implementation's floating-point radix is not a | |
1415 | * power of two, and @x@ is a nonzero finite value, then | |
1416 | * @FLTERR_INEXACT@ error bit is set (unconditionally), and | |
1417 | * the value is rounded by the implementation using its | |
1418 | * prevailing rounding policy. If the radix is a power of | |
1419 | * two, then the @FLTERR_INEXACT@ error bit is set only if | |
1420 | * rounding is necessary, and rounding is performed using | |
1421 | * the rounding mode @r@. | |
1422 | */ | |
1423 | ||
1424 | unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r) | |
1425 | { | |
1426 | unsigned rc; | |
1427 | ||
1428 | ENCFLT(double, FLT, ldexp, rc, z_out, x, r); | |
1429 | return (rc); | |
1430 | } | |
1431 | ||
1432 | unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r) | |
1433 | { | |
1434 | unsigned rc; | |
1435 | ||
1436 | ENCFLT(double, DBL, ldexp, rc, z_out, x, r); | |
1437 | return (rc); | |
1438 | } | |
1439 | ||
1440 | #if __STDC_VERSION__ >= 199001 | |
1441 | unsigned fltfmt_encldbl(long double *z_out, | |
1442 | const struct floatbits *x, unsigned r) | |
1443 | { | |
1444 | unsigned rc; | |
1445 | ||
1446 | ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r); | |
1447 | return (rc); | |
1448 | } | |
1449 | #endif | |
1450 | ||
1451 | /* --- @DECFLT@ --- * | |
1452 | * | |
1453 | * Arguments: @ty@ = the C type to encode | |
1454 | * @TY@ = the uppercase prefix for macros | |
1455 | * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value | |
1456 | * into a binary exponent and normalized fraction | |
1457 | * @unsigned &rc@ = error code to set | |
1458 | * @struct floatbits *z_out@ = storage for the result | |
1459 | * @ty x@ = value to convert | |
1460 | * @unsigned r@ = rounding mode | |
1461 | * | |
1462 | * Returns: --- | |
1463 | * | |
1464 | * Use: Decode a C native floating-point object. This is the | |
1465 | * machinery shared by the @fltfmt_dec...@ functions for | |
1466 | * decoding native-format values. Most of the behaviour is as | |
1467 | * described for those functions. | |
1468 | */ | |
1469 | ||
1470 | /* Define some utilities for decoding native floating-point formats. | |
1471 | * | |
1472 | * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native | |
1473 | * digits. | |
1474 | * | |
1475 | * * @CONVFIX@ is a final fixup applied to the decoded value. | |
1476 | */ | |
1477 | #ifdef DIGIT_BITS | |
1478 | # define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32) | |
1479 | # define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0) | |
1480 | #else | |
1481 | # define NFRAC(TY) \ | |
1482 | (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1) | |
1483 | # define CONVFIX(ty, rc, z, x, n, f, r) do { \ | |
1484 | ty _x_ = (x); \ | |
1485 | struct floatbits *_z_ = (z); \ | |
1486 | uint32 _w_; \ | |
1487 | unsigned _i_, _n_ = (n), _f_; \ | |
1488 | \ | |
1489 | /* Round the result according to the remainder left in %$x$%. */ \ | |
1490 | _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \ | |
1491 | if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \ | |
1492 | if (_w_&1) _f_ |= FRPF_ODD; \ | |
1493 | if (_y_ >= 0.5) _f_ |= FRPF_HALF; \ | |
1494 | if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \ | |
1495 | if (((r) >> _f_)&1) { \ | |
1496 | for (;;) { \ | |
1497 | _w_ = (_w_ + 1)&MASK32; \ | |
1498 | if (_w_ || !_i_) break; \ | |
1499 | _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \ | |
1500 | } \ | |
1501 | if (!_w_) { _z_->exp++; _w_ = B31; } \ | |
1502 | _z_->frac[_i_] = w; \ | |
1503 | } \ | |
1504 | \ | |
1505 | /* The result is not exact. */ \ | |
1506 | (rc) |= FLTERR_INEXACT; \ | |
1507 | } while (0) | |
1508 | #endif | |
1509 | ||
dc6eea4e MW |
1510 | #ifdef FROB_NANS |
1511 | # define FROBNAN_DEC do { \ | |
1512 | if (_z->f&FLTF_NANMASK) _z->f ^= FLTF_NANMASK; \ | |
1513 | } while (0) | |
1514 | #else | |
1515 | # define FROBNAN_DEC do ; while (0) | |
1516 | #endif | |
1517 | ||
b1a20bee | 1518 | #define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \ |
dc6eea4e | 1519 | struct floatbits *_z = (z_out); \ |
b1a20bee MW |
1520 | unsigned _rc = 0; \ |
1521 | \ | |
1522 | switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \ | |
1523 | \ | |
1524 | case FLTFMT_IEEE_F32: { \ | |
1525 | unsigned _t[1]; \ | |
1526 | unsigned char *_x = (unsigned char *)&(x); \ | |
1527 | \ | |
1528 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ | |
1529 | case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \ | |
1530 | case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \ | |
1531 | default: assert(!"unimplemented byte order"); break; \ | |
1532 | } \ | |
dc6eea4e | 1533 | _rc |= fltfmt_decieee(&fltfmt_f32, _z, _t); FROBNAN_DEC; \ |
b1a20bee MW |
1534 | } break; \ |
1535 | \ | |
1536 | case FLTFMT_IEEE_F64: { \ | |
1537 | unsigned _t[2]; \ | |
1538 | unsigned char *_x = (unsigned char *)&(x); \ | |
1539 | \ | |
1540 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ | |
1541 | case FLTFMT_BE: \ | |
1542 | _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \ | |
1543 | break; \ | |
1544 | case FLTFMT_LE: \ | |
1545 | _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \ | |
1546 | break; \ | |
1547 | case FLTFMT_ARME: \ | |
1548 | _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \ | |
1549 | break; \ | |
1550 | default: assert(!"unimplemented byte order"); break; \ | |
1551 | } \ | |
dc6eea4e | 1552 | _rc |= fltfmt_decieee(&fltfmt_f64, _z, _t); FROBNAN_DEC; \ |
b1a20bee MW |
1553 | } break; \ |
1554 | \ | |
1555 | case FLTFMT_IEEE_F128: { \ | |
1556 | unsigned _t[4]; \ | |
1557 | unsigned char *_x = (unsigned char *)&(x); \ | |
1558 | \ | |
1559 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ | |
1560 | case FLTFMT_BE: \ | |
1561 | _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \ | |
1562 | _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \ | |
1563 | break; \ | |
1564 | case FLTFMT_LE: \ | |
1565 | _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \ | |
1566 | _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \ | |
1567 | break; \ | |
1568 | default: assert(!"unimplemented byte order"); break; \ | |
1569 | } \ | |
dc6eea4e | 1570 | _rc |= fltfmt_decieee(&fltfmt_f128, _z, _t); FROBNAN_DEC; \ |
b1a20bee MW |
1571 | } break; \ |
1572 | \ | |
1573 | case FLTFMT_INTEL_F80: { \ | |
1574 | unsigned _t[3]; \ | |
1575 | unsigned char *_x = (unsigned char *)&(x); \ | |
1576 | \ | |
1577 | switch (TY##_FORMAT&FLTFMT_ENDMASK) { \ | |
1578 | case FLTFMT_BE: \ | |
1579 | _t[0] = LOAD16_B(_x + 0); \ | |
1580 | _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \ | |
1581 | break; \ | |
1582 | case FLTFMT_LE: \ | |
1583 | _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \ | |
1584 | _t[0] = LOAD16_L(_x + 8); \ | |
1585 | break; \ | |
1586 | default: assert(!"unimplemented byte order"); break; \ | |
1587 | } \ | |
dc6eea4e | 1588 | _rc |= fltfmt_decieee(&fltfmt_idblext80, _z, _t); FROBNAN_DEC; \ |
b1a20bee MW |
1589 | } break; \ |
1590 | \ | |
1591 | default: { \ | |
b1a20bee MW |
1592 | ty _x = (x), _y; \ |
1593 | unsigned _i, _n, _f = 0; \ | |
1594 | uint32 _t; \ | |
1595 | \ | |
1596 | /* If the value looks negative then negate it and set the sign \ | |
1597 | * flag. \ | |
1598 | */ \ | |
1599 | if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \ | |
1600 | \ | |
1601 | /* Now for the case analysis. Infinities and zero are \ | |
1602 | * unproblematic. NaNs can't be decoded exactly using the \ | |
1603 | * portable machinery. \ | |
1604 | */ \ | |
1605 | if (INFP(_x)) _f |= FLTF_INF; \ | |
1606 | else if (_x == 0.0) _f |= FLTF_ZERO; \ | |
1607 | else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \ | |
1608 | else { \ | |
1609 | /* A finite value. Determine the number of fraction limbs \ | |
1610 | * we'll need based on the precision and radix and pull out \ | |
1611 | * 32-bit chunks one at a time. This will be unproblematic \ | |
1612 | * for power-of-two radices, requiring at most shifting the \ | |
1613 | * significand left by a few bits, but inherently inexact (for \ | |
1614 | * the most part) for others. \ | |
1615 | */ \ | |
1616 | \ | |
1617 | _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \ | |
1618 | _y = frexp(_x, &_z->exp); \ | |
1619 | for (_i = 0; _i < _n; _i++) \ | |
1620 | { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \ | |
1621 | CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \ | |
1622 | } \ | |
1623 | \ | |
1624 | /* Done. */ \ | |
1625 | _z->f = _f; \ | |
1626 | } break; \ | |
1627 | } \ | |
1628 | \ | |
1629 | /* Set the error code. */ \ | |
1630 | (rc) = _rc; \ | |
1631 | } while (0) | |
1632 | ||
1633 | /* --- @fltfmt_decTY@ --- * | |
1634 | * | |
1635 | * Arguments: @struct floatbits *z_out@ = storage for the result | |
1636 | * @ty x@ = value to decode | |
1637 | * @unsigned r@ = rounding mode | |
1638 | * | |
1639 | * Returns: Error flags (@FLTERR_...@). | |
1640 | * | |
1641 | * Use: Decode the native C floatingpoint value @x@ and store the | |
1642 | * result in @z_out@. | |
1643 | * | |
1644 | * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a | |
1645 | * @double@, or (on C99 implementations) @ldbl@ to encode a | |
1646 | * @long double@. | |
1647 | * | |
1648 | * In detail, conversion is performed as follows. | |
1649 | * | |
1650 | * * If the implementation supports negative zeros and/or | |
1651 | * infinity, then these are recognized and decoded. | |
1652 | * | |
1653 | * * If the input as a NaN, but the implementation cannot | |
1654 | * usefully report NaN payloads, then the @FLTERR_INEXACT@ | |
1655 | * error bit is set and the decoded payload is left empty. | |
1656 | * | |
1657 | * * If the implementation's floating-point radix is not a | |
1658 | * power of two, and @x@ is a nonzero finite value, then | |
1659 | * @FLTERR_INEXACT@ error bit is set (unconditionally), and | |
1660 | * the rounded value (according to the rounding mode @r@) is | |
1661 | * stored in as many fraction words as necessary to identify | |
1662 | * the original value uniquely. If the radix is a power of | |
1663 | * two, then the value is represented exactly. | |
1664 | */ | |
1665 | ||
1666 | unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r) | |
1667 | { | |
1668 | unsigned rc; | |
1669 | ||
1670 | DECFLT(double, FLT, frexp, rc, z_out, x, r); | |
1671 | return (rc); | |
1672 | } | |
1673 | ||
1674 | unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r) | |
1675 | { | |
1676 | unsigned rc; | |
1677 | ||
1678 | DECFLT(double, DBL, frexp, rc, z_out, x, r); | |
1679 | return (rc); | |
1680 | } | |
1681 | ||
1682 | #if __STDC_VERSION__ >= 199001 | |
1683 | unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r) | |
1684 | { | |
1685 | unsigned rc; | |
1686 | ||
1687 | DECFLT(long double, LDBL, frexpl, rc, z_out, x, r); | |
1688 | return (rc); | |
1689 | } | |
1690 | #endif | |
1691 | ||
1692 | /*----- That's all, folks -------------------------------------------------*/ |