3 .\" Manual for floating-point format conversions
5 .\" (c) 2024 Straylight/Edgeware
8 .\"----- Licensing notice ---------------------------------------------------
10 .\" This file is part of the mLib utilities library.
12 .\" mLib is free software: you can redistribute it and/or modify it under
13 .\" the terms of the GNU Library General Public License as published by
14 .\" the Free Software Foundation; either version 2 of the License, or (at
15 .\" your option) any later version.
17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 .\" License for more details.
22 .\" You should have received a copy of the GNU Library General Public
23 .\" License along with mLib. If not, write to the Free Software
24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 .\"--------------------------------------------------------------------------
28 .so ../defs.man \" @@@PRE@@@
30 .\"--------------------------------------------------------------------------
31 .TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
92 .\" @fltfmt_encidblext80
100 .\" @fltfmt_decidblext80
109 .\" @fltfmt_flttof32l
110 .\" @fltfmt_flttof32b
111 .\" @fltfmt_dbltof64l
112 .\" @fltfmt_dbltof64b
113 .\" @fltfmt_f32ltoflt
114 .\" @fltfmt_f32btoflt
115 .\" @fltfmt_f64ltodbl
116 .\" @fltfmt_f64btodbl
118 .\"--------------------------------------------------------------------------
120 fltfmt \- floating-point format conversions
122 .\"--------------------------------------------------------------------------
126 .B "#define FLTERR_OK 0"
127 .B "#define FLTERR_INVAL ..."
128 .B "#define FLTERR_INEXACT ..."
129 .B "#define FLTERR_UFLOW ..."
130 .B "#define FLTERR_OFLOW ..."
131 .B "#define FLTERR_REPR ..."
132 .B "#define FLTERR_ALLERRS ..."
135 .B "#define FRPF_LOW 1u"
136 .B "#define FRPF_HALF 2u"
137 .B "#define FRPF_ODD 4u"
138 .B "#define FRPF_NEG 8u"
139 .B "#define FRPMASK_LOW 0xaaaau"
140 .B "#define FRPMASK_HALF 0xccccu"
141 .B "#define FRPMASK_ODD 0xf0f0u"
142 .B "#define FRPMASK_NEG 0xff00u"
143 .B "#define FRPMASK_INEXACT ... /* LOW | HALF */"
144 .BI "unsigned FRPMASK_NEAR(unsigned " dir "); /* HALF&(LOW | " dir ") */"
145 .B "#define FLTRND_ZERO ... /* 0 */"
146 .B "#define FLTRND_PROJINF ... /* INEXACT */"
147 .B "#define FLTRND_NEGINF ... /* INEXACT&NEG */"
148 .B "#define FLTRND_POSINF ... /* INEXACT&~NEG */"
149 .B "#define FLTRND_EVEN ... /* INEXACT&ODD */"
150 .B "#define FLTRND_ODD ... /* INEXACT&~ODD */"
151 .B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */"
152 .B "#define FLTRND_NEARODD ... /* HALF&(LOW | ~ODD) */"
153 .B "#define FLTRND_NEARZERO ... /* HALF&LOW */"
154 .B "#define FLTRND_NEARINF ... /* HALF */"
155 .B "#define FLTRND_NEARNEG ... /* HALF&(LOW | NEG) */"
156 .B "#define FLTRND_NEARPOS ... /* HALF&(LOW | ~NEG) */"
159 .B "#define FLTF_NEG ..."
160 .B "#define FLTF_INF ..."
161 .B "#define FLTF_QNAN ..."
162 .B "#define FLTF_SNAN ..."
163 .B "#define FLTF_ZERO ..."
164 .B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
165 .B "struct floatbits {"
170 .B " unsigned n, fracsz;"
172 .B "#define FLOATBITS_INIT { ...\& };"
174 .BI "void fltfmt_initbits(struct floatbits *" x );
175 .BI "void fltfmt_freebits(struct floatbits *" x );
176 .BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
177 .ta \w'\fBvoid fltfmt_copybits('u
178 .BI "void fltfmt_copybits(struct floatbits *" z_out ,
179 .BI " const struct floatbits *" x );
180 .ta \w'\fBunsigned fltfmt_round('u
181 .BI "unsigned fltfmt_round(struct floatbits *" z_out ,
182 .BI " const struct floatbits *" x ,
183 .BI " unsigned " r ", unsigned " n );
187 .B "#define FLTIF_HIDDEN ..."
188 .B "struct fltfmt_ieeefmt {"
190 .B " unsigned expwd;"
193 .B "const struct fltfmt_ieeefmt fltfmt_f16;"
194 .B "const struct fltfmt_ieeefmt fltfmt_f32;"
195 .B "const struct fltfmt_ieeefmt fltfmt_f64;"
196 .B "const struct fltfmt_ieeefmt fltfmt_f128;"
197 .B "const struct fltfmt_ieeefmt fltfmt_mini;"
198 .B "const struct fltfmt_ieeefmt fltfmt_bf16;"
199 .B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
201 .ta \w'\fBunsigned fltfmt_encieee('u
202 .BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
203 .BI " uint32 *" z ", const struct floatbits *" x ,
204 .BI " unsigned " r ", unsigned " errmask );
205 .ta \w'\fBunsigned fltfmt_encf16('u
206 .BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
207 .BI " unsigned " r ", unsigned " errmask );
208 .ta \w'\fBunsigned fltfmt_encf32('u
209 .BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
210 .BI " unsigned " r ", unsigned " errmask );
211 .ta \w'\fBunsigned fltfmt_encf64('u
212 .BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
213 .BI " unsigned " r ", unsigned " errmask );
214 .ta \w'\fBunsigned fltfmt_encf128('u
215 .BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
216 .BI " unsigned " r ", unsigned " errmask );
217 .ta \w'\fBunsigned fltfmt_encmini('u
218 .BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
219 .BI " unsigned " r ", unsigned " errmask );
220 .ta \w'\fBunsigned fltfmt_encbf16('u
221 .BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
222 .BI " unsigned " r ", unsigned " errmask );
223 .ta \w'\fBunsigned fltfmt_encidblext80('u
224 .BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
225 .BI " const struct floatbits *" x ,
226 .BI " unsigned " r ", unsigned " errmask );
228 .ta \w'\fBunsigned fltfmt_decieee('u
229 .BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
230 .BI " struct floatbits *" z_out ", const uint32 *" x );
231 .BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
232 .BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
233 .BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
234 .BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
235 .BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
236 .BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
237 .ta \w'\fBunsigned fltfmt_decidblext80('u
238 .BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
239 .BI " uint16 " se ", kludge64 " m );
241 .ta \w'\fBunsigned fltfmt_encflt('u
242 .BI "unsigned fltfmt_encflt(float *" z_out ,
243 .BI " const struct floatbits *" x ", unsigned " r );
244 .ta \w'\fBunsigned fltfmt_encdbl('u
245 .BI "unsigned fltfmt_encdbl(double *" z_out ,
246 .BI " const struct floatbits *" x ", unsigned " r );
247 .ta \w'\fBunsigned fltfmt_encldbl('u
248 .BI "unsigned fltfmt_encldbl(long double *" z_out ,
249 .BI " const struct floatbits *" x ", unsigned " r );
250 .ta \w'\fBunsigned fltfmt_decflt('u
251 .BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
252 .BI " float " x ", unsigned " r );
253 .ta \w'\fBunsigned fltfmt_decdbl('u
254 .BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
255 .BI " double " x ", unsigned " r );
256 .ta \w'\fBunsigned fltfmt_decldbl('u
257 .BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
258 .BI " long double " x ", unsigned " r );
260 .BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
261 .BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
262 .BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
263 .BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
264 .BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
265 .BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
266 .BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
267 .BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
269 .\"--------------------------------------------------------------------------
274 header file defines structures, macros, and functions
275 for converting floating-point values between various formats,
276 including the native floating-point formats
277 and IEEE\ 754 and related formats.
280 Most of the functions in this module return an unsigned integer.
281 A return value of zero means that no error occurred;
282 set bits indicate various error conditions.
285 A binary input to be decoded contained an invalid bit pattern,
286 e.g., an unnormalized input value with a nonminimal exponent.
287 The function will have produced a reasonable output anyway,
288 but the original value will not be recoverable from the result.
291 The conversion was inexact.
292 Converting the output back into the format of the input
293 may not reproduce the original input value.
294 This error flag is sometimes set conservatively.
297 The conversion underflowed:
298 a nonzero input was too tiny (in absolute value) to represent,
299 and a zero result was returned.
302 The conversion overflowed:
303 a finite input was too huge (in absolute value) to represent,
304 and either the appropriately signed infinity
305 or largest-magnitude finite value
306 was returned, determined by the requested rounding mode.
309 The output format failed entirely to represent the input value.
310 The result is zero if the input was a NaN,
311 or the appropriately signed largest-magnitude finite value
312 if the input was an infinity.
315 The rounding system works as follows.
317 .I rounding predicates
318 considered when a rounding decision is taken.
319 These are determined from the unrounded input value
321 and the two nearest rounded values
322 .RI | u "|\ \*(<=\ |" x |
324 .RI | v "|\ >\ |" x |.
325 The predicates are as follows.
331 .IR x "\ \*(/=\ (" u "\ +\ " v )/2,
334 is neither equal to a rounded value,
335 nor exactly halfway between two rounded values.
336 This predicate is sometimes referred to as a `sticky bit'.
340 .RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
343 is halfway or more towards its larger rounded neighbour.
346 If least significant digit of
349 In binary floating-point formats,
350 this is just the least significant bit of
358 These four predicates are packed into a four-bit mask value
363 is simply a 16-bit mask:
366 of the rounding-mode mask is set,
371 otherwise it is rounded to
373 That is, the rounding-mode mask is essentially a truth table.
376 bits corresponding to situations where both
383 is already a rounded value,
386 Some useful machinery is provided
387 for constructing rounding-mode masks.
393 are masks with set bits corresponding to their respective predicates.
394 Bitwise boolean logic can be applied to these masks
395 in order to calculate the masks corresponding to
396 the same logical expression applied to the individual predicates.
397 .BR FRPMASK_INEXACT " holds if"
403 .IR x "\ \*(/=\ " u ;
404 as mentioned above, only these bits may be set
405 in a valid rounding-mode mask.
406 .BI FRPMASK_NEAR( dir )
407 is the mask for rounding to nearest with ties broken according to
409 which is another rounding-mode mask.
410 The complete set of predefined masks is listed above in the synopsis,
411 together with their description in terms of the basic predicates.
412 The usual IEEE rounding mode is
413 round-to-nearest/ties-to-even,
415 .BR FLTRND_NEAREVEN .
416 This is likely a good option
417 if there is no compelling argument for a different specific choice.
419 .SS Direct conversions
426 argument to an IEEE\ 754 Binary32 value
427 in little- or big-endian byte order, respectively;
434 argument to an IEEE\ 754 Binary64 value
435 in little- or big-endian byte order, respectively.
436 The value to convert is given as
438 and the result is written at the address
445 convert an IEEE\ 754 Binary32 value,
446 in little- or big-endian byte order, respectively,
453 convert an IEEE\ 754 Binary64 value,
454 in little- or big-endian byte order, respectively,
457 The value to convert is read from address
459 and the result is written to
462 All of these functions additionally take a rounding mode
464 which is applied if the conversion cannot be performed exactly,
465 and return an error code as described above.
467 On many modern platforms, the
471 types are represented internally using the IEEE
472 Binary32 and Binary64 formats,
473 so these conversions are trivial, or nearly so.
474 A complication arises on PA-RISC and older MIPS processors:
475 see the descriptions of
479 below for the details.
482 the conversion is decidedly nontrivial,
483 and makes use of the machinery described below;
484 this may also be useful for more complex conversions.
486 .SS The floatbits structure
487 In order to avoid a combinatorial explosion in conversion operations,
488 all the basic conversions involve,
490 a `common currency' format represented by the type
491 .BR "struct floatbits" .
493 This structure consists of
512 the currently allocated size
519 count elements, not bytes.
531 is the most significant word.
532 The value represented by a
534 is never changed by adding or removing zero-valued words
538 It is always the case that
539 .BR n "\ \*(<=\ " fracsz ;
544 may be a null pointer.
546 The interpretation of the
550 members depends on the flags set in
555 .IR "mutually exclusive" :
556 at most one flag may be set.
559 The value is negative.
562 The value is positive or negative infinity.
569 .BR FLTF_QNAN " and " FLTF_SNAN
570 The value is a quiet or signalling not-a-number, respectively.
574 The payload is stored in
576 the payload does not include the `quiet' bit.
580 Negative zero is distinct from positive zero.
586 .IP "All non-sign bits clear"
587 The value is a finite nonzero number.
590 holds the significand.
591 The most significant significand bit must be set, so
592 (a)\ the number must be nonzero, and
593 (b)\ the significand is normalized.
594 The significand is interpreted as a fraction
595 .RI "1/2\ \*(<=\ " m "\ <\ 1."
609 then the number represented is
610 .IR s "\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
614 can be initialized statically by
616 or dynamically using the function
617 .BR fltfmt_initbits .
618 These are not quite the same:
626 sets it to the runtime value of
629 both forms of initialization set the value to (positive) zero;
630 neither allocates any storage or other resources,
634 In this state, it is safe to modify the arena pointer
636 if the default initialization is unsatisfactory.
640 function is given a pointer
646 it ensures that there is enough storage at
651 if the current size is too small,
652 then any existing buffer is discarded and a new one allocated
655 any existing contents of the buffer are lost.
666 structure, releasing the storage held by
671 function simply copies its input
675 both must refer to initialized
682 are equal, then nothing happens.
686 function rounds the value in the
692 bits using the rounding mode
694 the result is written to
702 is a zero or infinity,
703 then the output is equal to the input,
706 had been called instead.
710 then the payload is simply truncated to
712 bits, without regard to the rounding mode.
713 Otherwise, the input is nonzero and finite;
714 the significand is rounded to
716 bits according to the rounding mode.
717 In all cases, the return value is
718 zero if the output is equal to the input,
721 if the rounded result is not equal to the input.
723 .SS IEEE and related formats
724 An IEEE floating-point format is characterized by three parameters:
736 The encoded value consists of
737 .IR p "\ +\ " w "\ \-\ " h "\ +\ 1"
739 This is divided, from the most significant bit downwards,
752 .RI ( p "\ \-\ " h )-bit
758 .IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
761 is calculated from the biased exponent by
762 .IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
763 The unit and fraction field are usually interpreted as denoting
765 .IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
767 .RI "0\ \*(<=\ " m "\ <\ 2."
771 the value of the unit bit
773 is implied by the exponent as described below.
774 The encoded value is interpreted as follows.
777 .IR e "\ =\ \-" e \*(us0\*(ue
778 then the value is zero or a subnormal,
780 .RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e +1\*(se.
784 then the value is positive or negative zero,
785 according to the sign bit
796 then the encoding is invalid:
799 but the result will be as described.
802 .RI "1\ \-\ " e "\*(us0\*(ue \*(<=\ " e "\ < " e "\*(us0\*(ue\ +\ 1"
803 then the value is a (supposedly) normal number
804 .RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
814 then the encoding is invalid:
817 but the result will be as described.
820 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
823 then the value is positive or negative infinity,
824 according to the sign bit
830 then the encoding is invalid:
833 but the result will still be infinity.
836 .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
839 then the value is not-a-number (NaN).
840 The most significant bit of
843 if the bit is set, the value is a `quiet NaN';
844 if the bit is clear, the value is a `signalling NaN'.
845 (This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1;
846 it has the advantage that a signalling NaN can be `quieted'
847 by setting the most significant fraction bit;
848 HP-PA and older MIPS processors use the opposite convention
849 for distinguishing quiet and signalling NaNs,
850 but a signalling NaN with all but the most significant
851 fraction bit zero cannot be `quieted' by clearing the
852 most significant bit, since the resulting encoding denotes
853 an infinity, not a QNaN.)
854 The remaining bits of
858 Positive and negative NaN values are distinguished,
859 with sign determined by the sign bit.
864 then the encoding is invalid:
867 but the result will still be a NaN;
868 the unit bit does not affect the NaN payload.
870 An IEEE format is described by the type
871 .BR "struct fltfmt_ieeefmt" .
872 This has three members:
878 is set, then the format uses a `hidden bit' convention:
879 in the notation above
881 if the flag is clear,
882 the format has an explicit unit bit, and
887 in the notation above this is
892 in the notation above this is
899 functions convert between IEEE and related formats
903 They respectively encode or decode an IEEE-format value,
908 most-significant word first
909 \(en so the sign bit is in the first word.
910 For formats whose size is not a multiple of 32,
913 the least significant bit of the fraction
914 is in the least significant bit of the last word in the vector.
918 function encodes an IEEE-format value.
919 The function is given five arguments:
922 to the IEEE format description,
925 to a sufficiently long vector of 32-bit words
926 in which to store the encoded value,
931 holding the value to encode,
936 If the input is a NaN,
937 then the payload is truncated to fit
938 regardless of the rounding mode,
939 discarding low-significant bits;
940 if the input is a finite value,
941 then the significand is rounded to fit
942 according to the requested rounding mode.
943 If a signalling NaN ends up with all of its payload bits zero,
944 as a result of truncation or otherwise,
945 then the least-significant bit of the output payload is forced on
946 in order to distinguish the result from an infinity.
947 The possible errors are
949 if the value is unrepresentably tiny,
951 if the value is unrepresentably huge,
954 if the encoding fails to preserve the input value exactly;
962 or if bits are lost due to NaN-payload truncation or rounding.
964 an error is encountered,
965 processing stops immediately
966 unless the corresponding bit of
972 function decodes an IEEE-format value.
973 The function is given three arguments:
976 to the IEEE format description,
984 to the IEEE-encoded value to decode,
985 in a vector of 32-bit words as described above.
986 The only error that can occur during decoding is
989 this occurs in non-hidden-bit formats
990 when the unit bit does not match that implied by the exponent;
991 the result is returned anyway,
992 with the unit bit interpreted as encoded in finite numbers,
993 and discarded in infinities and NaNs.
995 A number of IEEE and IEEE-like formats are predefined:
999 an IEEE format description, named
1000 .BI fltfmt_ fmt \fR,
1001 together with encoding and decoding functions, named
1004 .BI fltfmt_dec fmt \fR;
1006 these functions use more convenient types
1007 to hold encoded values.
1010 The IEEE\ 754 Binary16 format, with
1019 The IEEE\ 754 Binary32 (`single precision') format, with
1028 The IEEE\ 754 Binary64 (`double precision') format, with
1040 The IEEE\ 754 Binary128 (`quad precision') format, with
1045 stored in a big-endian vector of
1047 just as for the generic functions described above.
1050 An eight-bit `minifloat' format, with
1059 The Google `BFloat16' format, with
1067 .B "fltfmt_idblext80"
1068 The Intel 8087 80-bit `double extended' format, with
1075 holding the sign and exponent,
1078 holding the significand.
1081 There are also functions for converting between
1083 and the implementation's native floating-point types
1095 For each native type abbreviation
1100 .BI fltfmt_dec ty \fR,
1101 which respectively convert the value held in
1103 to or from a value of the corresponding C type.
1104 (The functions acting on
1106 values are only available if the platform supports C99 or later.)
1110 functions read an input value from a
1114 and store the encoded result through a pointer
1116 to the appropriate C type;
1117 the function also receives a rounding mode
1122 functions are given an input value of the appropriate C type,
1123 and store the decoded result in a
1125 structure pointed to by
1127 again, the function also receives a rounding mode
1131 These functions can use two different strategies for conversion.
1132 If the compile-time configuration step detects
1133 that the implementation is using
1134 a specific, supported format for a native type,
1135 then conversions involving the native type
1136 are performed using the existing machinery for that format.
1138 as is in fact nearly universal on modern-ish systems,
1141 type uses the IEEE\ 754 Binary64 format,
1150 described above for the conversion.
1151 This approach has the benefit that
1152 everything is done under the control of the
1155 which can faithfully preserve signs of zero values,
1157 The error conditions are, for the most part, the same as for the
1161 functions described above.
1162 The encoding functions have an additional source of inexactness
1163 on PA-RISC and older MIPS processors
1164 which use the reversed quiet/signalling NaN convention:
1165 a quiet NaN with an all-zero payload
1166 is not representable on such implementations
1167 (the encoding is an infinity instead);
1169 the least significant payload bit is forced on,
1170 just as if the payload required truncation,
1175 .\"--------------------------------------------------------------------------
1181 .\"--------------------------------------------------------------------------
1184 Mark Wooding, <mdw@distorted.org.uk>
1186 .\"----- That's all, folks --------------------------------------------------