. ds us \s8\d
. ds ue \u\s0
. ds *d \(*d
-. ds <= \<>=
+. ds /= \(!=
+. ds <= \(<=
. ds >= \(>=
+. ds ' \(fm
. ds , \h'\w'\ 'u/2u'
. if \n(.g \{\
. fam P
. ds se
. ds us _
. ds ue
+. ds ' \(aq
. ds *d \,\fIdelta\/\fP
+. ds /= /=
. ds <= <=
. ds >= >=
. ds , \ \"
.\" @dbuf_getbyte
.\" @dbuf_putbyte
.
-.\" @buf_getf64
-.\" @buf_getf64l
-.\" @buf_getf64b
-.\" @buf_putf64
-.\" @buf_putf64l
-.\" @buf_putf64b
-.\" @dbuf_getf64
-.\" @dbuf_getf64l
-.\" @dbuf_getf64b
-.\" @dbuf_putf64
-.\" @dbuf_putf64l
-.\" @dbuf_putf64b
-.
.\" @buf_putstrf
.\" @buf_vputstrf
.\" @dbuf_putstrf
.\" @dbuf_putk64b
.\" @dbuf_putk64l
.
+.\" @buf_getf32
+.\" @buf_getf32l
+.\" @buf_getf32b
+.\" @buf_putf32
+.\" @buf_putf32l
+.\" @buf_putf32b
+.\" @buf_getf64
+.\" @buf_getf64l
+.\" @buf_getf64b
+.\" @buf_putf64
+.\" @buf_putf64l
+.\" @buf_putf64b
+.\" @dbuf_getf32
+.\" @dbuf_getf32l
+.\" @dbuf_getf32b
+.\" @dbuf_putf32
+.\" @dbuf_putf32l
+.\" @dbuf_putf32b
+.\" @dbuf_getf64
+.\" @dbuf_getf64l
+.\" @dbuf_getf64b
+.\" @dbuf_putf64
+.\" @dbuf_putf64l
+.\" @dbuf_putf64b
+.
.\" @buf_getbuf8
.\" @buf_getbuf16
.\" @buf_getbuf16b
.BI "int buf_putk" suff "(buf *" b ", kludge64 " w );
.BI "int buf_getk" suff "(buf *" b ", kludge64 *" w );
.PP
+.BI "int buf_getf32(buf *" b ", float *" x );
+.BI "int buf_getf32l(buf *" b ", float *" x );
+.BI "int buf_getf32b(buf *" b ", float *" x );
+.BI "int buf_getf64(buf *" b ", double *" x );
+.BI "int buf_getf64l(buf *" b ", double *" x );
+.BI "int buf_getf64b(buf *" b ", double *" x );
+.BI "int buf_putf32(buf *" b ", float " x );
+.BI "int buf_putf32l(buf *" b ", float " x );
+.BI "int buf_putf32b(buf *" b ", float " x );
+.BI "int buf_putf64(buf *" b ", double " x );
+.BI "int buf_putf64l(buf *" b ", double " x );
+.BI "int buf_putf64b(buf *" b ", double " x );
+.PP
.ta 2n
.BI "BUF_ENCLOSETAG(" tag ", buf *" b ", size_t " mk ", " check ", " poke ", size_t " lensz )
.I " body"
.BR z :
.nf
.BI "int buf_putstr" suff "(buf *" b ", const char *" p );
-.BI "int dbuf_putstr" suff "(dbuf *" db ", const char *" p );
.BI "int buf_putstr" suff "(buf *" b ", const char *" p ", ...);"
-.BI "int dbuf_putstr" suff "(dbuf *" db ", const char *" p ", ...);"
.BI "int buf_vputstr" suff "(buf *" b ", const char *" p ", va_list *" ap );
-.BI "int dbuf_vputstr" suff "(dbuf *" db ", const char *" p ", va_list *" ap );
.BI "int buf_putdstr" suff "(buf *" b ", dstr *" d );
-.BI "int dbuf_putdstr" suff "(dbuf *" db ", dstr *" d );
.BI "int buf_getdstr" suff "(buf *" b ", dstr *" d );
-.BI "int dbuf_getdstr" suff "(dbuf *" db ", dstr *" d );
.BI "int buf_putbuf" suff "(buf *" b ", buf *" bb );
-.BI "int dbuf_putbuf" suff "(dbuf *" db ", buf *" bb );
.BI "int buf_getbuf" suff "(buf *" b ", buf *" bb );
-.BI "int dbuf_getbuf" suff "(dbuf *" db ", buf *" bb );
.BI "int buf_putmem" suff "(buf *" b ", const void *" p ", size_t " sz );
-.BI "int dbuf_putmem" suff "(dbuf *" db ", const void *" p ", size_t " sz );
.BI "void *buf_getmem" suff "(buf *" b ", size_t *" sz );
-.BI "void d*buf_getmem" suff "(dbuf *" db ", size_t *" sz );
.PP
.fi
For
.BR 64b :
.nf
.BI "int buf_putf" suff "(buf *" b ", double " x );
-.BI "int dbuf_putf" suff "(dbuf *" db ", double " x );
.BI "int buf_getf" suff "(buf *" b ", double *" x );
-.BI "int dbuf_getf" suff "(dbuf *" db ", double *" x );
.fi
.
.\"--------------------------------------------------------------------------
.BR bits (3).
.PP
The functions
+.BR buf_getf32 ,
+.BR buf_getf32l ,
+.BR buf_getf32b ,
.BR buf_getf64 ,
.BR buf_getf64l ,
and
-.BR buf_getf64b
-read 64-bit floating-point values
-in IEEE\ 754 Binary64 format
+.B buf_getf64b
+read floating-point values
+in IEEE\ 754 Binary32 and Binary64 format
from the buffer;
-as usual, the suffix indicates the byte ordering convention.
+as usual, the suffix indicates the format and byte ordering convention.
On success, they store the result in
.BI *x
and return zero;
on failure, they break the buffer and return \-1.
The functions
+.BR buf_putf32 ,
+.BR buf_putf32l ,
+.BR buf_putf32b ,
.BR buf_putf64 ,
.BR buf_putf64l ,
and
.BR buf_putf64b
write floating-point numbers
-in IEEE\ 754 Binary64 format
+in IEEE\ 754 Binary32 and Binary64 format
to the buffer.
On success, they return zero; on failure, they return \-1.
Note that these functions use IEEE\ 754 format
-even if this is not the platform-native floating-point representation.
+even if this is not the platform-native floating-point representation:
+they use the
+.BR fltfmt (3)
+functions to do their work.
+Specifically,
+they use the
+.B FLTRND_NEAREVEN
+rounding convention,
+and they ignore
+.BR FLTERR_INEXACT ,
+.BR FLTERR_UFLOW ,
+and
+.B FLTERR_OFLOW
+errors,
+and fail on
+.B FLTERR_INVAL
+and
+.B FLTERR_REPR
+errors.
+If more subtle control over error handling is necessary,
+use the
+.BR fltfmt (3)
+functions directly.
.PP
The function
.B buf_putstrf
.BR bits (3),
.BR control (3),
.BR dstr (3),
+.BR fltfmt (3),
.BR gprintf (3),
.BR mLib (3).
.
#define dbuf_putk64l(db, w) (buf_putk64l(DBUF_BUF(db), (w)))
#define dbuf_putk64b(db, w) (buf_putk64b(DBUF_BUF(db), (w)))
+/* --- @buf_getf{32,64}{,l,b} --- *
+ *
+ * Arguments: @buf *b@ = a buffer to read from
+ * @float *x_out@, @double *x_out@ = where to put the result
+ *
+ * Returns: Zero on success, %$-1$% on failure (and the buffer is
+ * broken).
+ *
+ * Use: Get an IEEE Binary32 or Binary64 value from the buffer.
+ * Conversion is performed using the `fltfmt' machinery, with
+ * the usual round-to-nearest/ties-to-even rounding mode.
+ */
+
+extern int buf_getf32(buf */*b*/, float */*x_out*/);
+extern int buf_getf32l(buf */*b*/, float */*x_out*/);
+extern int buf_getf32b(buf */*b*/, float */*x_out*/);
+#define dbuf_getf32(db, x_out) (buf_getf32(DBUF_BUF(db), (x_out)))
+#define dbuf_getf32l(db, x_out) (buf_getf32l(DBUF_BUF(db), (x_out)))
+#define dbuf_getf32b(db, x_out) (buf_getf32b(DBUF_BUF(db), (x_out)))
+
+extern int buf_getf64(buf */*b*/, double */*x_out*/);
+extern int buf_getf64l(buf */*b*/, double */*x_out*/);
+extern int buf_getf64b(buf */*b*/, double */*x_out*/);
+#define dbuf_getf64(db, x_out) (buf_getf64(DBUF_BUF(db), (x_out)))
+#define dbuf_getf64l(db, x_out) (buf_getf64l(DBUF_BUF(db), (x_out)))
+#define dbuf_getf64b(db, x_out) (buf_getf64b(DBUF_BUF(db), (x_out)))
+
+/* --- @buf_putf{32,64}{,l,b} --- *
+ *
+ * Arguments: @buf *b@ = a buffer to write to
+ * @float x@, @double x@ = a number to write
+ *
+ * Returns: Zero on success, %$-1$% on failure (and the buffer is
+ * broken).
+ *
+ * Use: Put an IEEE Binary32 or Binary64 value to the buffer.
+ * Conversion is performed using the `fltfmt' machinery, with
+ * the usual round-to-nearest/ties-to-even rounding mode.
+ */
+
+extern int buf_putf32(buf */*b*/, float /*x*/);
+extern int buf_putf32l(buf */*b*/, float /*x*/);
+extern int buf_putf32b(buf */*b*/, float /*x*/);
+#define dbuf_putf32(db, x) (buf_putf32(DBUF_BUF(db), (x)))
+#define dbuf_putf32l(db, x) (buf_putf32l(DBUF_BUF(db), (x)))
+#define dbuf_putf32b(db, x) (buf_putf32b(DBUF_BUF(db), (x)))
+
+extern int buf_putf64(buf */*b*/, double /*x*/);
+extern int buf_putf64l(buf */*b*/, double /*x*/);
+extern int buf_putf64b(buf */*b*/, double /*x*/);
+#define dbuf_putf64(db, x) (buf_putf64(DBUF_BUF(db), (x)))
+#define dbuf_putf64l(db, x) (buf_putf64l(DBUF_BUF(db), (x)))
+#define dbuf_putf64b(db, x) (buf_putf64b(DBUF_BUF(db), (x)))
+
/* --- @{,d}buf_getmem{8,{16,24,32,64}{,l,b},z} --- *
*
* Arguments: @buf *b@ or @dbuf *db@ = pointer to a buffer block
#define dbuf_putstr64b(db, p) (buf_putstr64b(DBUF_BUF(db), (p)))
#define dbuf_putstrz(db, p) (buf_putstrz(DBUF_BUF(db), (p)))
-/* --- @buf_getf{32,64}{,l,b} --- *
- *
- * Arguments: @buf *b@ = a buffer to read from
- * @float *x_out@, @double *x_out@ = where to put the result
- *
- * Returns: Zero on success, %$-1$% on failure (and the buffer is
- * broken).
- *
- * Use: Get an IEEE Binary32 or Binary64 value from the buffer.
- * Conversion is performed using the `fltfmt' machinery, with
- * the usual round-to-nearest/ties-to-even rounding mode.
- */
-
-extern int buf_getf32(buf */*b*/, float */*x_out*/);
-extern int buf_getf32l(buf */*b*/, float */*x_out*/);
-extern int buf_getf32b(buf */*b*/, float */*x_out*/);
-#define dbuf_getf32(db, x_out) (buf_getf32(DBUF_BUF(db), (x_out)))
-#define dbuf_getf32l(db, x_out) (buf_getf32l(DBUF_BUF(db), (x_out)))
-#define dbuf_getf32b(db, x_out) (buf_getf32b(DBUF_BUF(db), (x_out)))
-
-extern int buf_getf64(buf */*b*/, double */*x_out*/);
-extern int buf_getf64l(buf */*b*/, double */*x_out*/);
-extern int buf_getf64b(buf */*b*/, double */*x_out*/);
-#define dbuf_getf64(db, x_out) (buf_getf64(DBUF_BUF(db), (x_out)))
-#define dbuf_getf64l(db, x_out) (buf_getf64l(DBUF_BUF(db), (x_out)))
-#define dbuf_getf64b(db, x_out) (buf_getf64b(DBUF_BUF(db), (x_out)))
-
-/* --- @buf_putf{32,64}{,l,b} --- *
- *
- * Arguments: @buf *b@ = a buffer to write to
- * @double x@ = a number to write
- *
- * Returns: Zero on success, %$-1$% on failure (and the buffer is
- * broken).
- *
- * Use: Get an IEEE Binary32 or Binary64 value from the buffer.
- * Conversion is performed using the `fltfmt' machinery, with
- * the usual round-to-nearest/ties-to-even rounding mode.
- */
-
-extern int buf_putf32(buf */*b*/, float /*x*/);
-extern int buf_putf32l(buf */*b*/, float /*x*/);
-extern int buf_putf32b(buf */*b*/, float /*x*/);
-#define dbuf_putf32(db, x) (buf_putf32(DBUF_BUF(db), (x)))
-#define dbuf_putf32l(db, x) (buf_putf32l(DBUF_BUF(db), (x)))
-#define dbuf_putf32b(db, x) (buf_putf32b(DBUF_BUF(db), (x)))
-
-extern int buf_putf64(buf */*b*/, double /*x*/);
-extern int buf_putf64l(buf */*b*/, double /*x*/);
-extern int buf_putf64b(buf */*b*/, double /*x*/);
-#define dbuf_putf64(db, x) (buf_putf64(DBUF_BUF(db), (x)))
-#define dbuf_putf64l(db, x) (buf_putf64l(DBUF_BUF(db), (x)))
-#define dbuf_putf64b(db, x) (buf_putf64b(DBUF_BUF(db), (x)))
-
/* --- @{,D}BUF_ENCLOSETAG@ --- *
*
* Arguments: @tag@ = a control-structure macro tag
MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \e
MC_AFTER(tag##__benchmark_after, { \e
if (_bmark_rc) \e
- printf(": FAILED\en"); \e
+ puts(": FAILED"); \e
else { \e
fputs(": ", stdout); \e
bench_report(&file_printops, stdout, (unit), &_bmark_tm);\ \e
- putchar('\n'); \e
+ putchar('\en'); \e
} \e
}) \e
BENCH_MEASURE_TAG(tag##__bmarkmark_measure, \e
MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \
MC_AFTER(tag##__benchmark_after, { \
if (_bmark_rc) \
- printf(": FAILED\n"); \
+ puts(": FAILED"); \
else { \
fputs(": ", stdout); \
bench_report(&file_printops, stdout, (unit), &_bmark_t); \
--- /dev/null
+.\" -*-nroff-*-
+.\"
+.\" Manual for floating-point format conversions
+.\"
+.\" (c) 2024 Straylight/Edgeware
+.\"
+.
+.\"----- Licensing notice ---------------------------------------------------
+.\"
+.\" This file is part of the mLib utilities library.
+.\"
+.\" mLib is free software: you can redistribute it and/or modify it under
+.\" the terms of the GNU Library General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or (at
+.\" your option) any later version.
+.\"
+.\" mLib is distributed in the hope that it will be useful, but WITHOUT
+.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+.\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+.\" License for more details.
+.\"
+.\" You should have received a copy of the GNU Library General Public
+.\" License along with mLib. If not, write to the Free Software
+.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+.\" USA.
+.
+.\"--------------------------------------------------------------------------
+.so ../defs.man \" @@@PRE@@@
+.
+.\"--------------------------------------------------------------------------
+.TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
+.\" @FLTERR_OK
+.\" @FLTERR_INVAL
+.\" @FLTERR_INEXACT
+.\" @FLTERR_UFLOW
+.\" @FLTERR_OFLOW
+.\" @FLTERR_REPR
+.\" @FLTERR_ALLERRS
+.
+.\" @FRPF_LOW
+.\" @FRPF_HALF
+.\" @FRPF_ODD
+.\" @FRPF_NEG
+.\" @FRPMASK_LOW
+.\" @FRPMASK_HALF
+.\" @FRPMASK_ODD
+.\" @FRPMASK_NEG
+.\" @FRPMASK_INEXACT
+.\" @FRPMASK_NEAR
+.\" @FLTRND_ZERO
+.\" @FLTRND_PROJINF
+.\" @FLTRND_NEGINF
+.\" @FLTRND_POSINF
+.\" @FLTRND_EVEN
+.\" @FLTRND_ODD
+.\" @FLTRND_NEAREVEN
+.\" @FLTRND_NEARODD
+.\" @FLTRND_NEARZERO
+.\" @FLTRND_NEARINF
+.\" @FLTRND_NEARNEG
+.\" @FLTRND_NEARPOS
+.
+.\" @FLTF_NEG
+.\" @FLTF_INF
+.\" @FLTF_QNAN
+.\" @FLTF_SNAN
+.\" @FLTF_ZERO
+.\" @FLTF_NANMASK
+.\" @FLOATBITS_INIT
+.\" @fltfmt_initbits
+.\" @fltfmt_freebits
+.\" @fltfmt_allocfrac
+.\" @fltfmt_copybits
+.\" @fltfmt_round
+.
+.\" @FLTIF_HIDDEN
+.\" @fltfmt_f16
+.\" @fltfmt_f32
+.\" @fltfmt_f64
+.\" @fltfmt_f128
+.\" @fltfmt_mini
+.\" @fltfmt_bf16
+.\" @fltfmt_idblext80
+.
+.\" @fltfmt_encieee
+.\" @fltfmt_encf16
+.\" @fltfmt_encf32
+.\" @fltfmt_encf64
+.\" @fltfmt_encf128
+.\" @fltfmt_encmini
+.\" @fltfmt_encbf16
+.\" @fltfmt_encidblext80
+.\" @fltfmt_decieee
+.\" @fltfmt_decf16
+.\" @fltfmt_decf32
+.\" @fltfmt_decf64
+.\" @fltfmt_decf128
+.\" @fltfmt_decmini
+.\" @fltfmt_decbf16
+.\" @fltfmt_decidblext80
+.
+.\" @fltfmt_encflt
+.\" @fltfmt_encdbl
+.\" @fltfmt_encldbl
+.\" @fltfmt_decflt
+.\" @fltfmt_decdbl
+.\" @fltfmt_decldbl
+.
+.\" @fltfmt_flttof32l
+.\" @fltfmt_flttof32b
+.\" @fltfmt_dbltof64l
+.\" @fltfmt_dbltof64b
+.\" @fltfmt_f32ltoflt
+.\" @fltfmt_f32btoflt
+.\" @fltfmt_f64ltodbl
+.\" @fltfmt_f64btodbl
+.
+.\"--------------------------------------------------------------------------
+.SH NAME
+fltfmt \- floating-point format conversions
+.
+.\"--------------------------------------------------------------------------
+.SH SYNOPSIS
+.
+.nf
+.B "#define FLTERR_OK 0"
+.B "#define FLTERR_INVAL ..."
+.B "#define FLTERR_INEXACT ..."
+.B "#define FLTERR_UFLOW ..."
+.B "#define FLTERR_OFLOW ..."
+.B "#define FLTERR_REPR ..."
+.B "#define FLTERR_ALLERRS ..."
+.PP
+.ta 40n
+.B "#define FRPF_LOW 1u"
+.B "#define FRPF_HALF 2u"
+.B "#define FRPF_ODD 4u"
+.B "#define FRPF_NEG 8u"
+.B "#define FRPMASK_LOW 0xaaaau"
+.B "#define FRPMASK_HALF 0xccccu"
+.B "#define FRPMASK_ODD 0xf0f0u"
+.B "#define FRPMASK_NEG 0xff00u"
+.B "#define FRPMASK_INEXACT ... /* LOW | HALF */"
+.BI "unsigned FRPMASK_NEAR(unsigned " dir "); /* HALF&(LOW | " dir ") */"
+.B "#define FLTRND_ZERO ... /* 0 */"
+.B "#define FLTRND_PROJINF ... /* INEXACT */"
+.B "#define FLTRND_NEGINF ... /* INEXACT&NEG */"
+.B "#define FLTRND_POSINF ... /* INEXACT&~NEG */"
+.B "#define FLTRND_EVEN ... /* INEXACT&ODD */"
+.B "#define FLTRND_ODD ... /* INEXACT&~ODD */"
+.B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */"
+.B "#define FLTRND_NEARODD ... /* HALF&(LOW | ~ODD) */"
+.B "#define FLTRND_NEARZERO ... /* HALF&LOW */"
+.B "#define FLTRND_NEARINF ... /* HALF */"
+.B "#define FLTRND_NEARNEG ... /* HALF&(LOW | NEG) */"
+.B "#define FLTRND_NEARPOS ... /* HALF&(LOW | ~NEG) */"
+.PP
+.ta 2n
+.B "#define FLTF_NEG ..."
+.B "#define FLTF_INF ..."
+.B "#define FLTF_QNAN ..."
+.B "#define FLTF_SNAN ..."
+.B "#define FLTF_ZERO ..."
+.B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
+.B "struct floatbits {"
+.B " unsigned f;"
+.B " int exp;"
+.B " arena *a;"
+.B " uint32 *frac;"
+.B " unsigned n, fracsz;"
+.B "};"
+.B "#define FLOATBITS_INIT { ...\& };"
+.PP
+.BI "void fltfmt_initbits(struct floatbits *" x );
+.BI "void fltfmt_freebits(struct floatbits *" x );
+.BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
+.ta \w'\fBvoid fltfmt_copybits('u
+.BI "void fltfmt_copybits(struct floatbits *" z_out ,
+.BI " const struct floatbits *" x );
+.ta \w'\fBunsigned fltfmt_round('u
+.BI "unsigned fltfmt_round(struct floatbits *" z_out ,
+.BI " const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " n );
+.PP
+.
+.ta 2n
+.B "#define FLTIF_HIDDEN ..."
+.B "struct fltfmt_ieeefmt {"
+.B " unsigned f;"
+.B " unsigned expwd;"
+.B " unsigned prec;"
+.B "};"
+.B "const struct fltfmt_ieeefmt fltfmt_f16;"
+.B "const struct fltfmt_ieeefmt fltfmt_f32;"
+.B "const struct fltfmt_ieeefmt fltfmt_f64;"
+.B "const struct fltfmt_ieeefmt fltfmt_f128;"
+.B "const struct fltfmt_ieeefmt fltfmt_mini;"
+.B "const struct fltfmt_ieeefmt fltfmt_bf16;"
+.B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
+.PP
+.ta \w'\fBunsigned fltfmt_encieee('u
+.BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
+.BI " uint32 *" z ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf16('u
+.BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf32('u
+.BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf64('u
+.BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf128('u
+.BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encmini('u
+.BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encbf16('u
+.BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encidblext80('u
+.BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
+.BI " const struct floatbits *" x ,
+.BI " unsigned " r ", unsigned " errmask );
+.PP
+.ta \w'\fBunsigned fltfmt_decieee('u
+.BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
+.BI " struct floatbits *" z_out ", const uint32 *" x );
+.BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
+.BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
+.BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
+.BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
+.BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
+.BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
+.ta \w'\fBunsigned fltfmt_decidblext80('u
+.BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
+.BI " uint16 " se ", kludge64 " m );
+.PP
+.ta \w'\fBunsigned fltfmt_encflt('u
+.BI "unsigned fltfmt_encflt(float *" z_out ,
+.BI " const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_encdbl('u
+.BI "unsigned fltfmt_encdbl(double *" z_out ,
+.BI " const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_encldbl('u
+.BI "unsigned fltfmt_encldbl(long double *" z_out ,
+.BI " const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decflt('u
+.BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
+.BI " float *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decdbl('u
+.BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
+.BI " double *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decldbl('u
+.BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
+.BI " long double *" x ", unsigned " r );
+.PP
+.BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
+.BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
+.BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
+.BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
+.BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
+.
+.\"--------------------------------------------------------------------------
+.SH DESCRIPTION
+.
+The
+.B "<mLib/fltfmt.h>"
+header file defines structures, macros, and functions
+for converting floating-point values between various formats,
+including the native floating-point formats
+and IEEE\ 754 and related formats.
+.
+.SS Error conditions
+Most of the functions in this module return an unsigned integer.
+A return value of zero means that no error occurred;
+set bits indicate various error conditions.
+.TP
+.B FLTERR_INVAL
+A binary input to be decoded contained an invalid bit pattern,
+e.g., an unnormalized input value with a nonminimal exponent.
+The function will have produced a reasonable output anyway,
+but the original value will not be recoverable from the result.
+.TP
+.B FLTERR_INEXACT
+The conversion was inexact.
+Converting the output back into the format of the input
+may not reproduce the original input value.
+This error flag is sometimes set conservatively.
+.TP
+.B FLTERR_UFLOW
+The conversion underflowed:
+a nonzero input was too tiny (in absolute value) to represent,
+and a zero result was returned.
+.TP
+.B FLTERR_OFLOW
+The conversion overflowed:
+a finite input was too huge (in absolute value) to represent,
+and either the appropriately signed infinity
+or largest-magnitude finite value
+was returned, determined by the requested rounding mode.
+.TP
+.B FLTERR_REPR
+The output format failed entirely to represent the input value.
+The result is zero if the input was a NaN,
+or the appropriately signed largest-magnitude finite value
+if the input was an infinity.
+.
+.SS Rounding modes
+The rounding system works as follows.
+There are four
+.I rounding predicates
+considered when a rounding decision is taken.
+These are determined from the unrounded input value
+.IR x ,
+and the two nearest rounded values
+.RI | u "|\ \*(<=\ |" x |
+and
+.RI | v "|\ >\ |" x |.
+The predicates are as follows.
+.TP
+.B FRPF_LOW
+If
+.IR x "\ \*(/=\ " u
+and
+.IR x "\ \*(/=\ (" u "\ +\ " v )/2,
+i.e.,
+.I x
+is neither equal to a rounded value,
+nor exactly halfway between two rounded values.
+This predicate is sometimes referred to as a `sticky bit'.
+.TP
+.B FRPF_HALF
+If
+.RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
+i.e.,
+.I x
+is halfway or more towards its larger rounded neighbour.
+.TP
+.B FRPF_ODD
+If the least significant digit of
+.I u
+is odd.
+In binary floating-point formats,
+this is just the least significant bit of
+.IR u .
+.TP
+.B FRPF_NEG
+If
+.I x
+is negative.
+.PP
+These four predicates are packed into a four-bit mask value
+.I rf
+between 0 and 15.
+A
+.I rounding mode
+is simply a 16-bit mask:
+if bit
+.I rf
+of the rounding-mode mask is set,
+then
+.I x
+is rounded to
+.IR v ;
+otherwise it is rounded to
+.IR u .
+That is, the rounding-mode mask is essentially a truth table.
+Rounding modes with
+.I set
+bits corresponding to situations where both
+.B FRPF_LOW
+and
+.B FRPF_HALF
+are false,
+i.e., where
+.I x
+is already a rounded value,
+are forbidden.
+.PP
+Some useful machinery is provided
+for constructing rounding-mode masks.
+.BR FRPMASK_LOW ,
+.BR FRPMASK_HALF ,
+.BR FRPMASK_ODD ,
+and
+.B FRPMASK_NEG
+are masks whose set bits correspond to their respective predicates.
+Bitwise boolean logic can be applied to these masks
+in order to calculate the masks corresponding to
+the same logical expression applied to the individual predicates.
+.B FRPMASK_INEXACT
+holds if
+.B LOW
+or
+.B HALF
+holds;
+i.e., if
+.IR x "\ \*(/=\ " u ;
+as mentioned above, only these bits may be set
+in a valid rounding-mode mask.
+.BI FRPMASK_NEAR( dir )
+is the mask for rounding to nearest with ties broken according to
+.IR dir ,
+which is another rounding-mode mask.
+The complete set of predefined masks is listed above in the synopsis,
+together with their description in terms of the basic predicates.
+The usual IEEE rounding mode is
+round-to-nearest/ties-to-even,
+denoted
+.BR FLTRND_NEAREVEN .
+This is likely a good option
+if there is no compelling argument for a different specific choice.
+.
+.SS The floatbits structure
+In order to avoid a combinatorial explosion in conversion operations,
+all the basic conversions involve,
+as source or target,
+a `common currency' format represented by the type
+.BR "struct floatbits" .
+.PP
+This structure consists of
+a set of flags
+.BR f ;
+a signed exponent
+.BR exp ;
+an
+.B arena
+pointer
+.BR a ;
+a pointer
+.B frac
+to a vector of
+.B uint32
+values;
+the length
+.B n
+of the
+.B frac
+vector; and
+the currently allocated size
+.B fracsz
+of the vector.
+Both
+.B n
+and
+.B fracsz
+count elements, not bytes.
+.PP
+Storage for
+.B frac
+comes from the arena
+.BR a .
+Only the first
+.B n
+words of
+.B frac
+are significant;
+.B frac[0]
+is the most significant word.
+The value represented by a
+.B struct floatbits
+is never changed by adding or removing zero-valued words
+at the end of the
+.B frac
+vector.
+It is always the case that
+.BR n "\ \*(<=\ " fracsz ;
+if
+.B fracsz
+is zero then
+.B frac
+may be a null pointer.
+.PP
+The interpretation of the
+.B exp
+and
+.B frac
+members depends on the flags set in
+.BR f .
+Apart from
+.BR FLTF_NEG ,
+the flags are
+.IR "mutually exclusive" :
+at most one flag may be set.
+.TP
+.B FLTF_NEG
+The value is negative.
+.TP
+.B FLTF_INF
+The value is positive or negative infinity.
+The
+.B exp
+and
+.B frac
+are ignored.
+.TP
+.BR FLTF_QNAN " and " FLTF_SNAN
+The value is a quiet or signalling not-a-number, respectively.
+The
+.B exp
+is ignored.
+The payload is stored in
+.BR frac ;
+the payload does not include the `quiet' bit.
+.TP
+.B FLTF_ZERO
+The number is zero.
+Negative zero is distinct from positive zero.
+The
+.B exp
+and
+.B frac
+are ignored.
+.IP "All non-sign bits clear"
+The value is a finite nonzero number.
+The
+.B frac
+holds the significand.
+The most significant significand bit must be set, so
+(a)\ the number must be nonzero, and
+(b)\ the significand is normalized.
+The significand is interpreted as a fraction
+.RI "1/2\ \*(<=\ " m "\ <\ 1."
+If
+.I e
+is the value of the
+.B exp
+member,
+and
+.I s
+is \-1 if
+.B FLTF_NEG
+is set
+or +1 if
+.B FLTF_NEG
+is clear,
+then the number represented is
+.IR s "\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
+.PP
+A
+.B struct floatbits
+can be initialized statically by
+.BR FLOATBITS_INIT ,
+or dynamically using the function
+.BR fltfmt_initbits .
+These are not quite the same:
+.B FLOATBITS_INIT
+initializes
+.B a
+to
+.BR &arena_stdlib ,
+while
+.B fltfmt_initbits
+sets it to the runtime value of
+.BR arena_global .
+With this exception,
+both forms of initialization set the value to (positive) zero;
+neither allocates any storage or other resources,
+leaving
+.B frac
+null.
+.PP
+The
+.B fltfmt_allocfrac
+function is given a pointer
+.I x
+to a
+.B struct floatbits
+and a length
+.IR n :
+it ensures that there is enough storage at
+.IB x ->frac
+for at least
+.I n
+words:
+if the current size is too small,
+then any existing buffer is discarded and a new one allocated
+from the arena
+.IB x ->a \fR;
+any existing contents of the buffer are lost.
+On exit,
+.IB x ->n
+is set to
+.IR n .
+.PP
+The
+.B fltfmt_freebits
+function
+frees a
+.B struct floatbits
+structure, releasing the storage held by
+.BR frac .
+.PP
+The
+.B fltfmt_copybits
+function simply copies its input
+.I x
+to its output
+.IR z_out ;
+both must refer to initialized
+.B struct floatbits
+structures.
+If
+.I z_out
+and
+.I x
+are equal, then nothing happens.
+.PP
+Finally, the
+.B fltfmt_round
+function rounds the value in the
+.B struct floatbits
+structure
+.I x
+to
+.I n
+bits using the rounding mode
+.IR r ;
+the result is written to
+.IR z_out ;
+it is permitted for
+.I z_out
+to be equal to
+.IR x .
+If
+.I x
+is a zero or infinity,
+then the output is equal to the input,
+as if
+.B fltfmt_copybits
+had been called instead.
+If
+.I x
+is a NaN,
+then the payload is simply truncated to
+.I n
+bits, without regard to the rounding mode.
+Otherwise, the input is nonzero and finite;
+the significand is rounded to
+.I n
+bits according to the rounding mode.
+In all cases, the return value is
+zero if the output is equal to the input,
+or
+.B FLTERR_INEXACT
+if the rounded result is not equal to the input.
+.
+.SS IEEE and related formats
+An IEEE floating-point format is characterized by three parameters:
+the
+.I "exponent width"
+.IR w ,
+the
+.I "precision"
+.IR p ,
+and
+the
+.I "unit width"
+.IR h .
+.PP
+The encoded value consists of
+.IR p "\ +\ " w "\ +\ " h "\ \-\ 1"
+bits.
+This is divided, from the most significant bit downwards,
+into a
+.I "sign bit"
+.IR s ,
+a
+.IR w -bit
+.I "biased exponent"
+.IR e \*',
+a
+.IR h -bit
+.I "unit bit"
+.IR u ,
+and a
+.RI ( p "\ \-\ " h )-bit
+.I fraction
+.IR f .
+The
+.I "exponent bias"
+is
+.IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
+the true exponent
+.I e
+is calculated from the biased exponent by
+.IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
+The unit and fraction field are usually interpreted as denoting
+a significand
+.IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
+with
+.RI "0\ \*(<=\ " m "\ <\ 2."
+If
+.I h
+is zero,
+the value of the unit bit
+.I u
+is implied by the exponent as described below.
+The encoded value is interpreted as follows.
+.hP \*o
+If
+.IR e "\ =\ \-" e \*(us0\*(ue
+then the value is zero or a subnormal,
+with the value
+.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e +1\*(se.
+In particular,
+if
+.IR m "\ =\ 0"
+then the value is positive or negative zero,
+according to the sign bit
+.IR s .
+If
+.I h
+is zero then
+.IR u "\ =\ 0;"
+if
+.I h
+is nonzero
+but
+.IR u "\ \*(/=\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will be as described.
+.hP \*o
+If
+.RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1"
+then the value is a (supposedly) normal number
+.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
+If
+.I h
+is zero then
+.IR u "\ =\ 1;"
+if
+.I h
+is nonzero
+but
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will be as described.
+.hP \*o
+If
+.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
+and
+.IR f "\ =\ 0"
+then the value is positive or negative infinity,
+according to the sign bit
+.IR s .
+If
+.I h
+is nonzero and
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will still be infinity.
+.hP \*o
+If
+.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
+and
+.IR f "\ \*(/=\ 0"
+then the value is not-a-number (NaN).
+The most significant bit of
+.I f
+is the `quiet bit':
+if the bit is set, the value is a `quiet NaN';
+if the bit is clear, the value is a `signalling NaN'.
+(This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1,
+it has the advantage that a signalling NaN can be `quieted'
+by setting the most significant fraction bit;
+HP-PA and older MIPS processors use the opposite convention
+for distinguishing quiet and signalling NaNs,
+but a signalling NaN with all but the most significant
+fraction bit zero cannot be `quieted' by clearing the
+most significant bit, since the resulting encoding denotes
+an infinity, not a QNaN.)
+The remaining bits of
+.I f
+form the
+.IR payload .
+Positive and negative NaN values are distinguished,
+with sign determined by the sign bit.
+If
+.I h
+is nonzero and
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will still be a NaN;
+the unit bit does not affect the NaN payload.
+.PP
+An IEEE format is described by the type
+.BR "struct fltfmt_ieeefmt" .
+This has three members:
+.TP
+.B f
+A flags word.
+If
+.B FLTIF_HIDDEN
+is set, then the format uses a `hidden bit' convention:
+in the notation above
+.IR h "\ =\ 0;"
+if the flag is clear,
+the format has an explicit unit bit, and
+.IR h "\ =\ 1."
+.TP
+.B expwd
+The exponent width;
+in the notation above this is
+.IR w .
+.TP
+.B prec
+The precision;
+in the notation above this is
+.IR p .
+.PP
+The following IEEE format descriptions are already defined.
+.TP
+.B "fltfmt_f16"
+The IEEE\ 754 Binary16 format, with
+.IR w "\ =\ 5,"
+.IR p "\ =\ 11,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f32"
+The IEEE\ 754 Binary32 (`single precision') format, with
+.IR w "\ =\ 8,"
+.IR p "\ =\ 24,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f64"
+The IEEE\ 754 Binary64 (`double precision') format, with
+.IR w "\ =\ 11,"
+.IR p "\ =\ 53,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f128"
+The IEEE\ 754 Binary128 (`quad precision') format, with
+.IR w "\ =\ 15,"
+.IR p "\ =\ 113,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_mini"
+An eight-bit `minifloat' format, with
+.IR w "\ =\ 4,"
+.IR p "\ =\ 4,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_bf16"
+The Google `BFloat16' format, with
+.IR w "\ =\ 8,"
+.IR p "\ =\ 8,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_idblext80"
+The Intel 8087 80-bit `double extended' format, with
+.IR w "\ =\ 15,"
+.IR p "\ =\ 64,"
+and
+.IR h "\ =\ 1."
+.PP
+The
+.B fltfmt_encieee
+and
+.B fltfmt_decieee
+functions convert between IEEE and related formats
+and the internal
+.B struct floatbits
+representation.
+They respectively encode or decode an IEEE-format value,
+as described above,
+from a vector of
+.B uint32
+words,
+most-significant word first
+\(en so the sign bit is in the first word.
+For formats whose size is not a multiple of 32,
+the encoding is
+.IR right-aligned :
+the least significant bit of the fraction
+is in the least significant bit of the last word in the vector.
+.PP
+The
+.B fltfmt_encieee
+function encodes an IEEE-format value.
+The function is given five arguments:
+a pointer
+.I fmt
+to the IEEE format description,
+a pointer
+.I z
+to a sufficiently long vector of 32-bit words
+in which to store the encoded value,
+a pointer
+.I x
+to the
+.B struct floatbits
+holding the value to encode,
+a rounding mode
+.IR r ,
+and an error mask
+.IR errmask .
+If the input is a NaN,
+then the payload is truncated to fit
+regardless of the rounding mode,
+discarding the least significant bits;
+if the input is a finite value,
+then the significand is rounded to fit
+according to the requested rounding mode.
+The possible errors are
+.B FLTERR_UFLOW
+if the value is unrepresentably tiny,
+.B FLTERR_OFLOW
+if the value is unrepresentably huge,
+and
+.B FLTERR_INEXACT
+if the encoding fails to preserve the input value exactly;
+hence
+.B FLTERR_INEXACT
+is set whenever
+.B FLTERR_OFLOW
+or
+.B FLTERR_UFLOW
+is set,
+or if bits are lost due to NaN-payload truncation or rounding.
+If, during encoding,
+an error is encountered,
+processing stops immediately
+unless the corresponding bit of
+.I errmask
+is set.
+.PP
+The
+.B fltfmt_decieee
+function decodes an IEEE-format value.
+The function is given three arguments:
+a pointer
+.I fmt
+to the IEEE format description,
+a pointer
+.I z_out
+to the initialized
+.B struct floatbits
+to fill in, and
+a pointer
+.I x
+to the IEEE-encoded value to decode,
+in a vector of 32-bit words as described above.
+The only error that can occur during decoding is
+.BR FLTERR_INVAL :
+as described above,
+this occurs in non-hidden-bit formats
+when the unit bit does not match that implied by the exponent;
+the result is returned anyway,
+with the unit bit interpreted as encoded in finite numbers,
+and discarded in infinities and NaNs.
+.PP
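+For each of the predefined formats
+there is a pair of convenience functions,
+e.g.,
+.B fltfmt_encf32
+and
+.BR fltfmt_decf32 ,
+which encode and decode values in that specific format;
+their arguments are listed in the synopsis above.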
+.
+.\"--------------------------------------------------------------------------
+.SH "SEE ALSO"
+.
+.BR bits (3),
+.BR mLib (3).
+.
+.\"--------------------------------------------------------------------------
+.SH AUTHOR
+.
+Mark Wooding, <mdw@distorted.org.uk>
+.
+.\"----- That's all, folks --------------------------------------------------
/* IEEE (and related) format descriptions. */
const struct fltfmt_ieeefmt
- fltfmt_mini = { IEEEF_HIDDEN, 4, 4 },
- fltfmt_bf16 = { IEEEF_HIDDEN, 8, 8 },
- fltfmt_f16 = { IEEEF_HIDDEN, 5, 11 },
- fltfmt_f32 = { IEEEF_HIDDEN, 8, 24 },
- fltfmt_f64 = { IEEEF_HIDDEN, 11, 53 },
- fltfmt_f128 = { IEEEF_HIDDEN, 15, 113 },
+ fltfmt_mini = { FLTIF_HIDDEN, 4, 4 },
+ fltfmt_bf16 = { FLTIF_HIDDEN, 8, 8 },
+ fltfmt_f16 = { FLTIF_HIDDEN, 5, 11 },
+ fltfmt_f32 = { FLTIF_HIDDEN, 8, 24 },
+ fltfmt_f64 = { FLTIF_HIDDEN, 11, 53 },
+ fltfmt_f128 = { FLTIF_HIDDEN, 15, 113 },
fltfmt_idblext80 = { 0, 15, 64 };
/* --- @fltfmt_encieee@ ---
/* Determine the output size. */
nb = fmt->prec + fmt->expwd + 1;
- if (fmt->f&IEEEF_HIDDEN) nb--;
+ if (fmt->f&FLTIF_HIDDEN) nb--;
nw = (nb + 31)/32;
/* Determine the top bits. */
*/
z0 |= M32(fmt->expwd) << esh;
- if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
+ if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
} else if (f&FLTF_NANMASK) {
/* Not-a-number.
n = x->n; if (n > mw) n = nw;
t = shr(z + i, x->frac, n, sh); i += n;
if (i < nw) z[i++] = t;
- sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
+ sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
if (f&FLTF_QNAN) z0 |= B32(sh);
else if (!fracwd) { ERR(FLTERR_INEXACT); z[nw - 1] |= 1; }
/* Set the exponent and, for non-hidden-bit formats, the unit bit. */
z0 |= M32(fmt->expwd) << esh;
- if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
+ if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
} else {
/* A finite value.
* the rounding mode.
*/
+ ERR(FLTERR_OFLOW | FLTERR_INEXACT);
rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
if (f&FLTF_NEG) rf |= FRPF_NEG;
- if ((r >> rf)&1) {
- ERR(FLTERR_OFLOW | FLTERR_INEXACT);
+ if ((r >> rf)&1)
z0 |= M32(fmt->expwd) << esh;
- } else {
- ERR(FLTERR_INEXACT);
+ else {
z0 |= (B32(fmt->expwd) - 2) << esh;
- mb = fmt->prec; if (fmt->f&IEEEF_HIDDEN) mb--;
+ mb = fmt->prec; if (fmt->f&FLTIF_HIDDEN) mb--;
mw = (mb + 31)/32;
i = nw - mw;
z[i++] = M32(mb%32);
z0 |= (exp + maxexp) << esh;
/* Clear the unit bit if we're suppose to use a hidden-bit convention. */
- if (fmt->f&IEEEF_HIDDEN) {
+ if (fmt->f&FLTIF_HIDDEN) {
mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
z[nw - mw] &= ~B32(mb);
}
*/
assert(fmt->expwd + 3 <= 32);
esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
- sigwd = fmt->prec; if (fmt->f&IEEEF_HIDDEN) sigwd--;
+ sigwd = fmt->prec; if (fmt->f&FLTIF_HIDDEN) sigwd--;
/* Determine the input size. */
nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
* Note that we don't include the quiet bit in our decoded payload.
*/
- if (!(fmt->f&IEEEF_HIDDEN)) {
+ if (!(fmt->f&FLTIF_HIDDEN)) {
/* No hidden bit, so we expect the unit bit to be set. If it isn't,
* that's technically invalid, and its absence won't survive a round
* trip, since the bit isn't considered part of a NaN payload -- or
if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
f |= FLTF_INF;
else {
- sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
+ sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
if (x0&B32(sh)) f |= FLTF_QNAN;
else f |= FLTF_SNAN;
sigwd--; mw = (sigwd + 31)/32;
* Otherwise, we'll normalize the incoming value regardless, but report
* settings of the unit bit which are inconsistent with the exponent.
*/
- if (fmt->f&IEEEF_HIDDEN) {
+ if (fmt->f&FLTIF_HIDDEN) {
if (!t) { exp = minexp; goto normalize; }
else { exp = t - maxexp; goto hidden; }
} else {
/*----- Data structures ---------------------------------------------------*/
+/* Error codes. */
+#define FLTERR_OK 0x0000u /* no trouble */
+#define FLTERR_INVAL 0x0001u /* technically invalid encoding */
+#define FLTERR_INEXACT 0x0002u /* result is inexact */
+#define FLTERR_UFLOW 0x0004u /* underflowed to zero */
+#define FLTERR_OFLOW 0x0008u /* overflowed to ±∞ or max finite */
+#define FLTERR_REPR 0x0010 /* not representable */
+#define FLTERR_ALLERRS 0xffff /* all errors */
+
+/* Predicates considered for rounding. */
+#define FRPF_LOW 0x0001u /* lost bits not exactly zero or half */
+#define FRPF_HALF 0x0002u /* lost a half or more */
+#define FRPF_ODD 0x0004u /* final place is currently odd */
+#define FRPF_NEG 0x0008u /* number is negative */
+
+/* Rounding policies. These are represented as a 16-bit truth table applied
+ * to the predicate bits listed above. The following are the mask values
+ * corresponding to the predicate bits being set; a set bit means that the
+ * number should be rounded away from zero.
+ */
+#define FRPMASK_LOW 0xaaaau /* lost bits below half */
+#define FRPMASK_HALF 0xccccu /* lost a half or more */
+#define FRPMASK_ODD 0xf0f0u /* final place is odd */
+#define FRPMASK_NEG 0xff00u /* number is negative */
+
+/* Useful constructed masks from the above. */
+#define FRPMASK_INEXACT (FRPMASK_LOW | FRPMASK_HALF) /* lost nonzero bits */
+#define FRPMASK_NEAR(dir) (FRPMASK_HALF&(FRPMASK_LOW | (dir))) /* nearest, ties per dir */
+
+/* Generally useful rounding criteria. */
+#define FLTRND_ZERO 0 /* towards zero (truncate) */
+#define FLTRND_PROJINF FRPMASK_INEXACT /* towards (projective) ±∞ */
+#define FLTRND_NEGINF (FRPMASK_INEXACT&FRPMASK_NEG) /* down, towards -∞ */
+#define FLTRND_POSINF (FRPMASK_INEXACT&~FRPMASK_NEG) /* up, towards +∞ */
+#define FLTRND_EVEN (FRPMASK_INEXACT&FRPMASK_ODD) /* to even */
+#define FLTRND_ODD (FRPMASK_INEXACT&~FRPMASK_ODD) /* to odd */
+#define FLTRND_NEAREVEN FRPMASK_NEAR(FLTRND_EVEN) /* nearest, ties to even */
+#define FLTRND_NEARODD FRPMASK_NEAR(FLTRND_ODD) /* nearest, ties to odd */
+#define FLTRND_NEARZERO FRPMASK_NEAR(FLTRND_ZERO) /* nearest, ties to zero */
+#define FLTRND_NEARINF FRPMASK_NEAR(FLTRND_PROJINF) /* nearest, ties to ±∞ */
+#define FLTRND_NEARNEG FRPMASK_NEAR(FLTRND_NEGINF) /* nearest, ties to -∞ */
+#define FLTRND_NEARPOS FRPMASK_NEAR(FLTRND_POSINF) /* nearest, ties to +∞ */
+
struct floatbits {
/* A decoded floating-point number.
*
};
#define FLOATBITS_INIT { FLTF_ZERO, 0, &arena_stdlib, 0, 0, 0 }
-/* Error codes. */
-#define FLTERR_OK 0x0000u /* no trouble */
-#define FLTERR_INVAL 0x0001u /* technically invalid encoding */
-#define FLTERR_INEXACT 0x0002u /* result is inexect */
-#define FLTERR_UFLOW 0x0004u /* underflowed to zero */
-#define FLTERR_OFLOW 0x0008u /* overflowed to ±∞ or max finite */
-#define FLTERR_REPR 0x0010 /* not representable */
-#define FLTERR_ALLERRS 0xffff /* all errors */
-
-/* Predicates considered for rounding. */
-#define FRPF_LOW 0x0001u /* lost bits not exactly zero or half */
-#define FRPF_HALF 0x0002u /* lost a half or more */
-#define FRPF_ODD 0x0004u /* final place is currently odd */
-#define FRPF_NEG 0x0008u /* number is negative */
-
-/* Rounding policies. These are represented as a 16-bit truth table applied
- * to the predicate bits listed above. The following are the mask values
- * corresponding to the predicate bits being set; a set bit means that the
- * number should be rounded away from zero.
- */
-#define FRPMASK_LOW 0xaaaau /* lost bits below half */
-#define FRPMASK_HALF 0xccccu /* lost a half or more */
-#define FRPMASK_ODD 0xf0f0u /* final place is dod */
-#define FRPMASK_NEG 0xff00u /* number is negative */
-
-/* Useful constructed masks from the above. */
-#define FRPMASK_INEXACT (FRPMASK_LOW | FRPMASK_HALF) /* lost nonzero bits */
-#define FRPMASK_NEAR(dir) (FRPMASK_HALF&(FRPMASK_LOW | (dir))) /* */
-
-/* Generally useful rounding criteria. */
-#define FLTRND_ZERO 0 /* towards zero (truncate) */
-#define FLTRND_PROJINF FRPMASK_INEXACT /* ½³ towards (projective) ±∞ */
-#define FLTRND_NEGINF (FRPMASK_INEXACT&FRPMASK_NEG) /* down, towards -∞ */
-#define FLTRND_POSINF (FRPMASK_INEXACT&~FRPMASK_NEG) /* up, towards +∞ */
-#define FLTRND_EVEN (FRPMASK_INEXACT&FRPMASK_ODD) /* to even */
-#define FLTRND_ODD (FRPMASK_INEXACT&~FRPMASK_ODD) /* to odd */
-#define FLTRND_NEAREVEN FRPMASK_NEAR(FLTRND_EVEN) /* nearest, ties to even */
-#define FLTRND_NEARODD FRPMASK_NEAR(FLTRND_ODD) /* nearest, ties to odd */
-#define FLTRND_NEARZERO FRPMASK_NEAR(FLTRND_ZERO) /* nearest, ties to zero */
-#define FLTRND_NEARINF FRPMASK_NEAR(FLTRND_PROJINF) /* nearest, ties to ±∞ */
-#define FLTRND_NEARNEG FRPMASK_NEAR(FLTRND_NEGINF) /* nearest, ties to -∞ */
-#define FLTRND_NEARPOS FRPMASK_NEAR(FLTRND_POSINF) /* nearest, ties to +∞ */
-
/*----- General floating-point hacking ------------------------------------*/
/* --- @fltfmt_initbits@ --- *
*/
unsigned f; /* flags */
-#define IEEEF_HIDDEN 1u /* unit bit is implicit */
+#define FLTIF_HIDDEN 1u /* unit bit is implicit */
unsigned expwd; /* exponent field width %$w$% */
unsigned prec; /* precision %$p$% */
};
.B nputf
operations.
On entry,
+.I a
+should be a pointer to an arena,
+typically
+.BR arena_global (3);
.BI * buf_inout
should be a pointer to a buffer of
.BI * sz_inout
-bytes, allocated from
-.BR arena_global (3);
+bytes, allocated from the arena
+.IR a ;
instead,
.BI * buf_inout
may be null
The function is designed to be efficient when called multiple times,
retaining the same buffer across calls,
resizing it as necessary in a geometric progression.
-When the buffer is no longer wanted, free it using
-.BR xfree (3).
.PP
A typical
.B nputf
.\" @N
.\" @STR
.\" @GLUE
+.\" @GLUE3
.\" @STATIC_ASSERT
.\" @CHECK_TYPE
.\" @CONVERT_CAREFULLY
.\" @DISCARD
.\" @IGNORE
.\" @LAUNDER
+.\" @ADMIRE
+.\" @ADMIRE_BUF
.\" @RELAX
.
.\" @DEPRECATED
.BI "size_t N(" type " " array "[]);"
.BI "STR(" tokens\fR... ")"
.BI "GLUE(" tokens\fR... ", " tokens\fR... ")"
+.BI "GLUE3(" tokens\fR... ", " tokens\fR... ", " tokens\fR... ")"
.BI "STATIC_ASSERT(" cond ", " msg ");"
.BI "int CHECK_TYPE(" expty ", " expty " " x );
.IB newty " CONVERT_CAREFULLY(" newty ", " expty ", " expty " " x );
.PP
.BI "void DISCARD(" scalar ");"
.BI "void IGNORE(" variable ");"
+.IB type " LAUNDER(" type " " x ");"
+.BI "void ADMIRE(" type " " x ");"
+.BI "void ADMIRE_BUF(void *" p ", size_t " sz ");"
+.B "void RELAX;"
.PP
.BI "DEPRECATED(" msg ")"
.BI "EXECL_LIKE(" ntrail ")"
the tokens resulting from expanding its argument token lists. Each of
the argument token lists must expand to a single preprocessing token,
and the result of gluing these tokens together must be valid
-preprocessing token.
+preprocessing token. The
+.B GLUE3
+macro does the same, except that it glues together
+.I three
+argument token lists rather than two.
.PP
The
.B STATIC_ASSERT
The
.B LAUNDER
macro tries to confuse a compiler so that it `forgets' what it knows
-about a particular value. This is most useful in benchmarking or
-similar applications.
+about a particular value.
+.PP
+The
+.B ADMIRE
+macro tries to confuse a compiler so that it will faithfully compute
+the argument
+.I x
+even though it's not used for anything. The
+.B ADMIRE_BUF
+macro works similarly, but on regions of memory.
.PP
The
.B RELAX
macro tries do nothing, but in a way that a compiler won't optimize
away.
-.
+.PP
+The
+.BR LAUNDER ,
+.BR ADMIRE ,
+.BR ADMIRE_BUF ,
+and
+.B RELAX
+macros are most useful in benchmarking and similar applications.
.SS Annotations
The following annotations can be attached to function declarations and
definitions, as part of the declaration specifiers. (Other positions
e = 4096
m = ffffffff
z = 7fefffff ffffffff
-err = INEXACT
+err = INEXACT | OFLOW
[encf128]