.\" -*-nroff-*-
.\"
.\" Manual for floating-point format conversions
.\"
.\" (c) 2024 Straylight/Edgeware
.\"
.
.\"----- Licensing notice ---------------------------------------------------
.\"
.\" This file is part of the mLib utilities library.
.\"
.\" mLib is free software: you can redistribute it and/or modify it under
.\" the terms of the GNU Library General Public License as published by
.\" the Free Software Foundation; either version 2 of the License, or (at
.\" your option) any later version.
.\"
.\" mLib is distributed in the hope that it will be useful, but WITHOUT
.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
.\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
.\" License for more details.
.\"
.\" You should have received a copy of the GNU Library General Public
.\" License along with mLib.  If not, write to the Free Software
.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
.\" USA.
.
.\"--------------------------------------------------------------------------
.so ../defs.man \" @@@PRE@@@
.
.\"--------------------------------------------------------------------------
.TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
.\" @FLTERR_OK
.\" @FLTERR_INVAL
.\" @FLTERR_INEXACT
.\" @FLTERR_UFLOW
.\" @FLTERR_OFLOW
.\" @FLTERR_REPR
.\" @FLTERR_ALLERRS
.
.\" @FRPF_LOW
.\" @FRPF_HALF
.\" @FRPF_ODD
.\" @FRPF_NEG
.\" @FRPMASK_LOW
.\" @FRPMASK_HALF
.\" @FRPMASK_ODD
.\" @FRPMASK_NEG
.\" @FRPMASK_INEXACT
.\" @FRPMASK_NEAR
.\" @FLTRND_ZERO
.\" @FLTRND_PROJINF
.\" @FLTRND_NEGINF
.\" @FLTRND_POSINF
.\" @FLTRND_EVEN
.\" @FLTRND_ODD
.\" @FLTRND_NEAREVEN
.\" @FLTRND_NEARODD
.\" @FLTRND_NEARZERO
.\" @FLTRND_NEARINF
.\" @FLTRND_NEARNEG
.\" @FLTRND_NEARPOS
.
.\" @FLTFMT_NEG
.\" @FLTFMT_INF
.\" @FLTFMT_QNAN
.\" @FLTFMT_SNAN
.\" @FLTFMT_ZERO
.\" @FLTFMT_NANMASK
.\" @FLOATBITS_INIT
.\" @fltfmt_initbits
.\" @fltfmt_freebits
.\" @fltfmt_allocfrac
.\" @fltfmt_copybits
.\" @fltfmt_round
.
.\" @FLTIF_HIDDEN
.\" @fltfmt_f16
.\" @fltfmt_f32
.\" @fltfmt_f64
.\" @fltfmt_f128
.\" @fltfmt_mini
.\" @fltfmt_bf16
.\" @fltfmt_idblext80
.
.\" @fltfmt_encieee
.\" @fltfmt_encf16
.\" @fltfmt_encf32
.\" @fltfmt_encf64
.\" @fltfmt_encf128
.\" @fltfmt_encmini
.\" @fltfmt_encbf16
.\" @fltfmt_encidblext80
.\" @fltfmt_decieee
.\" @fltfmt_decf16
.\" @fltfmt_decf32
.\" @fltfmt_decf64
.\" @fltfmt_decf128
.\" @fltfmt_decmini
.\" @fltfmt_decbf16
.\" @fltfmt_decidblext80
.
.\" @fltfmt_encflt
.\" @fltfmt_encdbl
.\" @fltfmt_encldbl
.\" @fltfmt_decflt
.\" @fltfmt_decdbl
.\" @fltfmt_decldbl
.
.\" @fltfmt_flttof32l
.\" @fltfmt_flttof32b
.\" @fltfmt_dbltof64l
.\" @fltfmt_dbltof64b
.\" @fltfmt_f32ltoflt
.\" @fltfmt_f32btoflt
.\" @fltfmt_f64ltodbl
.\" @fltfmt_f64btodbl
.
.\"--------------------------------------------------------------------------
.SH NAME
fltfmt \- floating-point format conversions
.
.\"--------------------------------------------------------------------------
.SH SYNOPSIS
.
.nf
.B "#define FLTERR_OK 0"
.B "#define FLTERR_INVAL ..."
.B "#define FLTERR_INEXACT ..."
.B "#define FLTERR_UFLOW ..."
.B "#define FLTERR_OFLOW ..."
.B "#define FLTERR_REPR ..."
.B "#define FLTERR_ALLERRS ..."
.PP
.ta 40n
.B "#define FRPF_LOW 1u"
.B "#define FRPF_HALF 2u"
.B "#define FRPF_ODD 4u"
.B "#define FRPF_NEG 8u"
.B "#define FRPMASK_LOW 0xaaaau"
.B "#define FRPMASK_HALF 0xccccu"
.B "#define FRPMASK_ODD 0xf0f0u"
.B "#define FRPMASK_NEG 0xff00u"
.B "#define FRPMASK_INEXACT ...	/* LOW | HALF */"
.BI "unsigned FRPMASK_NEAR(unsigned " dir ");	/* HALF&(LOW | " dir ") */"
.B "#define FLTRND_ZERO ...	/* 0 */"
.B "#define FLTRND_PROJINF ...	/* INEXACT */"
.B "#define FLTRND_NEGINF ...	/* INEXACT&NEG */"
.B "#define FLTRND_POSINF ...	/* INEXACT&~NEG */"
.B "#define FLTRND_EVEN ...	/* INEXACT&ODD */"
.B "#define FLTRND_ODD ...	/* INEXACT&~ODD */"
.B "#define FLTRND_NEAREVEN ...	/* HALF&(LOW | ODD) */"
.B "#define FLTRND_NEARODD ...	/* HALF&(LOW | ~ODD) */"
.B "#define FLTRND_NEARZERO ...	/* HALF&LOW */"
.B "#define FLTRND_NEARINF ...	/* HALF */"
.B "#define FLTRND_NEARNEG ...	/* HALF&(LOW | NEG) */"
.B "#define FLTRND_NEARPOS ...	/* HALF&(LOW | ~NEG) */"
.PP
.ta 2n
.B "#define FLTF_NEG ..."
.B "#define FLTF_INF ..."
.B "#define FLTF_QNAN ..."
.B "#define FLTF_SNAN ..."
.B "#define FLTF_ZERO ..."
.B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
.B "struct floatbits {"
.B "	unsigned f;"
.B "	int exp;"
.B "	arena *a;"
.B "	uint32 *frac;"
.B "	unsigned n, fracsz;"
.B "};"
.B "#define FLOATBITS_INIT { ...\& };"
.PP
.BI "void fltfmt_initbits(struct floatbits *" x );
.BI "void fltfmt_freebits(struct floatbits *" x );
.BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
.ta \w'\fBvoid fltfmt_copybits('u
.BI "void fltfmt_copybits(struct floatbits *" z_out ,
.BI "	const struct floatbits *" x );
.ta \w'\fBunsigned fltfmt_round('u
.BI "unsigned fltfmt_round(struct floatbits *" z_out ,
.BI "	const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " n );
.PP
.
.ta 2n
.B "#define FLTIF_HIDDEN ..."
.B "struct fltfmt_ieeefmt {"
.B "	unsigned f;"
.B "	unsigned expwd;"
.B "	unsigned prec;"
.B "};"
.B "const struct fltfmt_ieeefmt fltfmt_f16;"
.B "const struct fltfmt_ieeefmt fltfmt_f32;"
.B "const struct fltfmt_ieeefmt fltfmt_f64;"
.B "const struct fltfmt_ieeefmt fltfmt_f128;"
.B "const struct fltfmt_ieeefmt fltfmt_mini;"
.B "const struct fltfmt_ieeefmt fltfmt_bf16;"
.B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
.PP
.ta \w'\fBunsigned fltfmt_encieee('u
.BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
.BI "	uint32 *" z ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encf16('u
.BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encf32('u
.BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encf64('u
.BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encf128('u
.BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encmini('u
.BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encbf16('u
.BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.ta \w'\fBunsigned fltfmt_encidblext80('u
.BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
.BI "	const struct floatbits *" x ,
.BI "	unsigned " r ", unsigned " errmask );
.PP
.ta \w'\fBunsigned fltfmt_decieee('u
.BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
.BI "	struct floatbits *" z_out ", const uint32 *" x );
.BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
.BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
.BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
.BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
.BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
.BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
.ta \w'\fBunsigned fltfmt_decidblext80('u
.BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
.BI "	uint16 " se ", kludge64 " m );
.PP
.ta \w'\fBunsigned fltfmt_encflt('u
.BI "unsigned fltfmt_encflt(float *" z_out ,
.BI "	const struct floatbits *" x ", unsigned " r );
.ta \w'\fBunsigned fltfmt_encdbl('u
.BI "unsigned fltfmt_encdbl(double *" z_out ,
.BI "	const struct floatbits *" x ", unsigned " r );
.ta \w'\fBunsigned fltfmt_encldbl('u
.BI "unsigned fltfmt_encldbl(long double *" z_out ,
.BI "	const struct floatbits *" x ", unsigned " r );
.ta \w'\fBunsigned fltfmt_decflt('u
.BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
.BI "	float *" x ", unsigned " r );
.ta \w'\fBunsigned fltfmt_decdbl('u
.BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
.BI "	double *" x ", unsigned " r );
.ta \w'\fBunsigned fltfmt_decldbl('u
.BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
.BI "	long double *" x ", unsigned " r );
.PP
.BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
.BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
.BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
.BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
.BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
.BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
.BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
.BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
.
.\"--------------------------------------------------------------------------
.SH DESCRIPTION
.
The
.B "<mLib/fltfmt.h>"
header file defines structures, macros, and functions
for converting floating-point values between various formats,
including the native floating-point formats
and IEEE\ 754 and related formats.
.
.SS Error conditions
Most of the functions in this module return an unsigned integer.
A return value of zero means that no error occurred;
set bits indicate various error conditions.
.TP
.B FLTERR_INVAL
A binary input to be decoded contained an invalid bit pattern,
e.g., an unnormalized input value with a nonminimal exponent.
The function will have produced a reasonable output anyway,
but the original value will not be recoverable from the result.
.TP
.B FLTERR_INEXACT
The conversion was inexact.
Converting the output back into the format of the input
may not reproduce the original input value.
This error flag is sometimes set conservatively.
.TP
.B FLTERR_UFLOW
The conversion underflowed:
a nonzero input was too tiny (in absolute value) to represent,
and a zero result was returned.
.TP
.B FLTERR_OFLOW
The conversion overflowed:
a finite input was too huge (in absolute value) to represent,
and either the appropriately signed infinity
or largest-magnitude finite value
was returned, determined by the requested rounding mode.
.TP
.B FLTERR_REPR
The output format failed entirely to represent the input value.
The result is zero if the input was a NaN,
or the appropriately signed largest-magnitude finite value
if the input was an infinity.
.
.SS Rounding modes
The rounding system works as follows.
There are four
.I rounding predicates
considered when a rounding decision is taken.
These are determined from the unrounded input value
.IR x ,
and the two nearest rounded values
.RI | u "|\ \*(<=\ |" x |
and
.RI | v "|\ >\ |" x |.
The predicates are as follows.
.TP
.B FRPF_LOW
If
.IR x "\ \*(/=\ " u
and
.IR x "\ \*(/=\ (" u "\ +\ " v )/2,
i.e.,
.I x
is neither equal to a rounded value,
nor exactly halfway between two rounded values.
This predicate is sometimes referred to as a `sticky bit'.
.TP
.B FRPF_HALF
If
.RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
i.e.,
.I x
is halfway or more towards its larger rounded neighbour.
.TP
.B FRPF_ODD
If the least significant digit of
.I u
is odd.
In binary floating-point formats,
this is just the least significant bit of
.IR u .
.TP
.B FRPF_NEG
If
.I x
is negative.
.PP
These four predicates are packed into a four-bit mask value
.I rf
between 0 and 15.
A
.I rounding mode
is simply a 16-bit mask:
if bit
.I rf
of the rounding-mode mask is set,
then
.I x
is rounded to
.IR v ;
otherwise it is rounded to
.IR u .
That is, the rounding-mode mask is essentially a truth table.
Rounding modes with
.I set
bits corresponding to situations where both
.B FRPF_LOW
and
.B FRPF_HALF
are false,
i.e., where
.I x
is already a rounded value,
are forbidden.
.PP
Some useful machinery is provided
for constructing rounding-mode masks.
.BR FRPMASK_LOW ,
.BR FRPMASK_HALF ,
.BR FRPMASK_ODD ,
and
.B FRPMASK_NEG
are masks with set bits corresponding to their respective predicates.
Bitwise boolean logic can be applied to these masks
in order to calculate the masks corresponding to
the same logical expression applied to the individual predicates.
.B FRPMASK_INEXACT
holds if
.B LOW
or
.B HALF
holds;
i.e., if
.IR x "\ \*(/=\ " u ;
as mentioned above, only these bits may be set
in a valid rounding-mode mask.
.BI FRPMASK_NEAR( dir )
is the mask for rounding to nearest with ties broken according to
.IR dir ,
which is another rounding-mode mask.
The complete set of predefined masks is listed above in the synopsis,
together with their description in terms of the basic predicates.
The usual IEEE rounding mode is
round-to-nearest/ties-to-even,
denoted
.BR FLTRND_NEAREVEN .
This is likely a good option
if there is no compelling argument for a different specific choice.
.
.SS Direct conversions
The functions
.B fltfmt_flttof32l
and
.B fltfmt_flttof32b
convert a
.B float
argument to an IEEE\ 754 Binary32 value
in little- or big-endian byte order, respectively;
similarly
.B fltfmt_dbltof64l
and
.B fltfmt_dbltof64b
convert a
.B double
argument to an IEEE\ 754 Binary64 value
in little- or big-endian byte order, respectively.
The value to convert is given as
.I x
and the result is written at the address
.IR p .
.PP
The functions
.B fltfmt_f32ltoflt
and
.B fltfmt_f32btoflt
convert an IEEE\ 754 Binary32 value,
in little- or big-endian byte order, respectively,
to a
.BR float ;
similarly,
.B fltfmt_f64ltodbl
and
.B fltfmt_f64btodbl
convert an IEEE\ 754 Binary64 value,
in little- or big-endian byte order, respectively,
to a
.BR double .
The value to convert is read from address
.I p
and the result is written to
.RI * z_out \fR.
.PP
All of these functions additionally take a rounding mode
.I r
which is applied if the conversion cannot be performed exactly,
and return an error code as described above.
.PP
On many modern platforms, the
.B float
and
.B double
types are represented internally using the IEEE
Binary32 and Binary64 formats,
so these conversions are trivial, or nearly so.
A complication arises on PA-RISC and older MIPS processors:
see the descriptions of
.B fltfmt_encieee
and
.B fltfmt_decdbl
below for the details.
.PP
On other platforms,
the conversion is decidedly nontrivial,
and makes use of the machinery described below;
this may also be useful for more complex conversions.
.
.SS The floatbits structure
In order to avoid a combinatorial explosion in conversion operations,
all the basic conversions involve,
as source or target,
a `common currency' format represented by the type
.BR "struct floatbits" .
.PP
This structure consists of
a set of flags
.BR f ;
a signed exponent
.BR exp ;
an
.B arena
pointer
.BR a ;
a pointer
.B frac
to a vector of
.B uint32
values;
the length
.B n
of the
.B frac
vector; and
the currently allocated size
.B fracsz
of the vector.
Both
.B n
and
.B fracsz
count elements, not bytes.
.PP
Storage for
.B frac
comes from the arena
.BR a .
Only the first
.B n
words of
.B frac
are significant;
.B frac[0]
is the most significant word.
The value represented by a
.B struct floatbits
is never changed by adding or removing zero-valued words
at the end of the
.B frac
vector.
It is always the case that
.BR n "\ \*(<=\ " fracsz ;
if
.B fracsz
is zero then
.B frac
may be a null pointer.
.PP
The interpretation of the
.B exp
and
.B frac
members depends on the flags set in
.BR f .
Apart from
.BR FLTF_NEG ,
the flags are
.IR "mutually exclusive" :
at most one flag may be set.
.TP
.B FLTF_NEG
The value is negative.
.TP
.B FLTF_INF
The value is positive or negative infinity.
The
.B exp
and
.B frac
are ignored.
.TP
.BR FLTF_QNAN " and " FLTF_SNAN
The value is a quiet or signalling not-a-number, respectively.
The
.B exp
is ignored.
The payload is stored in
.BR frac ;
the payload does not include the `quiet' bit.
.TP
.B FLTF_ZERO
The number is zero.
Negative zero is distinct from positive zero.
The
.B exp
and
.B frac
are ignored.
.IP "All non-sign bits clear"
The value is a finite nonzero number.
The
.B frac
holds the significand.
The most significant significand bit must be set, so
(a)\ the number must be nonzero, and
(b)\ the significand is normalized.
The significand is interpreted as a fraction
.RI "1/2\ \*(<=\ " m "\ <\ 1."
If
.I e
is the value of the
.B exp
member,
and
.I s
is \-1 if
.B FLTF_NEG
is set
or +1 if
.B FLTF_NEG
is clear,
then the number represented is
.IR s "\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
.PP
A
.B struct floatbits
can be initialized statically by
.BR FLOATBITS_INIT ,
or dynamically using the function
.BR fltfmt_initbits .
These are not quite the same:
.B FLOATBITS_INIT
initializes
.B a
to
.BR &arena_stdlib ,
while
.B fltfmt_initbits
sets it to the runtime value of
.BR arena_global .
With this exception,
both forms of initialization set the value to (positive) zero;
neither allocates any storage or other resources,
leaving
.B frac
null.
In this state, it is safe to modify the arena pointer
.B a
if the default initialization is unsatisfactory.
.PP
The
.B fltfmt_allocfrac
function is given a pointer
.I x
to a
.B struct floatbits
and a length
.IR n :
it ensures that there is enough storage at
.IB x ->frac
for at least
.I n
words:
if the current size is too small,
then any existing buffer is discarded and a new one allocated
from the arena
.IB x ->a \fR;
any existing contents of the buffer are lost.
On exit,
.IB x ->n
is set to
.IR n .
.PP
The
.B fltfmt_freebits
function
frees a
.B struct floatbits
structure, releasing the storage held by
.BR frac .
.PP
The
.B fltfmt_copybits
function simply copies its input
.I x
to its output
.IR z_out ;
both must refer to initialized
.B struct floatbits
structures.
If
.I z_out
and
.I x
are equal, then nothing happens.
.PP
Finally, the
.B fltfmt_round
function rounds the value in the
.B struct floatbits
structure
.I x
to
.I n
bits using the rounding mode
.IR r ;
the result is written to
.IR z_out ;
it is permitted for
.I z_out
to be equal to
.IR x .
If
.I x
is a zero or infinity,
then the output is equal to the input,
as if
.B fltfmt_copybits
had been called instead.
If
.I x
is a NaN,
then the payload is simply truncated to
.I n
bits, without regard to the rounding mode.
Otherwise, the input is nonzero and finite;
the significand is rounded to
.I n
bits according to the rounding mode.
In all cases, the return value is
zero if the output is equal to the input,
or
.B FLTERR_INEXACT
if the rounded result is not equal to the input.
.
.SS IEEE and related formats
An IEEE floating-point format is characterized by three parameters:
the
.I "exponent width"
.IR w ,
the
.I "precision"
.IR p ,
and
the
.I "unit width"
.IR h .
.PP
The encoded value consists of
.IR p "\ +\ " w "\ +\ " h
bits.
This is divided, from the most significant bit downwards,
into a
.I "sign bit"
.IR s ,
a
.IR w -bit
.I "biased exponent"
.IR e \*',
a
.IR h -bit
.I "unit bit"
.IR u ,
and a
.RI ( p "\ \-\ 1)-bit"
.I fraction
.IR f .
The
.I "exponent bias"
is
.IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
the true exponent
.I e
is calculated from the biased exponent by
.IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
The unit and fraction field are usually interpreted as denoting
a significand
.IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
with
.RI "0\ \*(<=\ " m "\ <\ 2."
If
.I h
is zero,
the value of the unit bit
.I u
is implied by the exponent as described below.
The encoded value is interpreted as follows.
.hP \*o
If
.IR e "\ =\ \-" e \*(us0\*(ue
then the value is zero or a subnormal,
with the value
.RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e +1\*(se.
In particular,
if
.IR m "\ =\ 0"
then the value is positive or negative zero,
according to the sign bit
.IR s .
If
.I h
is zero then
.IR u "\ =\ 0;"
if
.I h
is nonzero
but
.IR u "\ \*(/=\ 0"
then the encoding is invalid:
decoding returns
.BR FLTERR_INVAL ,
but the result will be as described.
.hP \*o
If
.RI "1\ \-\ " e "\*(us0\*(ue \*(<=\ " e "\ < 2" e "\*(us0\*(ue\ +\ 1"
then the value is a (supposedly) normal number
.RI (\-1)\*(ss s "\*(se\ \*(mu\ " m "\ \*(mu\ 2\*(ss" e \*(se.
If
.I h
is zero then
.IR u "\ =\ 1;"
if
.I h
is nonzero
but
.IR u "\ =\ 0"
then the encoding is invalid:
decoding returns
.BR FLTERR_INVAL ,
but the result will be as described.
.hP \*o
If
.IR e "\ =\ 2" e "\*(us0\*(ue\ +\ 1"
and
.IR f "\ =\ 0"
then the value is positive or negative infinity,
according to the sign bit
.IR s .
If
.I h
is nonzero and
.IR u "\ =\ 0"
then the encoding is invalid:
decoding returns
.BR FLTERR_INVAL ,
but the result will still be infinity.
.hP \*o
If
.IR e "\ =\ 2" e "\*(us0\*(ue\ +\ 1"
and
.IR f "\ \*(/=\ 0"
then the value is not-a-number (NaN).
The most significant bit of
.I f
is the `quiet bit':
if the bit is set, the value is a `quiet NaN';
if the bit is clear, the value is a `signalling NaN'.
(This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1;
it has the advantage that a signalling NaN can be `quieted'
by setting the most significant fraction bit;
HP-PA and older MIPS processors use the opposite convention
for distinguishing quiet and signalling NaNs,
but a signalling NaN with all but the most significant
fraction bit zero cannot be `quieted' by clearing the
most significant bit, since the resulting encoding denotes
an infinity, not a QNaN.)
The remaining bits of
.I f
form the
.IR payload .
Positive and negative NaN values are distinguished,
with sign determined by the sign bit.
If
.I h
is nonzero and
.IR u "\ =\ 0"
then the encoding is invalid:
decoding returns
.BR FLTERR_INVAL ,
but the result will still be a NaN;
the unit bit does not affect the NaN payload.
.PP
An IEEE format is described by the type
.BR "struct fltfmt_ieeefmt" .
This has three members:
.TP
.B f
A flags word.
If
.B FLTIF_HIDDEN
is set, then the format uses a `hidden bit' convention:
in the notation above
.IR h "\ =\ 0;"
if the flag is clear,
the format has an explicit unit bit, and
.IR h "\ =\ 1."
.TP
.B expwd
The exponent width;
in the notation above this is
.IR w .
.TP
.B prec
The precision;
in the notation above this is
.IR p .
.PP
The
.B fltfmt_encieee
and
.B fltfmt_decieee
functions convert between IEEE and related formats
and the internal
.B struct floatbits
representation.
They respectively encode or decode an IEEE-format value,
as described above,
from a vector of
.B uint32
words,
most-significant word first
\(en so the sign bit is in the first word.
For formats whose size is not a multiple of 32,
the encoding is
.IR right-aligned :
the least significant bit of the fraction
is in the least significant bit of the last word in the vector.
.PP
The
.B fltfmt_encieee
function encodes an IEEE-format value.
The function is given five arguments:
a pointer
.I fmt
to the IEEE format description,
a pointer
.I z
to a sufficiently long vector of 32-bit words
in which to store the encoded value,
a pointer
.I x
to the
.B struct floatbits
holding the value to encode,
a rounding mode
.IR r ,
and an error mask
.IR errmask .
If the input is a NaN,
then the payload is truncated to fit
regardless of the rounding mode,
discarding low-significant bits;
if the input is a finite value,
then the significand is rounded to fit
according to the requested rounding mode.
If a signalling NaN ends up with all of its payload bits zero,
as a result of truncation or otherwise,
then the least-significant bit of the output payload is forced on
in order to distinguish the result from an infinity.
The possible errors are
.B FLTERR_UFLOW
if the value is unrepresentably tiny,
.B FLTERR_OFLOW
if the value is unrepresentably huge,
and
.B FLTERR_INEXACT
if the encoding fails to preserve the input value exactly;
hence
.B FLTERR_INEXACT
is set whenever
.B FLTERR_OFLOW
or
.B FLTERR_UFLOW
is set,
or if bits are lost due to NaN-payload truncation or rounding.
If, during encoding,
an error is encountered,
processing stops immediately
unless the corresponding bit of
.I errmask
is set.
.PP
The
.B fltfmt_decieee
function decodes an IEEE-format value.
The function is given three arguments:
a pointer
.I fmt
to the IEEE format description,
a pointer
.I z_out
to the initialized
.B struct floatbits
to fill in, and
a pointer
.I x
to the IEEE-encoded value to decode,
in a vector of 32-bit words as described above.
The only error that can occur during decoding is
.BR FLTERR_INVAL :
as described above,
this occurs in non-hidden-bit formats
when the unit bit does not match that implied by the exponent;
the result is returned anyway,
with the unit bit interpreted as encoded in finite numbers,
and discarded in infinities and NaNs.
.PP
A number of IEEE and IEEE-like formats are predefined:
for format
.IR fmt ,
there is
an IEEE format description, named
.BI fltfmt_ fmt \fR,
together with encoding and decoding functions, named
.BI fltfmt_enc fmt
and
.BI fltfmt_dec fmt \fR;
for the most part,
these functions use more convenient types
to hold encoded values.
.TP
.B "fltfmt_f16"
The IEEE\ 754 Binary16 format, with
.IR w "\ =\ 5,"
.IR p "\ =\ 11,"
and
.IR h "\ =\ 0;"
stored in a
.BR uint16 .
.TP
.B "fltfmt_f32"
The IEEE\ 754 Binary32 (`single precision') format, with
.IR w "\ =\ 8,"
.IR p "\ =\ 24,"
and
.IR h "\ =\ 0;"
stored in a
.BR uint32 .
.TP
.B "fltfmt_f64"
The IEEE\ 754 Binary64 (`double precision') format, with
.IR w "\ =\ 11,"
.IR p "\ =\ 53,"
and
.IR h "\ =\ 0;"
stored in a
.B kludge64
(see
.BR bits (3)
for details).
.TP
.B "fltfmt_f128"
The IEEE\ 754 Binary128 (`quad precision') format, with
.IR w "\ =\ 15,"
.IR p "\ =\ 113,"
and
.IR h "\ =\ 0;"
stored in a big-endian vector of
.BR uint32 ,
just as for the generic functions described above.
.TP
.B "fltfmt_mini"
An eight-bit `minifloat' format, with
.IR w "\ =\ 4,"
.IR p "\ =\ 4,"
and
.IR h "\ =\ 0;"
stored in an
.BR octet .
.TP
.B "fltfmt_bf16"
The Google `BFloat16' format, with
.IR w "\ =\ 8,"
.IR p "\ =\ 8,"
and
.IR h "\ =\ 0;"
stored in a
.BR uint16 .
.TP
.B "fltfmt_idblext80"
The Intel 8087 80-bit `double extended' format, with
.IR w "\ =\ 15,"
.IR p "\ =\ 64,"
and
.IR h "\ =\ 1;"
stored as a
.B uint16
holding the sign and exponent,
and a
.B kludge64
holding the significand.
.
.SS Native formats
There are also functions for converting between
.B struct floatbits
and the implementation's native floating-point types
.B float
(abbreviated
.BR flt ),
.B double
(abbreviated
.BR dbl ),
and
.B "long double"
(abbreviated
.BR ldbl ).
.PP
For each native type abbreviation
.IR ty ,
there are functions
.BI fltfmt_enc ty
and
.BI fltfmt_dec ty \fR,
which respectively convert the value held in
.B struct floatbits
to or from a value of the corresponding C type.
(The functions acting on
.B long double
values are only available if the platform supports C99 or later.)
.PP
The
.BI fltfmt_enc ty
functions read an input value from a
.B struct floatbits
pointer
.I x
and store the encoded result through a pointer
.I z_out
to the appropriate C type;
the function also receives a rounding mode
.IR r ,
but see below.
The
.BI fltfmt_dec ty
functions are given an input value of the appropriate C type,
and store the decoded result in a
.B struct floatbits
structure pointed to by
.IR z_out ;
again, the function also receives a rounding mode
.IR r ,
but see below.
.PP
These functions can use two different strategies for conversion.
If the compile-time configuration step detects
that the implementation is using
a specific, supported format for a native type,
then conversions involving the native type
are performed using the existing machinery for that format.
For example, if,
as is in fact nearly universal on modern-ish systems,
the
.B double
type uses the IEEE\ 754 Binary64 format,
then
.B fltfmt_encdbl
and
.B fltfmt_decdbl
use the functions
.B fltfmt_encf64
and
.B fltfmt_decf64
described above for the conversion.
This approach has the benefit that
everything is done under the control of the
.B fltfmt
machinery,
which can faithfully preserve signs of zero values,
and NaN payloads.
The error conditions are, for the most part, the same as for the
.B fltfmt_encieee
and
.B fltfmt_decieee
functions described above.
The encoding functions have an additional source of inexactness
on PA-RISC and older MIPS processors
which use the reversed quiet/signalling NaN convention:
a quiet NaN with an all-zero payload
is not representable on such implementations
(the encoding is an infinity instead);
in this situation,
the least significant payload bit is forced on,
just as if the payload required truncation,
and
.B FLTERR_INEXACT
is returned.
.
.\"--------------------------------------------------------------------------
.SH "SEE ALSO"
.
.BR bits (3),
.BR mLib (3).
.
.\"--------------------------------------------------------------------------
.SH AUTHOR
.
Mark Wooding, <mdw@distorted.org.uk>
.
.\"----- That's all, folks --------------------------------------------------