chiark - git - mdw - mLib/blob - struct/buf-float.c

   1 /* -*-c-*-
   2  *
   3  * Encoding and decoding floating-point values
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <float.h>
  31 #include <math.h>
  32
  33 #include "bits.h"
  34 #include "buf.h"
  35
  36 /*----- Formatting primitives ---------------------------------------------*/
  37
  38 /* We use the IEEE 754 `binary64' format.  Briefly:
  39  *
  40  *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
  41  *     %$s = -1$%..  The format is signed-magnitude, so everything else is
  42  *     the same for positive and negative numbers.
  43  *
  44  *   * The next eleven bits are the biased exponent %$e$%.
  45  *
  46  *   * The remaining 52 bits are the significand %$m$%.
  47  *
  48  * If %$0 < e < 2047$% then the encoding represents the normal number
  49  * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
  50  *
  51  * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
  52  * negative zero.
  53  *
  54  * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
  55  * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
  56  *
  57  * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
  58  * negative infinity.
  59  *
  60  * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
  61  * the most significant bit of %$m$% is set then this is a quiet NaN;
  62  * otherwise it's a signalling NaN.
  63  */
  64
  65 /* --- @f64_to_k64@ --- *
  66  *
  67  * Arguments:   @double x@ = a floating-point number
  68  *
  69  * Returns:     A 64-bit encoding of @x@.
  70  *
  71  * Use:         Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
  72  *              caveats.
  73  */
  74
  75 static kludge64 f64_to_k64(double x)
  76 {
  77   kludge64 k;
  78   uint32 lo, hi, t;
  79   int e; double m;
  80
  81   /* Some machinery before we start. */
  82
  83 #ifdef isnan
  84 #  define NANP(x) isnan(x)
  85 #else
  86 #  define NANP(x) (!((x) == (x)))
  87 #endif
  88
  89 #ifdef isinf
  90 #  define INFP(x) isinf(x)
  91 #else
  92 #  define INFP(x) ((x) > DBL_MAX || (x) < -DBL_MAX)
  93 #endif
  94
  95 #ifdef signbit
  96 #  define NEGP(x) signbit(x)
  97 #else
  98 #  define NEGP(x) ((x) < 0)             /* incorrect for negative zero! */
  99 #endif
 100
 101   if (NANP(x)) {
 102     /* A NaN. */
 103     hi = 0x7ff80000; lo = 0;
 104   } else if (INFP(x)) {
 105     /* Positive or negative infinity. */
 106     hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
 107   } else if (x == 0) {
 108     /* Positive or negative zero. */
 109     hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
 110   } else {
 111     /* A normal or subnormal number.  Now we have to do some actual work. */
 112
 113     /* Let's get the sign dealt with so we don't have to worry about it any
 114      * more.
 115      */
 116     if (!NEGP(x)) hi = 0;
 117     else { x = -x; hi = 0x80000000; }
 118
 119     /* Now we start on the value.  The first thing to do is to split off the
 120      * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
 121      */
 122     m = frexp(x, &e);
 123
 124     /* If our number is too big, we'll round it to infinity.  This will
 125      * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
 126      */
 127     if (e > 1024)
 128       { hi |= 0x7ff00000; lo = 0; }
 129     else {
 130       /* Our number is sufficiently small that we can represent it at least
 131        * approximately (though maybe we'll have to flush it to zero).  The
 132        * next step, then, is to pull the significand bits out.
 133        */
 134
 135       /* Determine the correct exponent to store.  We're not going to bias it
 136        * yet, but this is where we deal with subnormal numbers.  Our number
 137        * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
 138        * case, there's an implicit bit which we'll clear.  Otherwise, if it's
 139        * subnormal, we'll scale our floating-point number so that the
 140        * significand will look right when we extract it, and adjust the
 141        * exponent so that, when we're finally done, it will have the correct
 142        * sentinel value.
 143        */
 144       if (e > -1022) m -= 0.5;
 145       else { m = ldexp(m, 1021 + e); e = -1022; }
 146
 147       /* Now we pull out the 53 bits of the significand.   This will, in
 148        * general, leave a tail which we address through rounding.  Scale it
 149        * up so that we end up with %$0 \le m' < 2$%; then we round up if
 150        * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
 151        * set.
 152        */
 153       t = ldexp(m, 21); m -= ldexp(t, -21);
 154       lo = ldexp(m, 53); m -= ldexp(lo, -53);
 155       m = ldexp(m, 54);
 156
 157       /* Round the number if necessary. */
 158       if (lo&1 ? m >= 1.0 : m > 1)
 159         { lo = U32(lo + 1); if (!lo) t++; }
 160
 161       /* Now we just put the pieces together.  Note that our %$e$% is one
 162        * greater than it should be, because our implicit bit should have
 163        * been the unit bit not the 1/2 bit.
 164        */
 165       hi |= ((uint32)(e + 1022) << 20) | t;
 166     }
 167   }
 168
 169   /* Convert to external format and go home. */
 170   SET64(k, hi, lo); return (k);
 171
 172 #undef NANP
 173 #undef INFP
 174 #undef NEGP
 175 }
 176
 177 /* --- @k64_to_f64@ --- *
 178  *
 179  * Arguments:   @double *x_out@ = where to put the result
 180  *              @kludge64 k@ = a 64-bit encoding of a floating-point value
 181  *
 182  * Returns:     Zero on success, @-1@ on failure.
 183  *
 184  * Use:         Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
 185  *              caveats.
 186  */
 187
 188 static int k64_to_f64(double *x_out, kludge64 k)
 189 {
 190   uint32 lo, hi, t;
 191   int s, e; double x;
 192
 193   /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
 194
 195   /* Pick the encoded number apart. */
 196   hi = HI64(k); lo = LO64(k);
 197   s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
 198
 199   /* Deal with various special cases. */
 200   if (e == 2047) {
 201     /* Maximum exponent indicates (positive or negative) infinity or NaN. */
 202
 203     if (t || lo) {
 204       /* It's a NaN.  We're not going to be picky about which one.  If we
 205        * can't represent it then we'll just have to fail.
 206        */
 207
 208 #ifdef NAN
 209       x = NAN;
 210 #else
 211       return (-1);
 212 #endif
 213     } else {
 214       /* It's an infinity.  If we don't have one of those to hand, then pick
 215        * something really big.
 216        */
 217
 218 #ifdef INFINITY
 219     x = s ? -INFINITY : INFINITY;
 220 #else
 221     x = s ? -DBL_MAX : DBL_MAX;
 222 #endif
 223     }
 224   } else {
 225     /* It's a finite number, though maybe it's weird in some way. */
 226
 227     if (e == 0) {
 228       /* Minimum exponent indicates zero or a subnormal number.  The
 229        * subnormal exponent is a sentinel value that shouldn't be taken
 230        * literally, so we should fix that.  If the number is actually zero
 231        * then the exponent won't matter much so don't bother checking.
 232        */
 233
 234       e = 1;
 235     } else {
 236       /* It's a normal number.  In which case there's an implicit bit which
 237        * we can now set.
 238        */
 239
 240       t |= 0x00100000;
 241     }
 242
 243     /* All that remains is to stuff the significant and exponent into a
 244      * floating point number.  We'll have to do this in pieces, and we'll
 245      * lean on the floating-point machinery to do rounding correctly.
 246      */
 247     x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
 248     if (s) x = -x;
 249   }
 250
 251   /* And we're done. */
 252   *x_out = x; return (0);
 253 }
 254
 255 /*----- External functions ------------------------------------------------*/
 256
 257 /* --- @buf_putf64{,b,l} --- *
 258  *
 259  * Arguments:   @buf *b@ = a buffer to write to
 260  *              @double x@ = a number to write
 261  *
 262  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 263  *
 264  *              On C89, this function can't detect negative zero so these
 265  *              will be silently written as positive zero.
 266  *
 267  *              This function doesn't distinguish NaNs.  Any NaN is written
 268  *              as a quiet NaN with all payload bits zero.
 269  *
 270  *              A finite value with too large a magnitude to be represented
 271  *              is rounded to the appropriate infinity.  Other finite values
 272  *              are rounded as necessary, in the usual IEEE 754 round-to-
 273  *              nearest-or-even way.
 274  */
 275
 276 int buf_putf64(buf *b, double x)
 277   { return (buf_putk64(b, f64_to_k64(x))); }
 278 int buf_putf64b(buf *b, double x)
 279   { return (buf_putk64b(b, f64_to_k64(x))); }
 280 int buf_putf64l(buf *b, double x)
 281   { return (buf_putk64l(b, f64_to_k64(x))); }
 282
 283 /* --- @buf_getf64{,b,l} --- *
 284  *
 285  * Arguments:   @buf *b@ = a buffer to read from
 286  *              @double *x_out@ = where to put the result
 287  *
 288  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 289  *
 290  *              If the system supports NaNs, then any encoded NaN is returned
 291  *              as the value of @NAN@ in @<math.h>@; otherwise, this function
 292  *              reports failure.
 293  *
 294  *              In general, values are rounded to the nearest available
 295  *              value, in the way that the system usually rounds.  If the
 296  *              system doesn't support infinities, then any encoded infinity
 297  *              is reported as the largest-possible-magnitude finite value
 298  *              instead.
 299  */
 300
 301 int buf_getf64(buf *b, double *x_out)
 302 {
 303   kludge64 k;
 304
 305   if (buf_getk64(b, &k)) return (-1);
 306   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 307   return (0);
 308 }
 309 int buf_getf64b(buf *b, double *x_out)
 310 {
 311   kludge64 k;
 312
 313   if (buf_getk64b(b, &k)) return (-1);
 314   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 315   return (0);
 316 }
 317 int buf_getf64l(buf *b, double *x_out)
 318 {
 319   kludge64 k;
 320
 321   if (buf_getk64l(b, &k)) return (-1);
 322   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 323   return (0);
 324 }
 325
 326 /*----- That's all, folks -------------------------------------------------*/