chiark - git - mdw - mLib/blob - struct/buf-float.c

   1 /* -*-c-*-
   2  *
   3  * Encoding and decoding floating-point values
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <float.h>
  31 #include <math.h>
  32
  33 #include "bits.h"
  34 #include "buf.h"
  35 #include "maths.h"
  36
  37 /*----- Formatting primitives ---------------------------------------------*/
  38
  39 /* We use the IEEE 754 `binary64' format.  Briefly:
  40  *
  41  *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
  42  *     %$s = -1$%..  The format is signed-magnitude, so everything else is
  43  *     the same for positive and negative numbers.
  44  *
  45  *   * The next eleven bits are the biased exponent %$e$%.
  46  *
  47  *   * The remaining 52 bits are the significand %$m$%.
  48  *
  49  * If %$0 < e < 2047$% then the encoding represents the normal number
  50  * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
  51  *
  52  * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
  53  * negative zero.
  54  *
  55  * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
  56  * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
  57  *
  58  * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
  59  * negative infinity.
  60  *
  61  * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
  62  * the most significant bit of %$m$% is set then this is a quiet NaN;
  63  * otherwise it's a signalling NaN.
  64  */
  65
  66 /* --- @k64_to_f64@ --- *
  67  *
  68  * Arguments:   @double *x_out@ = where to put the result
  69  *              @kludge64 k@ = a 64-bit encoding of a floating-point value
  70  *
  71  * Returns:     Zero on success, @-1@ on failure.
  72  *
  73  * Use:         Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
  74  *              caveats.
  75  */
  76
  77 static int k64_to_f64(double *x_out, kludge64 k)
  78 {
  79   uint32 lo, hi, t;
  80   int s, e; double x;
  81
  82   /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
  83
  84   /* Pick the encoded number apart. */
  85   hi = HI64(k); lo = LO64(k);
  86   s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
  87
  88   /* Deal with various special cases. */
  89   if (e == 2047) {
  90     /* Maximum exponent indicates (positive or negative) infinity or NaN. */
  91
  92     if (t || lo) {
  93       /* It's a NaN.  We're not going to be picky about which one.  If we
  94        * can't represent it then we'll just have to fail.
  95        */
  96
  97 #ifdef NAN
  98       x = NAN;
  99 #else
 100       return (-1);
 101 #endif
 102     } else {
 103       /* It's an infinity.  If we don't have one of those to hand, then pick
 104        * something really big.
 105        */
 106
 107 #ifdef INFINITY
 108     x = s ? -INFINITY : INFINITY;
 109 #else
 110     x = s ? -DBL_MAX : DBL_MAX;
 111 #endif
 112     }
 113   } else {
 114     /* It's a finite number, though maybe it's weird in some way. */
 115
 116     if (e == 0) {
 117       /* Minimum exponent indicates zero or a subnormal number.  The
 118        * subnormal exponent is a sentinel value that shouldn't be taken
 119        * literally, so we should fix that.  If the number is actually zero
 120        * then the exponent won't matter much so don't bother checking.
 121        */
 122
 123       e = 1;
 124     } else {
 125       /* It's a normal number.  In which case there's an implicit bit which
 126        * we can now set.
 127        */
 128
 129       t |= 0x00100000;
 130     }
 131
 132     /* All that remains is to stuff the significant and exponent into a
 133      * floating point number.  We'll have to do this in pieces, and we'll
 134      * lean on the floating-point machinery to do rounding correctly.
 135      */
 136     x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
 137     if (s) x = -x;
 138   }
 139
 140   /* And we're done. */
 141   *x_out = x; return (0);
 142 }
 143
 144 /* --- @f64_to_k64@ --- *
 145  *
 146  * Arguments:   @double x@ = a floating-point number
 147  *
 148  * Returns:     A 64-bit encoding of @x@.
 149  *
 150  * Use:         Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
 151  *              caveats.
 152  */
 153
 154 static kludge64 f64_to_k64(double x)
 155 {
 156   kludge64 k;
 157   uint32 lo, hi, t;
 158   int e; double m;
 159
 160   /* Some machinery before we start. */
 161
 162   if (NANP(x)) {
 163     /* A NaN. */
 164     hi = 0x7ff80000; lo = 0;
 165   } else if (INFP(x)) {
 166     /* Positive or negative infinity. */
 167     hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
 168   } else if (x == 0) {
 169     /* Positive or negative zero. */
 170     hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
 171   } else {
 172     /* A normal or subnormal number.  Now we have to do some actual work. */
 173
 174     /* Let's get the sign dealt with so we don't have to worry about it any
 175      * more.
 176      */
 177     if (!NEGP(x)) hi = 0;
 178     else { x = -x; hi = 0x80000000; }
 179
 180     /* Now we start on the value.  The first thing to do is to split off the
 181      * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
 182      */
 183     m = frexp(x, &e);
 184
 185     /* If our number is too big, we'll round it to infinity.  This will
 186      * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
 187      */
 188     if (e > 1024)
 189       { hi |= 0x7ff00000; lo = 0; }
 190     else {
 191       /* Our number is sufficiently small that we can represent it at least
 192        * approximately (though maybe we'll have to flush it to zero).  The
 193        * next step, then, is to pull the significand bits out.
 194        */
 195
 196       /* Determine the correct exponent to store.  We're not going to bias it
 197        * yet, but this is where we deal with subnormal numbers.  Our number
 198        * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
 199        * case, there's an implicit bit which we'll clear.  Otherwise, if it's
 200        * subnormal, we'll scale our floating-point number so that the
 201        * significand will look right when we extract it, and adjust the
 202        * exponent so that, when we're finally done, it will have the correct
 203        * sentinel value.
 204        */
 205       if (e > -1022) m -= 0.5;
 206       else { m = ldexp(m, 1021 + e); e = -1022; }
 207
 208       /* Now we pull out the 53 bits of the significand.   This will, in
 209        * general, leave a tail which we address through rounding.  Scale it
 210        * up so that we end up with %$0 \le m' < 2$%; then we round up if
 211        * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
 212        * set.
 213        */
 214       t = ldexp(m, 21); m -= ldexp(t, -21);
 215       lo = ldexp(m, 53); m -= ldexp(lo, -53);
 216       m = ldexp(m, 54);
 217
 218       /* Round the number if necessary. */
 219       if (lo&1 ? m >= 1.0 : m > 1)
 220         { lo = U32(lo + 1); if (!lo) t++; }
 221
 222       /* Now we just put the pieces together.  Note that our %$e$% is one
 223        * greater than it should be, because our implicit bit should have
 224        * been the unit bit not the 1/2 bit.
 225        */
 226       hi |= ((uint32)(e + 1022) << 20) | t;
 227     }
 228   }
 229
 230   /* Convert to external format and go home. */
 231   SET64(k, hi, lo); return (k);
 232 }
 233
 234 /*----- External functions ------------------------------------------------*/
 235
 236 /* --- @buf_getf64{,l,b} --- *
 237  *
 238  * Arguments:   @buf *b@ = a buffer to read from
 239  *              @double *x_out@ = where to put the result
 240  *
 241  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 242  *
 243  *              If the system supports NaNs, then any encoded NaN is returned
 244  *              as the value of @NAN@ in @<math.h>@; otherwise, this function
 245  *              reports failure.
 246  *
 247  *              In general, values are rounded to the nearest available
 248  *              value, in the way that the system usually rounds.  If the
 249  *              system doesn't support infinities, then any encoded infinity
 250  *              is reported as the largest-possible-magnitude finite value
 251  *              instead.
 252  */
 253
 254 int buf_getf64(buf *b, double *x_out)
 255 {
 256   kludge64 k;
 257
 258   if (buf_getk64(b, &k)) return (-1);
 259   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 260   return (0);
 261 }
 262
 263 int buf_getf64l(buf *b, double *x_out)
 264 {
 265   kludge64 k;
 266
 267   if (buf_getk64l(b, &k)) return (-1);
 268   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 269   return (0);
 270 }
 271
 272 int buf_getf64b(buf *b, double *x_out)
 273 {
 274   kludge64 k;
 275
 276   if (buf_getk64b(b, &k)) return (-1);
 277   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 278   return (0);
 279 }
 280
 281 int (dbuf_getf64)(dbuf *db, double *x_out)
 282   { return (dbuf_getf64(db, x_out)); }
 283 int (dbuf_getf64l)(dbuf *db, double *x_out)
 284   { return (dbuf_getf64l(db, x_out)); }
 285 int (dbuf_getf64b)(dbuf *db, double *x_out)
 286   { return (dbuf_getf64b(db, x_out)); }
 287
 288 /* --- @buf_putf64{,l,b} --- *
 289  *
 290  * Arguments:   @buf *b@ = a buffer to write to
 291  *              @double x@ = a number to write
 292  *
 293  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 294  *
 295  *              On C89, this function can't detect negative zero so these
 296  *              will be silently written as positive zero.
 297  *
 298  *              This function doesn't distinguish NaNs.  Any NaN is written
 299  *              as a quiet NaN with all payload bits zero.
 300  *
 301  *              A finite value with too large a magnitude to be represented
 302  *              is rounded to the appropriate infinity.  Other finite values
 303  *              are rounded as necessary, in the usual IEEE 754 round-to-
 304  *              nearest-or-even way.
 305  */
 306
 307 int buf_putf64(buf *b, double x)
 308   { return (buf_putk64(b, f64_to_k64(x))); }
 309 int buf_putf64l(buf *b, double x)
 310   { return (buf_putk64l(b, f64_to_k64(x))); }
 311 int buf_putf64b(buf *b, double x)
 312   { return (buf_putk64b(b, f64_to_k64(x))); }
 313
 314 int (dbuf_putf64)(dbuf *db, double x)
 315   { return (dbuf_putf64(db, x)); }
 316 int (dbuf_putf64l)(dbuf *db, double x)
 317   { return (dbuf_putf64l(db, x)); }
 318 int (dbuf_putf64b)(dbuf *db, double x)
 319   { return (dbuf_putf64b(db, x)); }
 320
 321 /*----- That's all, folks -------------------------------------------------*/