chiark - git - mdw - mLib/blob - struct/buf-float.c

   1 /* -*-c-*-
   2  *
   3  * Encoding and decoding floating-point values
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <float.h>
  31 #include <math.h>
  32
  33 #include "bits.h"
  34 #include "buf.h"
  35 #include "maths.h"
  36
  37 /*----- Formatting primitives ---------------------------------------------*/
  38
  39 /* We use the IEEE 754 `binary64' format.  Briefly:
  40  *
  41  *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
  42  *     %$s = -1$%..  The format is signed-magnitude, so everything else is
  43  *     the same for positive and negative numbers.
  44  *
  45  *   * The next eleven bits are the biased exponent %$e$%.
  46  *
  47  *   * The remaining 52 bits are the significand %$m$%.
  48  *
  49  * If %$0 < e < 2047$% then the encoding represents the normal number
  50  * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
  51  *
  52  * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
  53  * negative zero.
  54  *
  55  * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
  56  * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
  57  *
  58  * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
  59  * negative infinity.
  60  *
  61  * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
  62  * the most significant bit of %$m$% is set then this is a quiet NaN;
  63  * otherwise it's a signalling NaN.
  64  */
  65
  66 /* --- @f64_to_k64@ --- *
  67  *
  68  * Arguments:   @double x@ = a floating-point number
  69  *
  70  * Returns:     A 64-bit encoding of @x@.
  71  *
  72  * Use:         Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
  73  *              caveats.
  74  */
  75
  76 static kludge64 f64_to_k64(double x)
  77 {
  78   kludge64 k;
  79   uint32 lo, hi, t;
  80   int e; double m;
  81
  82   /* Some machinery before we start. */
  83
  84   if (NANP(x)) {
  85     /* A NaN. */
  86     hi = 0x7ff80000; lo = 0;
  87   } else if (INFP(x)) {
  88     /* Positive or negative infinity. */
  89     hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
  90   } else if (x == 0) {
  91     /* Positive or negative zero. */
  92     hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
  93   } else {
  94     /* A normal or subnormal number.  Now we have to do some actual work. */
  95
  96     /* Let's get the sign dealt with so we don't have to worry about it any
  97      * more.
  98      */
  99     if (!NEGP(x)) hi = 0;
 100     else { x = -x; hi = 0x80000000; }
 101
 102     /* Now we start on the value.  The first thing to do is to split off the
 103      * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
 104      */
 105     m = frexp(x, &e);
 106
 107     /* If our number is too big, we'll round it to infinity.  This will
 108      * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
 109      */
 110     if (e > 1024)
 111       { hi |= 0x7ff00000; lo = 0; }
 112     else {
 113       /* Our number is sufficiently small that we can represent it at least
 114        * approximately (though maybe we'll have to flush it to zero).  The
 115        * next step, then, is to pull the significand bits out.
 116        */
 117
 118       /* Determine the correct exponent to store.  We're not going to bias it
 119        * yet, but this is where we deal with subnormal numbers.  Our number
 120        * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
 121        * case, there's an implicit bit which we'll clear.  Otherwise, if it's
 122        * subnormal, we'll scale our floating-point number so that the
 123        * significand will look right when we extract it, and adjust the
 124        * exponent so that, when we're finally done, it will have the correct
 125        * sentinel value.
 126        */
 127       if (e > -1022) m -= 0.5;
 128       else { m = ldexp(m, 1021 + e); e = -1022; }
 129
 130       /* Now we pull out the 53 bits of the significand.   This will, in
 131        * general, leave a tail which we address through rounding.  Scale it
 132        * up so that we end up with %$0 \le m' < 2$%; then we round up if
 133        * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
 134        * set.
 135        */
 136       t = ldexp(m, 21); m -= ldexp(t, -21);
 137       lo = ldexp(m, 53); m -= ldexp(lo, -53);
 138       m = ldexp(m, 54);
 139
 140       /* Round the number if necessary. */
 141       if (lo&1 ? m >= 1.0 : m > 1)
 142         { lo = U32(lo + 1); if (!lo) t++; }
 143
 144       /* Now we just put the pieces together.  Note that our %$e$% is one
 145        * greater than it should be, because our implicit bit should have
 146        * been the unit bit not the 1/2 bit.
 147        */
 148       hi |= ((uint32)(e + 1022) << 20) | t;
 149     }
 150   }
 151
 152   /* Convert to external format and go home. */
 153   SET64(k, hi, lo); return (k);
 154 }
 155
 156 /* --- @k64_to_f64@ --- *
 157  *
 158  * Arguments:   @double *x_out@ = where to put the result
 159  *              @kludge64 k@ = a 64-bit encoding of a floating-point value
 160  *
 161  * Returns:     Zero on success, @-1@ on failure.
 162  *
 163  * Use:         Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
 164  *              caveats.
 165  */
 166
 167 static int k64_to_f64(double *x_out, kludge64 k)
 168 {
 169   uint32 lo, hi, t;
 170   int s, e; double x;
 171
 172   /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
 173
 174   /* Pick the encoded number apart. */
 175   hi = HI64(k); lo = LO64(k);
 176   s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
 177
 178   /* Deal with various special cases. */
 179   if (e == 2047) {
 180     /* Maximum exponent indicates (positive or negative) infinity or NaN. */
 181
 182     if (t || lo) {
 183       /* It's a NaN.  We're not going to be picky about which one.  If we
 184        * can't represent it then we'll just have to fail.
 185        */
 186
 187 #ifdef NAN
 188       x = NAN;
 189 #else
 190       return (-1);
 191 #endif
 192     } else {
 193       /* It's an infinity.  If we don't have one of those to hand, then pick
 194        * something really big.
 195        */
 196
 197 #ifdef INFINITY
 198     x = s ? -INFINITY : INFINITY;
 199 #else
 200     x = s ? -DBL_MAX : DBL_MAX;
 201 #endif
 202     }
 203   } else {
 204     /* It's a finite number, though maybe it's weird in some way. */
 205
 206     if (e == 0) {
 207       /* Minimum exponent indicates zero or a subnormal number.  The
 208        * subnormal exponent is a sentinel value that shouldn't be taken
 209        * literally, so we should fix that.  If the number is actually zero
 210        * then the exponent won't matter much so don't bother checking.
 211        */
 212
 213       e = 1;
 214     } else {
 215       /* It's a normal number.  In which case there's an implicit bit which
 216        * we can now set.
 217        */
 218
 219       t |= 0x00100000;
 220     }
 221
 222     /* All that remains is to stuff the significant and exponent into a
 223      * floating point number.  We'll have to do this in pieces, and we'll
 224      * lean on the floating-point machinery to do rounding correctly.
 225      */
 226     x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
 227     if (s) x = -x;
 228   }
 229
 230   /* And we're done. */
 231   *x_out = x; return (0);
 232 }
 233
 234 /*----- External functions ------------------------------------------------*/
 235
 236 /* --- @buf_putf64{,b,l} --- *
 237  *
 238  * Arguments:   @buf *b@ = a buffer to write to
 239  *              @double x@ = a number to write
 240  *
 241  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 242  *
 243  *              On C89, this function can't detect negative zero so these
 244  *              will be silently written as positive zero.
 245  *
 246  *              This function doesn't distinguish NaNs.  Any NaN is written
 247  *              as a quiet NaN with all payload bits zero.
 248  *
 249  *              A finite value with too large a magnitude to be represented
 250  *              is rounded to the appropriate infinity.  Other finite values
 251  *              are rounded as necessary, in the usual IEEE 754 round-to-
 252  *              nearest-or-even way.
 253  */
 254
 255 int buf_putf64(buf *b, double x)
 256   { return (buf_putk64(b, f64_to_k64(x))); }
 257 int buf_putf64b(buf *b, double x)
 258   { return (buf_putk64b(b, f64_to_k64(x))); }
 259 int buf_putf64l(buf *b, double x)
 260   { return (buf_putk64l(b, f64_to_k64(x))); }
 261
 262 /* --- @buf_getf64{,b,l} --- *
 263  *
 264  * Arguments:   @buf *b@ = a buffer to read from
 265  *              @double *x_out@ = where to put the result
 266  *
 267  * Returns:     Zero on success, @-1@ on failure (and the buffer is broken).
 268  *
 269  *              If the system supports NaNs, then any encoded NaN is returned
 270  *              as the value of @NAN@ in @<math.h>@; otherwise, this function
 271  *              reports failure.
 272  *
 273  *              In general, values are rounded to the nearest available
 274  *              value, in the way that the system usually rounds.  If the
 275  *              system doesn't support infinities, then any encoded infinity
 276  *              is reported as the largest-possible-magnitude finite value
 277  *              instead.
 278  */
 279
 280 int buf_getf64(buf *b, double *x_out)
 281 {
 282   kludge64 k;
 283
 284   if (buf_getk64(b, &k)) return (-1);
 285   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 286   return (0);
 287 }
 288 int buf_getf64b(buf *b, double *x_out)
 289 {
 290   kludge64 k;
 291
 292   if (buf_getk64b(b, &k)) return (-1);
 293   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 294   return (0);
 295 }
 296 int buf_getf64l(buf *b, double *x_out)
 297 {
 298   kludge64 k;
 299
 300   if (buf_getk64l(b, &k)) return (-1);
 301   if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
 302   return (0);
 303 }
 304
 305 /*----- That's all, folks -------------------------------------------------*/