chiark - git - mdw - catacomb/blob - math/scaf.c

   1 /* -*-c-*-
   2  *
   3  * Simple scalar fields
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <string.h>
  31
  32 #include "scaf.h"
  33
  34 /*----- Debugging utilties ------------------------------------------------*/
  35
  36 #ifdef SCAF_DEBUG
  37
  38 #include <stdio.h>
  39
  40 #include "mp.h"
  41 #include "mpint.h"
  42 #include "mptext.h"
  43
  44 static void scaf_dump(const char *what, const scaf_piece *x,
  45                       size_t npiece, size_t piecewd)
  46 {
  47   mp *y = MP_ZERO, *t = MP_NEW;
  48   size_t i;
  49   unsigned o = 0;
  50
  51   for (i = 0; i < npiece; i++) {
  52     t = mp_fromuint64(t, x[i]);
  53     t = mp_lsl(t, t, o);
  54     y = mp_add(y, y, t);
  55     o += piecewd;
  56   }
  57   printf(";; %s", what); MP_PRINT("", y); putchar('\n');
  58   mp_drop(y); mp_drop(t);
  59 }
  60
  61 static void scaf_dumpdbl(const char *what, const scaf_dblpiece *x,
  62                       size_t npiece, size_t piecewd)
  63 {
  64   mp *y = MP_ZERO, *t = MP_NEW;
  65   size_t i;
  66   unsigned o = 0;
  67
  68   for (i = 0; i < npiece; i++) {
  69     t = mp_fromuint64(t, x[i]);
  70     t = mp_lsl(t, t, o);
  71     y = mp_add(y, y, t);
  72     o += piecewd;
  73   }
  74   printf(";; %s", what); MP_PRINT("", y); putchar('\n');
  75   mp_drop(y); mp_drop(t);
  76 }
  77
  78 #endif
  79
  80 /*----- Main code ---------------------------------------------------------*/
  81
  82 /* --- @scaf_load@ --- *
  83  *
  84  * Arguments:   @scaf_piece *z@ = where to write the result
  85  *              @const octet *b@ = source buffer to read
  86  *              @size_t sz@ = size of the source buffer
  87  *              @size_t npiece@ = number of pieces to read
  88  *              @unsigned piecewd@ = nominal width of pieces in bits
  89  *
  90  * Returns:     ---
  91  *
  92  * Use:         Loads a little-endian encoded scalar into a vector @z@ of
  93  *              single-precision pieces.
  94  */
  95
  96 void scaf_load(scaf_piece *z, const octet *b, size_t sz,
  97                size_t npiece, unsigned piecewd)
  98 {
  99   uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
 100   unsigned i, j, n;
 101
 102   for (i = j = n = 0, a = 0; i < sz; i++) {
 103     a |= b[i] << n; n += 8;
 104     if (n >= piecewd) {
 105       z[j++] = a&m; a >>= piecewd; n -= piecewd;
 106       if (j >= npiece) return;
 107     }
 108   }
 109   z[j++] = a;
 110   while (j < npiece) z[j++] = 0;
 111 }
 112
 113 /* --- @scaf_loaddbl@ --- *
 114  *
 115  * Arguments:   @scaf_dblpiece *z@ = where to write the result
 116  *              @const octet *b@ = source buffer to read
 117  *              @size_t sz@ = size of the source buffer
 118  *              @size_t npiece@ = number of pieces to read
 119  *              @unsigned piecewd@ = nominal width of pieces in bits
 120  *
 121  * Returns:     ---
 122  *
 123  * Use:         Loads a little-endian encoded scalar into a vector @z@ of
 124  *              double-precision pieces.
 125  */
 126
 127 void scaf_loaddbl(scaf_dblpiece *z, const octet *b, size_t sz,
 128                   size_t npiece, unsigned piecewd)
 129 {
 130   uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
 131   unsigned i, j, n;
 132
 133   for (i = j = n = 0, a = 0; i < sz; i++) {
 134     a |= b[i] << n; n += 8;
 135     if (n >= piecewd) {
 136       z[j++] = a&m; a >>= piecewd; n -= piecewd;
 137       if (j >= npiece) return;
 138     }
 139   }
 140   z[j++] = a;
 141   while (j < npiece) z[j++] = 0;
 142 }
 143
 144 /* --- @scaf_store@ --- *
 145  *
 146  * Arguments:   @octet *b@ = buffer to fill in
 147  *              @size_t sz@ = size of the buffer
 148  *              @const scaf_piece *x@ = scalar to store
 149  *              @size_t npiece@ = number of pieces in @x@
 150  *              @unsigned piecewd@ = nominal width of pieces in bits
 151  *
 152  * Returns:     ---
 153  *
 154  * Use:         Stores a scalar in a vector of single-precison pieces as a
 155  *              little-endian vector of bytes.
 156  */
 157
 158 void scaf_store(octet *b, size_t sz, const scaf_piece *x,
 159                 size_t npiece, unsigned piecewd)
 160 {
 161   uint32 a;
 162   unsigned i, j, n;
 163
 164   for (i = j = n = 0, a = 0; i < npiece; i++) {
 165     a |= x[i] << n; n += piecewd;
 166     while (n >= 8) {
 167       b[j++] = a&0xffu; a >>= 8; n -= 8;
 168       if (j >= sz) return;
 169     }
 170   }
 171   b[j++] = a;
 172   memset(b + j, 0, sz - j);
 173 }
 174
 175 /* --- @scaf_mul@ --- *
 176  *
 177  * Arguments:   @scaf_dblpiece *z@ = where to put the answer
 178  *              @const scaf_piece *x, *y@ = the operands
 179  *              @size_t npiece@ = the length of the operands
 180  *
 181  * Returns:     ---
 182  *
 183  * Use:         Multiply two scalars.  The destination must have space for
 184  *              @2*npiece@ pieces (though the last one will always be zero).
 185  *              The result is not reduced.
 186  */
 187
 188 void scaf_mul(scaf_dblpiece *z, const scaf_piece *x, const scaf_piece *y,
 189               size_t npiece)
 190 {
 191   unsigned i, j;
 192
 193   for (i = 0; i < 2*npiece; i++) z[i] = 0;
 194
 195   for (i = 0; i < npiece; i++)
 196     for (j = 0; j < npiece; j++)
 197       z[i + j] += (scaf_dblpiece)x[i]*y[j];
 198 }
 199
 200 /* --- @scaf_reduce@ --- *
 201  *
 202  * Arguments:   @scaf_piece *z@ = where to write the result
 203  *              @const scaf_dblpiece *x@ = the operand to reduce
 204  *              @const scaf_piece *l@ = the modulus, in internal format
 205  *              @const scaf_piece *mu@ = scaled approximation to @1/l@
 206  *              @size_t npiece@ = number of pieces in @l@
 207  *              @unsigned piecewd@ = nominal width of a piece in bits
 208  *              @scaf_piece *scratch@ = @3*npiece@ scratch pieces
 209  *
 210  * Returns:     ---
 211  *
 212  * Use:         Reduce @x@ (a vector of @2*npiece@ double-precision pieces)
 213  *              modulo @l@ (a vector of @npiece@ single-precision pieces),
 214  *              writing the result to @z@.
 215  *
 216  *              Write @n = npiece@, @w = piecewd@, and %$B = 2^w$%.  The
 217  *              operand @mu@ must contain %$\lfloor B^{2n}/l \rfloor$%, in
 218  *              @npiece + 1@ pieces.  Furthermore, we must have
 219  *              %$3 l < B^n$%.  (Fiddle with %$w$% if necessary.)
 220  */
 221
 222 void scaf_reduce(scaf_piece *z, const scaf_dblpiece *x,
 223                  const scaf_piece *l, const scaf_piece *mu,
 224                  size_t npiece, unsigned piecewd, scaf_piece *scratch)
 225 {
 226   unsigned i, j;
 227   scaf_piece *t = scratch, *q = scratch + 2*npiece;
 228   scaf_piece u, m = ((scaf_piece)1 << piecewd) - 1;
 229   scaf_dblpiece c;
 230
 231   /* This here is the hard part.
 232    *
 233    * Let w = PIECEWD, let n = NPIECE, and let B = 2^w.  We must have
 234    * B^(n-1) <= l < B^n.
 235    *
 236    * The argument MU contains pieces of the quantity µ = floor(B^2n/l), which
 237    * is a scaled approximation to 1/l.  We'll calculate
 238    *
 239    *    q = floor(µ floor(x/B^(n-1))/B^(n+1))
 240    *
 241    * which is an underestimate of x/l.
 242    *
 243    * With a bit more precision: by definition, u - 1 < floor(u) <= u.  Hence,
 244    *
 245    *    B^2n/l - 1 < µ <= B^2/l
 246    *
 247    * and
 248    *
 249    *    x/B^(n-1) - 1 < floor(x/B^(n-1)) <= x/B^(n-1)
 250    *
 251    * Multiplying these together, and dividing through by B^(n+1), gives
 252    *
 253    *    floor(x/l - B^(n-1)/l - x/B^2n + 1/B^(n+1)) <=
 254    *            q <= µ floor(x/B^(n-1))/B^(n+1) <= floor(x/l)
 255    *
 256    * Now, noticing that x < B^2n and l > B^(n-1) shows that x/B^2n and
 257    * B^(n-1)/l are each less than 1; hence
 258    *
 259    *    floor(x/l) - 2 <= q <= floor(x/l) <= x/l
 260    *
 261    * Now we set r = x - q l.  Certainly, r == x (mod l); and
 262    *
 263    *    0 <= r < x - l floor(x/l) + 2 l < 3 l < B^n
 264    */
 265
 266   /* Before we start on the fancy stuff, we need to resolve the pending
 267    * carries in x.  We'll be doing the floor division by just ignoring some
 268    * of the pieces, and it would be bad if we missed some significant bits.
 269    * Of course, this means that we don't actually have to store the low
 270    * NPIECE - 1 pieces of the result.
 271    */
 272   for (i = 0, c = 0; i < 2*npiece; i++)
 273     { c += x[i]; t[i] = c&m; c >>= piecewd; }
 274
 275   /* Now we calculate q.  If we calculate this in product-scanning order, we
 276    * can avoid having to store the low NPIECE + 1 pieces of the product as
 277    * long as we keep track of the carry out properly.  Conveniently, NMU =
 278    * NPIECE + 1, which keeps the loop bounds easy in the first pass.
 279    *
 280    * Furthermore, because we know that r fits in NPIECE pieces, we only need
 281    * the low NPIECE pieces of q.
 282    */
 283   for (i = 0, c = 0; i < npiece + 1; i++) {
 284     for (j = 0; j <= i; j++)
 285       c += (scaf_dblpiece)t[j + npiece - 1]*mu[i - j];
 286     c >>= piecewd;
 287   }
 288   for (i = 0; i < npiece; i++) {
 289     for (j = i + 1; j < npiece + 1; j++)
 290       c += (scaf_dblpiece)t[j + npiece - 1]*mu[npiece + 1 + i - j];
 291     q[i] = c&m; c >>= piecewd;
 292   }
 293
 294   /* Next, we calculate r - q l in z.  Product-scanning seems to be working
 295    * out for us, and this time it will save us needing a large temporary
 296    * space for the product q l as we go.  On the downside, we have to track
 297    * the carries from the multiplication and subtraction separately.
 298    *
 299    * Notice that the result r is at most NPIECE pieces long, so we can stop
 300    * once we have that many.
 301    */
 302   u = 1; c = 0;
 303   for (i = 0; i < npiece; i++) {
 304     for (j = 0; j <= i; j++) c += (scaf_dblpiece)q[j]*l[i - j];
 305     u += t[i] + ((scaf_piece)(c&m) ^ m);
 306     z[i] = u&m; u >>= piecewd; c >>= piecewd;
 307   }
 308
 309   /* Finally, two passes of conditional subtraction.  Calculate t = z - l; if
 310    * there's no borrow out the top, then update z = t; otherwise leave t
 311    * alone.
 312    */
 313   for (i = 0; i < 2; i++) {
 314     for (j = 0, u = 1; j < npiece; j++) {
 315       u += z[j] + (l[j] ^ m);
 316       t[j] = u&m; u >>= piecewd;
 317     }
 318     for (j = 0, u = -u; j < npiece; j++) z[j] = (t[j]&u) | (z[j]&~u);
 319   }
 320 }
 321
 322 /*----- That's all, folks -------------------------------------------------*/