chiark - git - mdw - catacomb/blob - math/fgoldi.c

   1 /* -*-c-*-
   2  *
   3  * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include "fgoldi.h"
  33
  34 /*----- Basic setup -------------------------------------------------------*
  35  *
  36  * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
  37  * (hence the name).
  38  */
  39
  40 typedef fgoldi_piece piece;
  41
  42 #if FGOLDI_IMPL == 28
  43 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
  44  * x = SUM_{0<=i<16} x_i 2^(28i).
  45  */
  46
  47                        typedef  int64  dblpiece;
  48 typedef uint32 upiece; typedef uint64 udblpiece;
  49 #define PIECEWD(i) 28
  50 #define NPIECE 16
  51 #define P p28
  52
  53 #define B28 0x10000000u
  54 #define B27 0x08000000u
  55 #define M28 0x0fffffffu
  56 #define M27 0x07ffffffu
  57 #define M32 0xffffffffu
  58
  59 #elif FGOLDI_IMPL == 12
  60 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
  61  * SUM_{0<=i<40} x_i 2^ceil(224i/20).  Pieces i with i == 0 (mod 5) are 12
  62  * bits wide; the others are 11 bits wide, so they form eight groups of 56
  63  * bits.
  64  */
  65
  66                        typedef  int32  dblpiece;
  67 typedef uint16 upiece; typedef uint32 udblpiece;
  68 #define PIECEWD(i) ((i)%5 ? 11 : 12)
  69 #define NPIECE 40
  70 #define P p12
  71
  72 #define B12 0x1000u
  73 #define B11 0x0800u
  74 #define B10 0x0400u
  75 #define M12 0xfffu
  76 #define M11 0x7ffu
  77 #define M10 0x3ffu
  78 #define M8 0xffu
  79
  80 #endif
  81
  82 /*----- Debugging machinery -----------------------------------------------*/
  83
  84 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
  85
  86 #include <stdio.h>
  87
  88 #include "mp.h"
  89 #include "mptext.h"
  90
  91 static mp *get_pgoldi(void)
  92 {
  93   mp *p = MP_NEW, *t = MP_NEW;
  94
  95   p = mp_setbit(p, MP_ZERO, 448);
  96   t = mp_setbit(t, MP_ZERO, 224);
  97   p = mp_sub(p, p, t);
  98   p = mp_sub(p, p, MP_ONE);
  99   mp_drop(t);
 100   return (p);
 101 }
 102
 103 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
 104
 105 #endif
 106
 107 /*----- Loading and storing -----------------------------------------------*/
 108
 109 /* --- @fgoldi_load@ --- *
 110  *
 111  * Arguments:   @fgoldi *z@ = where to store the result
 112  *              @const octet xv[56]@ = source to read
 113  *
 114  * Returns:     ---
 115  *
 116  * Use:         Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
 117  *              external representation from @xv@ and stores it in @z@.
 118  *
 119  *              External representation is little-endian base-256.  Some
 120  *              elements have multiple encodings, which are not produced by
 121  *              correct software; use of noncanonical encodings is not an
 122  *              error, and toleration of them is considered a performance
 123  *              feature.
 124  */
 125
 126 void fgoldi_load(fgoldi *z, const octet xv[56])
 127 {
 128 #if FGOLDI_IMPL == 28
 129
 130   unsigned i;
 131   uint32 xw[14];
 132   piece b, c;
 133
 134   /* First, read the input value as words. */
 135   for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
 136
 137   /* Extract unsigned 28-bit pieces from the words. */
 138   z->P[ 0] = (xw[ 0] >> 0)&M28;
 139   z->P[ 7] = (xw[ 6] >> 4)&M28;
 140   z->P[ 8] = (xw[ 7] >> 0)&M28;
 141   z->P[15] = (xw[13] >> 4)&M28;
 142   for (i = 1; i < 7; i++) {
 143     z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
 144     z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
 145   }
 146
 147   /* Convert the nonnegative pieces into a balanced signed representation, so
 148    * each piece ends up in the interval |z_i| <= 2^27.  For each piece, if
 149    * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
 150    * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
 151    * φ^2 = φ + 1.  We delay this carry until after all of the pieces have
 152    * been balanced.  If we don't do this, then we have to do a more expensive
 153    * test for nonzeroness to decide whether to lend a bit leftwards rather
 154    * than just testing a single bit.
 155    *
 156    * Note that we don't try for a canonical representation here: both upper
 157    * and lower bounds are achievable.
 158    */
 159   b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
 160   for (i = NPIECE - 1; i--; )
 161     { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
 162   z->P[0] += c; z->P[8] += c;
 163
 164 #elif FGOLDI_IMPL == 12
 165
 166   unsigned i, j, n, w, b;
 167   uint32 a;
 168   int c;
 169
 170   /* First, convert the bytes into nonnegative pieces. */
 171   for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
 172     a |= (uint32)xv[i] << n; n += 8;
 173     if (n >= w) {
 174       z->P[j++] = a&MASK(w);
 175       a >>= w; n -= w; w = PIECEWD(j);
 176     }
 177   }
 178
 179   /* Convert the nonnegative pieces into a balanced signed representation, so
 180    * each piece ends up in the interval |z_i| <= 2^11 + 1.
 181    */
 182   b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
 183   for (i = NPIECE - 1; i--; ) {
 184     w = PIECEWD(i) - 1;
 185     b = z->P[i]&BIT(w);
 186     z->P[i] -= b << 1;
 187     z->P[i + 1] += b >> w;
 188   }
 189   z->P[0] += c; z->P[20] += c;
 190
 191 #endif
 192 }
 193
 194 /* --- @fgoldi_store@ --- *
 195  *
 196  * Arguments:   @octet zv[56]@ = where to write the result
 197  *              @const fgoldi *x@ = the field element to write
 198  *
 199  * Returns:     ---
 200  *
 201  * Use:         Stores a field element in the given octet vector in external
 202  *              representation.  A canonical encoding is always stored.
 203  */
 204
 205 void fgoldi_store(octet zv[56], const fgoldi *x)
 206 {
 207 #if FGOLDI_IMPL == 28
 208
 209   piece y[NPIECE], yy[NPIECE], c, d;
 210   uint32 u, v;
 211   mask32 m;
 212   unsigned i;
 213
 214   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 215
 216   /* First, propagate the carries.  By the end of this, we'll have all of the
 217    * the pieces canonically sized and positive, and maybe there'll be
 218    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 219    * value will be in the half-open interval [0, φ^2).  The whole represented
 220    * value is then y + φ^2 c.
 221    *
 222    * Assume that we start out with |y_i| <= 2^30.  We start off by cutting
 223    * off and reducing the carry c_15 from the topmost piece, y_15.  This
 224    * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4.  We'll add this
 225    * onto y_0 and y_8, and propagate the carries.  It's very clear that we'll
 226    * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
 227    *
 228    * Here, the y_i are signed, so we must be cautious about bithacking them.
 229    */
 230   c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
 231   for (i = 0; i < NPIECE; i++)
 232     { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
 233
 234   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 235    * y >= p, then we should subtract p from the whole value; if c = -1 then
 236    * we should add p; and otherwise we should do nothing.
 237    *
 238    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 239    *
 240    * The first job is to sort out what we wanted to do.  If c = -1 then we
 241    * want to (a) invert the constant addend and (b) feed in a carry-in;
 242    * otherwise, we don't.
 243    */
 244   m = SIGN(c)&M28;
 245   d = m&1;
 246
 247   /* Now do the addition/subtraction.  Remember that all of the y_i are
 248    * nonnegative, so shifting and masking are safe and easy.
 249    */
 250       d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
 251   for (i = 1; i < 8; i++)
 252     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 253       d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
 254   for (i = 9; i < 16; i++)
 255     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 256
 257   /* The final carry-out is in d; since we only did addition, and the y_i are
 258    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 259    * if (a) c /= 0 (in which case we know that the old value was
 260    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 261    * the subtraction didn't cause a borrow, so we must be in the case where
 262    * p <= y < φ^2.
 263    */
 264   m = NONZEROP(c) | ~NONZEROP(d - 1);
 265   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 266
 267   /* Extract 32-bit words from the value. */
 268   for (i = 0; i < 7; i++) {
 269     u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
 270     v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
 271     STORE32_L(zv + 4*i,      u);
 272     STORE32_L(zv + 4*i + 28, v);
 273   }
 274
 275 #elif FGOLDI_IMPL == 12
 276
 277   piece y[NPIECE], yy[NPIECE], c, d;
 278   uint32 a;
 279   mask32 m, mm;
 280   unsigned i, j, n, w;
 281
 282   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 283
 284   /* First, propagate the carries.  By the end of this, we'll have all of the
 285    * the pieces canonically sized and positive, and maybe there'll be
 286    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 287    * value will be in the half-open interval [0, φ^2).  The whole represented
 288    * value is then y + φ^2 c.
 289    *
 290    * Assume that we start out with |y_i| <= 2^14.  We start off by cutting
 291    * off and reducing the carry c_39 from the topmost piece, y_39.  This
 292    * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16.  We'll add this
 293    * onto y_0 and y_20, and propagate the carries.  It's very clear that
 294    * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
 295    *
 296    * Here, the y_i are signed, so we must be cautious about bithacking them.
 297    */
 298   c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
 299   for (i = 0; i < NPIECE; i++) {
 300     w = PIECEWD(i); m = (1 << w) - 1;
 301     y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
 302   }
 303
 304   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 305    * y >= p, then we should subtract p from the whole value; if c = -1 then
 306    * we should add p; and otherwise we should do nothing.
 307    *
 308    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 309    *
 310    * The first job is to sort out what we wanted to do.  If c = -1 then we
 311    * want to (a) invert the constant addend and (b) feed in a carry-in;
 312    * otherwise, we don't.
 313    */
 314   mm = SIGN(c);
 315   d = m&1;
 316
 317   /* Now do the addition/subtraction.  Remember that all of the y_i are
 318    * nonnegative, so shifting and masking are safe and easy.
 319    */
 320     d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
 321   for (i = 1; i < 20; i++) {
 322     w = PIECEWD(i); m = MASK(w);
 323     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 324   }
 325     d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
 326   for (i = 21; i < 40; i++) {
 327     w = PIECEWD(i); m = MASK(w);
 328     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 329   }
 330
 331   /* The final carry-out is in d; since we only did addition, and the y_i are
 332    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 333    * if (a) c /= 0 (in which case we know that the old value was
 334    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 335    * the subtraction didn't cause a borrow, so we must be in the case where
 336    * p <= y < φ^2.
 337    */
 338   m = NONZEROP(c) | ~NONZEROP(d - 1);
 339   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 340
 341   /* Convert that back into octets. */
 342   for (i = j = a = n = 0; i < NPIECE; i++) {
 343     a |= (uint32)y[i] << n; n += PIECEWD(i);
 344     while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
 345   }
 346
 347 #endif
 348 }
 349
 350 /* --- @fgoldi_set@ --- *
 351  *
 352  * Arguments:   @fgoldi *z@ = where to write the result
 353  *              @int a@ = a small-ish constant
 354  *
 355  * Returns:     ---
 356  *
 357  * Use:         Sets @z@ to equal @a@.
 358  */
 359
 360 void fgoldi_set(fgoldi *x, int a)
 361 {
 362   unsigned i;
 363
 364   x->P[0] = a;
 365   for (i = 1; i < NPIECE; i++) x->P[i] = 0;
 366 }
 367
 368 /*----- Basic arithmetic --------------------------------------------------*/
 369
 370 /* --- @fgoldi_add@ --- *
 371  *
 372  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 373  *              @const fgoldi *x, *y@ = two operands
 374  *
 375  * Returns:     ---
 376  *
 377  * Use:         Set @z@ to the sum %$x + y$%.
 378  */
 379
 380 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
 381 {
 382   unsigned i;
 383   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
 384 }
 385
 386 /* --- @fgoldi_sub@ --- *
 387  *
 388  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 389  *              @const fgoldi *x, *y@ = two operands
 390  *
 391  * Returns:     ---
 392  *
 393  * Use:         Set @z@ to the difference %$x - y$%.
 394  */
 395
 396 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
 397 {
 398   unsigned i;
 399   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
 400 }
 401
 402 /*----- Constant-time utilities -------------------------------------------*/
 403
 404 /* --- @fgoldi_condswap@ --- *
 405  *
 406  * Arguments:   @fgoldi *x, *y@ = two operands
 407  *              @uint32 m@ = a mask
 408  *
 409  * Returns:     ---
 410  *
 411  * Use:         If @m@ is zero, do nothing; if @m@ is all-bits-set, then
 412  *              exchange @x@ and @y@.  If @m@ has some other value, then
 413  *              scramble @x@ and @y@ in an unhelpful way.
 414  */
 415
 416 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
 417 {
 418   unsigned i;
 419   mask32 mm = FIX_MASK32(m);
 420
 421   for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
 422 }
 423
 424 /*----- Multiplication ----------------------------------------------------*/
 425
 426 #if FGOLDI_IMPL == 28
 427
 428 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
 429  * represented in a double-precision piece.  On entry, it must be the case
 430  * that |X_i| <= M <= B - 2^27 for some M.  If this is the case, then, on
 431  * exit, we will have |Z_i| <= 2^27 + M/2^27.
 432  */
 433 #define CARRY_REDUCE(z, x) do {                                         \
 434   dblpiece _t[NPIECE], _c;                                              \
 435   unsigned _i;                                                          \
 436                                                                         \
 437   /* Bias the input pieces.  This keeps the carries and so on centred   \
 438    * around zero rather than biased positive.                           \
 439    */                                                                   \
 440   for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27;               \
 441                                                                         \
 442   /* Calculate the reduced pieces.  Careful with the bithacking. */     \
 443   _c = ASR(dblpiece, _t[15], 28);                                       \
 444   (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c;                 \
 445   for (_i = 1; _i < NPIECE; _i++) {                                     \
 446     (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 +                 \
 447       ASR(dblpiece, _t[_i - 1], 28);                                    \
 448   }                                                                     \
 449   (z)[8] += _c;                                                         \
 450 } while (0)
 451
 452 #elif FGOLDI_IMPL == 12
 453
 454 static void carry_reduce(dblpiece x[NPIECE])
 455 {
 456   /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
 457
 458   unsigned i, j;
 459   dblpiece c;
 460
 461   /* The result is nearly canonical, because we do sequential carry
 462    * propagation, because smaller processors are more likely to prefer the
 463    * smaller working set than the instruction-level parallelism.
 464    *
 465    * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
 466    * Truncate x_38 to 10 bits, and add the carry onto x_39.  Truncate x_39 to
 467    * 10 bits, and add the carry onto x_0 and x_20.  And so on.
 468    *
 469    * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
 470    * The carry into x_37 is at most 2^21; so the carry out into x_38 has
 471    * magnitude at most 2^10.  In turn, |x_38| <= 2^10 before the carry, so is
 472    * now no more than 2^11 in magnitude, and the carry out into x_39 is at
 473    * most 1.  This leaves |x_39| <= 2^10 + 1 after carry propagation.
 474    *
 475    * Be careful with the bit hacking because the quantities involved are
 476    * signed.
 477    */
 478
 479   /* For each piece, we bias it so that floor division (as done by an
 480    * arithmetic right shift) and modulus (as done by bitwise-AND) does the
 481    * right thing.
 482    */
 483 #define CARRY(i, wd, b, m) do {                                         \
 484   x[i] += (b);                                                          \
 485   c = ASR(dblpiece, x[i], (wd));                                        \
 486   x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b);                         \
 487 } while (0)
 488
 489                              {             CARRY(37, 11, B10, M11);      }
 490                              { x[38] += c; CARRY(38, 11, B10, M11);      }
 491                              { x[39] += c; CARRY(39, 11, B10, M11);      }
 492                                x[20] += c;
 493   for (i = 0; i < 35; ) {
 494                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 495     for (j = i + 4; i < j; ) {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 496   }
 497                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 498   while (i < 39)             {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 499   x[39] += c;
 500 }
 501
 502 #endif
 503
 504 /* --- @fgoldi_mulconst@ --- *
 505  *
 506  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 507  *              @const fgoldi *x@ = an operand
 508  *              @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
 509  *
 510  * Returns:     ---
 511  *
 512  * Use:         Set @z@ to the product %$a x$%.
 513  */
 514
 515 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
 516 {
 517   unsigned i;
 518   dblpiece zz[NPIECE], aa = a;
 519
 520   for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
 521 #if FGOLDI_IMPL == 28
 522   CARRY_REDUCE(z->P, zz);
 523 #elif FGOLDI_IMPL == 12
 524   carry_reduce(zz);
 525   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 526 #endif
 527 }
 528
 529 /* --- @fgoldi_mul@ --- *
 530  *
 531  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 532  *              @const fgoldi *x, *y@ = two operands
 533  *
 534  * Returns:     ---
 535  *
 536  * Use:         Set @z@ to the product %$x y$%.
 537  */
 538
 539 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
 540 {
 541   dblpiece zz[NPIECE], u[NPIECE];
 542   piece ab[NPIECE/2], cd[NPIECE/2];
 543   const piece
 544     *a = x->P + NPIECE/2, *b = x->P,
 545     *c = y->P + NPIECE/2, *d = y->P;
 546   unsigned i, j;
 547
 548 #if FGOLDI_IMPL == 28
 549
 550 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 551
 552 #elif FGOLDI_IMPL == 12
 553
 554   static const unsigned short off[39] = {
 555       0,  12,  23,  34,  45,  56,  68,  79,  90, 101,
 556     112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
 557     224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
 558     336, 348, 359, 370, 381, 392, 404, 415, 426
 559   };
 560
 561 #define M(x,i, y,j)                                                     \
 562   (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
 563
 564 #endif
 565
 566   /* Behold the magic.
 567    *
 568    * Write x = a φ + b, and y = c φ + d.  Then x y = a c φ^2 +
 569    * (a d + b c) φ + b d.  Karatsuba and Ofman observed that a d + b c =
 570    * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
 571    * the prime p so that φ^2 = φ + 1.  So
 572    *
 573    *    x y = ((a + b) (c + d) - b d) φ + a c + b d
 574    */
 575
 576   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 577
 578   /* Our first job will be to calculate (1 - φ) b d, and write the result
 579    * into z.  As we do this, an interesting thing will happen.  Write
 580    * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
 581    * So, what we do is to write the product end-swapped and negated, and then
 582    * we'll subtract the (negated, remember) high half from the low half.
 583    */
 584   for (i = 0; i < NPIECE/2; i++) {
 585     for (j = 0; j < NPIECE/2 - i; j++)
 586       zz[i + j + NPIECE/2] -= M(b,i, d,j);
 587     for (; j < NPIECE/2; j++)
 588       zz[i + j - NPIECE/2] -= M(b,i, d,j);
 589   }
 590   for (i = 0; i < NPIECE/2; i++)
 591     zz[i] -= zz[i + NPIECE/2];
 592
 593   /* Next, we add on a c.  There are no surprises here. */
 594   for (i = 0; i < NPIECE/2; i++)
 595     for (j = 0; j < NPIECE/2; j++)
 596       zz[i + j] += M(a,i, c,j);
 597
 598   /* Now, calculate a + b and c + d. */
 599   for (i = 0; i < NPIECE/2; i++)
 600     { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
 601
 602   /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
 603    * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
 604    * v φ + (1 + φ) u.  We'll store u in a temporary place and add it on
 605    * twice.
 606    */
 607   for (i = 0; i < NPIECE; i++) u[i] = 0;
 608   for (i = 0; i < NPIECE/2; i++) {
 609     for (j = 0; j < NPIECE/2 - i; j++)
 610       zz[i + j + NPIECE/2] += M(ab,i, cd,j);
 611     for (; j < NPIECE/2; j++)
 612       u[i + j - NPIECE/2] += M(ab,i, cd,j);
 613   }
 614   for (i = 0; i < NPIECE/2; i++)
 615     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 616
 617 #undef M
 618
 619 #if FGOLDI_IMPL == 28
 620   /* That wraps it up for the multiplication.  Let's figure out some bounds.
 621    * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
 622    * end up the way they'd be if we'd done the thing the easy way, which
 623    * simplifies the analysis.  Suppose we started with |x_i|, |y_i| <= 9/5
 624    * 2^28.  The overheads in the result are given by the coefficients of
 625    *
 626    *    ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
 627    *
 628    * the greatest of which is 38.  So |z_i| <= 38*81/25*2^56 < 2^63.
 629    *
 630    * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
 631    * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
 632    */
 633   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 634   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 635 #elif FGOLDI_IMPL == 12
 636   carry_reduce(zz);
 637   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 638 #endif
 639 }
 640
 641 /* --- @fgoldi_sqr@ --- *
 642  *
 643  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 644  *              @const fgoldi *x@ = an operand
 645  *
 646  * Returns:     ---
 647  *
 648  * Use:         Set @z@ to the square %$x^2$%.
 649  */
 650
 651 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
 652 {
 653 #if FGOLDI_IMPL == 28
 654
 655   dblpiece zz[NPIECE], u[NPIECE];
 656   piece ab[NPIECE];
 657   const piece *a = x->P + NPIECE/2, *b = x->P;
 658   unsigned i, j;
 659
 660 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 661
 662   /* The magic is basically the same as `fgoldi_mul' above.  We write
 663    * x = a φ + b and use Karatsuba and the special prime shape.  This time,
 664    * we have
 665    *
 666    *    x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
 667    */
 668
 669   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 670
 671   /* Our first job will be to calculate (1 - φ) b^2, and write the result
 672    * into z.  Again, this interacts pleasantly with the prime shape.
 673    */
 674   for (i = 0; i < NPIECE/4; i++) {
 675     zz[2*i + NPIECE/2] -= M(b,i, b,i);
 676     for (j = i + 1; j < NPIECE/2 - i; j++)
 677       zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
 678     for (; j < NPIECE/2; j++)
 679       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 680   }
 681   for (; i < NPIECE/2; i++) {
 682     zz[2*i - NPIECE/2] -= M(b,i, b,i);
 683     for (j = i + 1; j < NPIECE/2; j++)
 684       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 685   }
 686   for (i = 0; i < NPIECE/2; i++)
 687     zz[i] -= zz[i + NPIECE/2];
 688
 689   /* Next, we add on a^2.  There are no surprises here. */
 690   for (i = 0; i < NPIECE/2; i++) {
 691     zz[2*i] += M(a,i, a,i);
 692     for (j = i + 1; j < NPIECE/2; j++)
 693       zz[i + j] += 2*M(a,i, a,j);
 694   }
 695
 696   /* Now, calculate a + b. */
 697   for (i = 0; i < NPIECE/2; i++)
 698     ab[i] = a[i] + b[i];
 699
 700   /* Finally (for the multiplication) we must add on (a + b)^2 φ.
 701    * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u.  We'll
 702    * store u in a temporary place and add it on twice.
 703    */
 704   for (i = 0; i < NPIECE; i++) u[i] = 0;
 705   for (i = 0; i < NPIECE/4; i++) {
 706     zz[2*i + NPIECE/2] += M(ab,i, ab,i);
 707     for (j = i + 1; j < NPIECE/2 - i; j++)
 708       zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
 709     for (; j < NPIECE/2; j++)
 710       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 711   }
 712   for (; i < NPIECE/2; i++) {
 713     u[2*i - NPIECE/2] += M(ab,i, ab,i);
 714     for (j = i + 1; j < NPIECE/2; j++)
 715       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 716   }
 717   for (i = 0; i < NPIECE/2; i++)
 718     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 719
 720 #undef M
 721
 722   /* Finally, carrying. */
 723   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 724   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 725
 726 #elif FGOLDI_IMPL == 12
 727   fgoldi_mul(z, x, x);
 728 #endif
 729 }
 730
 731 /*----- More advanced operations ------------------------------------------*/
 732
 733 /* --- @fgoldi_inv@ --- *
 734  *
 735  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 736  *              @const fgoldi *x@ = an operand
 737  *
 738  * Returns:     ---
 739  *
 740  * Use:         Stores in @z@ the multiplicative inverse %$x^{-1}$%.  If
 741  *              %$x = 0$% then @z@ is set to zero.  This is considered a
 742  *              feature.
 743  */
 744
 745 void fgoldi_inv(fgoldi *z, const fgoldi *x)
 746 {
 747   fgoldi t, u;
 748   unsigned i;
 749
 750 #define SQRN(z, x, n) do {                                              \
 751   fgoldi_sqr((z), (x));                                                 \
 752   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 753 } while (0)
 754
 755   /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
 756    * x = 0 as intended.  The addition chain is home-made.
 757    */                                   /* step | value */
 758   fgoldi_sqr(&u, x);                    /*    1 | 2 */
 759   fgoldi_mul(&t, &u, x);                /*    2 | 3 */
 760   SQRN(&u, &t, 2);                      /*    4 | 12 */
 761   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 762   SQRN(&u, &t, 4);                      /*    9 | 240 */
 763   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 764   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 765   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 766   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 767   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 768   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 769   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 770   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 771   fgoldi_mul(&t, &t, x);                /*   43 | 2^37 - 1 */
 772   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 773   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 774   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 775   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 776   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 777   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 778   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 779   fgoldi_mul(&u, &u, x);                /*  233 | 2^223 - 1 */
 780   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 781   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 782   SQRN(&t, &t, 2);                      /*  459 | 2^448 - 2^224 - 4 */
 783   fgoldi_mul(z, &t, x);                 /*  460 | 2^448 - 2^224 - 3 */
 784
 785 #undef SQRN
 786 }
 787
 788 /*----- Test rig ----------------------------------------------------------*/
 789
 790 #ifdef TEST_RIG
 791
 792 #include <mLib/report.h>
 793 #include <mLib/str.h>
 794 #include <mLib/testrig.h>
 795
 796 static void fixdstr(dstr *d)
 797 {
 798   if (d->len > 56)
 799     die(1, "invalid length for fgoldi");
 800   else if (d->len < 56) {
 801     dstr_ensure(d, 56);
 802     memset(d->buf + d->len, 0, 56 - d->len);
 803     d->len = 56;
 804   }
 805 }
 806
 807 static void cvt_fgoldi(const char *buf, dstr *d)
 808 {
 809   dstr dd = DSTR_INIT;
 810
 811   type_hex.cvt(buf, &dd); fixdstr(&dd);
 812   dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
 813   fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
 814   dstr_destroy(&dd);
 815 }
 816
 817 static void dump_fgoldi(dstr *d, FILE *fp)
 818   { fdump(stderr, "???", (const piece *)d->buf); }
 819
 820 static void cvt_fgoldi_ref(const char *buf, dstr *d)
 821   { type_hex.cvt(buf, d); fixdstr(d); }
 822
 823 static void dump_fgoldi_ref(dstr *d, FILE *fp)
 824 {
 825   fgoldi x;
 826
 827   fgoldi_load(&x, (const octet *)d->buf);
 828   fdump(stderr, "???", x.P);
 829 }
 830
 831 static int eq(const fgoldi *x, dstr *d)
 832   { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
 833
 834 static const test_type
 835   type_fgoldi = { cvt_fgoldi, dump_fgoldi },
 836   type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
 837
 838 #define TEST_UNOP(op)                                                   \
 839   static int vrf_##op(dstr dv[])                                        \
 840   {                                                                     \
 841     fgoldi *x = (fgoldi *)dv[0].buf;                                    \
 842     fgoldi z, zz;                                                       \
 843     int ok = 1;                                                         \
 844                                                                         \
 845     fgoldi_##op(&z, x);                                                 \
 846     if (!eq(&z, &dv[1])) {                                              \
 847       ok = 0;                                                           \
 848       fprintf(stderr, "failed!\n");                                     \
 849       fdump(stderr, "x", x->P);                                         \
 850       fdump(stderr, "calc", z.P);                                       \
 851       fgoldi_load(&zz, (const octet *)dv[1].buf);                       \
 852       fdump(stderr, "z", zz.P);                                         \
 853     }                                                                   \
 854                                                                         \
 855     return (ok);                                                        \
 856   }
 857
 858 TEST_UNOP(sqr)
 859 TEST_UNOP(inv)
 860
 861 #define TEST_BINOP(op)                                                  \
 862   static int vrf_##op(dstr dv[])                                        \
 863   {                                                                     \
 864     fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;          \
 865     fgoldi z, zz;                                                       \
 866     int ok = 1;                                                         \
 867                                                                         \
 868     fgoldi_##op(&z, x, y);                                              \
 869     if (!eq(&z, &dv[2])) {                                              \
 870       ok = 0;                                                           \
 871       fprintf(stderr, "failed!\n");                                     \
 872       fdump(stderr, "x", x->P);                                         \
 873       fdump(stderr, "y", y->P);                                         \
 874       fdump(stderr, "calc", z.P);                                       \
 875       fgoldi_load(&zz, (const octet *)dv[2].buf);                       \
 876       fdump(stderr, "z", zz.P);                                         \
 877     }                                                                   \
 878                                                                         \
 879     return (ok);                                                        \
 880   }
 881
 882 TEST_BINOP(add)
 883 TEST_BINOP(sub)
 884 TEST_BINOP(mul)
 885
 886 static int vrf_mulc(dstr dv[])
 887 {
 888   fgoldi *x = (fgoldi *)dv[0].buf;
 889   long a = *(const long *)dv[1].buf;
 890   fgoldi z, zz;
 891   int ok = 1;
 892
 893   fgoldi_mulconst(&z, x, a);
 894   if (!eq(&z, &dv[2])) {
 895     ok = 0;
 896     fprintf(stderr, "failed!\n");
 897     fdump(stderr, "x", x->P);
 898     fprintf(stderr, "a = %ld\n", a);
 899     fdump(stderr, "calc", z.P);
 900     fgoldi_load(&zz, (const octet *)dv[2].buf);
 901     fdump(stderr, "z", zz.P);
 902   }
 903
 904   return (ok);
 905 }
 906
 907 static int vrf_condswap(dstr dv[])
 908 {
 909   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
 910   fgoldi xx = *x, yy = *y;
 911   uint32 m = *(uint32 *)dv[2].buf;
 912   int ok = 1;
 913
 914   fgoldi_condswap(&xx, &yy, m);
 915   if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
 916     ok = 0;
 917     fprintf(stderr, "failed!\n");
 918     fdump(stderr, "x", x->P);
 919     fdump(stderr, "y", y->P);
 920     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
 921     fdump(stderr, "calc xx", xx.P);
 922     fdump(stderr, "calc yy", yy.P);
 923     fgoldi_load(&xx, (const octet *)dv[3].buf);
 924     fgoldi_load(&yy, (const octet *)dv[4].buf);
 925     fdump(stderr, "want xx", xx.P);
 926     fdump(stderr, "want yy", yy.P);
 927   }
 928
 929   return (ok);
 930 }
 931
 932 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
 933 {
 934   fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
 935     *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
 936     *y = (fgoldi *)dv[5].buf;
 937   long a = *(const long *)dv[2].buf;
 938   fgoldi umv, aumv, wpaumv, xmy, z, zz;
 939   int ok = 1;
 940
 941   fgoldi_sub(&umv, u, v);
 942   fgoldi_mulconst(&aumv, &umv, a);
 943   fgoldi_add(&wpaumv, w, &aumv);
 944   fgoldi_sub(&xmy, x, y);
 945   fgoldi_mul(&z, &wpaumv, &xmy);
 946
 947   if (!eq(&z, &dv[6])) {
 948     ok = 0;
 949     fprintf(stderr, "failed!\n");
 950     fdump(stderr, "u", u->P);
 951     fdump(stderr, "v", v->P);
 952     fdump(stderr, "u - v", umv.P);
 953     fprintf(stderr, "a = %ld\n", a);
 954     fdump(stderr, "a (u - v)", aumv.P);
 955     fdump(stderr, "w + a (u - v)", wpaumv.P);
 956     fdump(stderr, "x", x->P);
 957     fdump(stderr, "y", y->P);
 958     fdump(stderr, "x - y", xmy.P);
 959     fdump(stderr, "(x - y) (w + a (u - v))", z.P);
 960     fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
 961   }
 962
 963   return (ok);
 964 }
 965
 966 static test_chunk tests[] = {
 967   { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 968   { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 969   { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 970   { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
 971   { "condswap", vrf_condswap,
 972     { &type_fgoldi, &type_fgoldi, &type_uint32,
 973       &type_fgoldi_ref, &type_fgoldi_ref } },
 974   { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
 975   { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
 976   { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
 977     { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
 978       &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 979   { 0, 0, { 0 } }
 980 };
 981
 982 int main(int argc, char *argv[])
 983 {
 984   test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
 985   return (0);
 986 }
 987
 988 #endif
 989
 990 /*----- That's all, folks -------------------------------------------------*/