chiark - git - mdw - catacomb/blob - math/fgoldi.c

   1 /* -*-c-*-
   2  *
   3  * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include "ct.h"
  33 #include "fgoldi.h"
  34
  35 /*----- Basic setup -------------------------------------------------------*
  36  *
  37  * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
  38  * (hence the name).
  39  */
  40
  41 typedef fgoldi_piece piece;
  42
  43 #if FGOLDI_IMPL == 28
  44 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
  45  * x = SUM_{0<=i<16} x_i 2^(28i).
  46  */
  47
  48                        typedef  int64  dblpiece;
  49 typedef uint32 upiece; typedef uint64 udblpiece;
  50 #define PIECEWD(i) 28
  51 #define NPIECE 16
  52 #define P p28
  53
  54 #define B27 0x08000000u
  55 #define M28 0x0fffffffu
  56 #define M32 0xffffffffu
  57
  58 #elif FGOLDI_IMPL == 12
  59 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
  60  * SUM_{0<=i<40} x_i 2^ceil(224i/20).  Pieces i with i == 0 (mod 5) are 12
  61  * bits wide; the others are 11 bits wide, so they form eight groups of 56
  62  * bits.
  63  */
  64
  65                        typedef  int32  dblpiece;
  66 typedef uint16 upiece; typedef uint32 udblpiece;
  67 #define PIECEWD(i) ((i)%5 ? 11 : 12)
  68 #define NPIECE 40
  69 #define P p12
  70
  71 #define B11 0x0800u
  72 #define B10 0x0400u
  73 #define M12 0xfffu
  74 #define M11 0x7ffu
  75 #define M8 0xffu
  76
  77 #endif
  78
  79 /*----- Debugging machinery -----------------------------------------------*/
  80
  81 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
  82
  83 #include <stdio.h>
  84
  85 #include "mp.h"
  86 #include "mptext.h"
  87
  88 static mp *get_pgoldi(void)
  89 {
  90   mp *p = MP_NEW, *t = MP_NEW;
  91
  92   p = mp_setbit(p, MP_ZERO, 448);
  93   t = mp_setbit(t, MP_ZERO, 224);
  94   p = mp_sub(p, p, t);
  95   p = mp_sub(p, p, MP_ONE);
  96   mp_drop(t);
  97   return (p);
  98 }
  99
 100 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
 101
 102 #endif
 103
 104 /*----- Loading and storing -----------------------------------------------*/
 105
 106 /* --- @fgoldi_load@ --- *
 107  *
 108  * Arguments:   @fgoldi *z@ = where to store the result
 109  *              @const octet xv[56]@ = source to read
 110  *
 111  * Returns:     ---
 112  *
 113  * Use:         Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
 114  *              external representation from @xv@ and stores it in @z@.
 115  *
 116  *              External representation is little-endian base-256.  Some
 117  *              elements have multiple encodings, which are not produced by
 118  *              correct software; use of noncanonical encodings is not an
 119  *              error, and toleration of them is considered a performance
 120  *              feature.
 121  */
 122
 123 void fgoldi_load(fgoldi *z, const octet xv[56])
 124 {
 125 #if FGOLDI_IMPL == 28
 126
 127   unsigned i;
 128   uint32 xw[14];
 129   piece b, c;
 130
 131   /* First, read the input value as words. */
 132   for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
 133
 134   /* Extract unsigned 28-bit pieces from the words. */
 135   z->P[ 0] = (xw[ 0] >> 0)&M28;
 136   z->P[ 7] = (xw[ 6] >> 4)&M28;
 137   z->P[ 8] = (xw[ 7] >> 0)&M28;
 138   z->P[15] = (xw[13] >> 4)&M28;
 139   for (i = 1; i < 7; i++) {
 140     z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
 141     z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
 142   }
 143
 144   /* Convert the nonnegative pieces into a balanced signed representation, so
 145    * each piece ends up in the interval |z_i| <= 2^27.  For each piece, if
 146    * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
 147    * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
 148    * φ^2 = φ + 1.  We delay this carry until after all of the pieces have
 149    * been balanced.  If we don't do this, then we have to do a more expensive
 150    * test for nonzeroness to decide whether to lend a bit leftwards rather
 151    * than just testing a single bit.
 152    *
 153    * Note that we don't try for a canonical representation here: both upper
 154    * and lower bounds are achievable.
 155    */
 156   b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
 157   for (i = NPIECE - 1; i--; )
 158     { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
 159   z->P[0] += c; z->P[8] += c;
 160
 161 #elif FGOLDI_IMPL == 12
 162
 163   unsigned i, j, n, w, b;
 164   uint32 a;
 165   int c;
 166
 167   /* First, convert the bytes into nonnegative pieces. */
 168   for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
 169     a |= (uint32)xv[i] << n; n += 8;
 170     if (n >= w) {
 171       z->P[j++] = a&MASK(w);
 172       a >>= w; n -= w; w = PIECEWD(j);
 173     }
 174   }
 175
 176   /* Convert the nonnegative pieces into a balanced signed representation, so
 177    * each piece ends up in the interval |z_i| <= 2^11 + 1.
 178    */
 179   b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
 180   for (i = NPIECE - 1; i--; ) {
 181     w = PIECEWD(i) - 1;
 182     b = z->P[i]&BIT(w);
 183     z->P[i] -= b << 1;
 184     z->P[i + 1] += b >> w;
 185   }
 186   z->P[0] += c; z->P[20] += c;
 187
 188 #endif
 189 }
 190
 191 /* --- @fgoldi_store@ --- *
 192  *
 193  * Arguments:   @octet zv[56]@ = where to write the result
 194  *              @const fgoldi *x@ = the field element to write
 195  *
 196  * Returns:     ---
 197  *
 198  * Use:         Stores a field element in the given octet vector in external
 199  *              representation.  A canonical encoding is always stored.
 200  */
 201
 202 void fgoldi_store(octet zv[56], const fgoldi *x)
 203 {
 204 #if FGOLDI_IMPL == 28
 205
 206   piece y[NPIECE], yy[NPIECE], c, d;
 207   uint32 u, v;
 208   mask32 m;
 209   unsigned i;
 210
 211   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 212
 213   /* First, propagate the carries.  By the end of this, we'll have all of the
 214    * the pieces canonically sized and positive, and maybe there'll be
 215    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 216    * value will be in the half-open interval [0, φ^2).  The whole represented
 217    * value is then y + φ^2 c.
 218    *
 219    * Assume that we start out with |y_i| <= 2^30.  We start off by cutting
 220    * off and reducing the carry c_15 from the topmost piece, y_15.  This
 221    * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4.  We'll add this
 222    * onto y_0 and y_8, and propagate the carries.  It's very clear that we'll
 223    * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
 224    *
 225    * Here, the y_i are signed, so we must be cautious about bithacking them.
 226    */
 227   c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
 228   for (i = 0; i < NPIECE; i++)
 229     { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
 230
 231   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 232    * y >= p, then we should subtract p from the whole value; if c = -1 then
 233    * we should add p; and otherwise we should do nothing.
 234    *
 235    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 236    *
 237    * The first job is to sort out what we wanted to do.  If c = -1 then we
 238    * want to (a) invert the constant addend and (b) feed in a carry-in;
 239    * otherwise, we don't.
 240    */
 241   m = SIGN(c)&M28;
 242   d = m&1;
 243
 244   /* Now do the addition/subtraction.  Remember that all of the y_i are
 245    * nonnegative, so shifting and masking are safe and easy.
 246    */
 247       d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
 248   for (i = 1; i < 8; i++)
 249     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 250       d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
 251   for (i = 9; i < 16; i++)
 252     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 253
 254   /* The final carry-out is in d; since we only did addition, and the y_i are
 255    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 256    * if (a) c /= 0 (in which case we know that the old value was
 257    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 258    * the subtraction didn't cause a borrow, so we must be in the case where
 259    * p <= y < φ^2.
 260    */
 261   m = NONZEROP(c) | ~NONZEROP(d - 1);
 262   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 263
 264   /* Extract 32-bit words from the value. */
 265   for (i = 0; i < 7; i++) {
 266     u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
 267     v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
 268     STORE32_L(zv + 4*i,      u);
 269     STORE32_L(zv + 4*i + 28, v);
 270   }
 271
 272 #elif FGOLDI_IMPL == 12
 273
 274   piece y[NPIECE], yy[NPIECE], c, d;
 275   uint32 a;
 276   mask32 m, mm;
 277   unsigned i, j, n, w;
 278
 279   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 280
 281   /* First, propagate the carries.  By the end of this, we'll have all of the
 282    * the pieces canonically sized and positive, and maybe there'll be
 283    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 284    * value will be in the half-open interval [0, φ^2).  The whole represented
 285    * value is then y + φ^2 c.
 286    *
 287    * Assume that we start out with |y_i| <= 2^14.  We start off by cutting
 288    * off and reducing the carry c_39 from the topmost piece, y_39.  This
 289    * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16.  We'll add this
 290    * onto y_0 and y_20, and propagate the carries.  It's very clear that
 291    * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
 292    *
 293    * Here, the y_i are signed, so we must be cautious about bithacking them.
 294    */
 295   c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
 296   for (i = 0; i < NPIECE; i++) {
 297     w = PIECEWD(i); m = (1 << w) - 1;
 298     y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
 299   }
 300
 301   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 302    * y >= p, then we should subtract p from the whole value; if c = -1 then
 303    * we should add p; and otherwise we should do nothing.
 304    *
 305    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 306    *
 307    * The first job is to sort out what we wanted to do.  If c = -1 then we
 308    * want to (a) invert the constant addend and (b) feed in a carry-in;
 309    * otherwise, we don't.
 310    */
 311   mm = SIGN(c);
 312   d = m&1;
 313
 314   /* Now do the addition/subtraction.  Remember that all of the y_i are
 315    * nonnegative, so shifting and masking are safe and easy.
 316    */
 317     d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
 318   for (i = 1; i < 20; i++) {
 319     w = PIECEWD(i); m = MASK(w);
 320     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 321   }
 322     d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
 323   for (i = 21; i < 40; i++) {
 324     w = PIECEWD(i); m = MASK(w);
 325     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 326   }
 327
 328   /* The final carry-out is in d; since we only did addition, and the y_i are
 329    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 330    * if (a) c /= 0 (in which case we know that the old value was
 331    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 332    * the subtraction didn't cause a borrow, so we must be in the case where
 333    * p <= y < φ^2.
 334    */
 335   m = NONZEROP(c) | ~NONZEROP(d - 1);
 336   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 337
 338   /* Convert that back into octets. */
 339   for (i = j = a = n = 0; i < NPIECE; i++) {
 340     a |= (uint32)y[i] << n; n += PIECEWD(i);
 341     while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
 342   }
 343
 344 #endif
 345 }
 346
 347 /* --- @fgoldi_set@ --- *
 348  *
 349  * Arguments:   @fgoldi *z@ = where to write the result
 350  *              @int a@ = a small-ish constant
 351  *
 352  * Returns:     ---
 353  *
 354  * Use:         Sets @z@ to equal @a@.
 355  */
 356
 357 void fgoldi_set(fgoldi *x, int a)
 358 {
 359   unsigned i;
 360
 361   x->P[0] = a;
 362   for (i = 1; i < NPIECE; i++) x->P[i] = 0;
 363 }
 364
 365 /*----- Basic arithmetic --------------------------------------------------*/
 366
 367 /* --- @fgoldi_add@ --- *
 368  *
 369  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 370  *              @const fgoldi *x, *y@ = two operands
 371  *
 372  * Returns:     ---
 373  *
 374  * Use:         Set @z@ to the sum %$x + y$%.
 375  */
 376
 377 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
 378 {
 379   unsigned i;
 380   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
 381 }
 382
 383 /* --- @fgoldi_sub@ --- *
 384  *
 385  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 386  *              @const fgoldi *x, *y@ = two operands
 387  *
 388  * Returns:     ---
 389  *
 390  * Use:         Set @z@ to the difference %$x - y$%.
 391  */
 392
 393 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
 394 {
 395   unsigned i;
 396   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
 397 }
 398
 399 /* --- @fgoldi_neg@ --- *
 400  *
 401  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 402  *              @const fgoldi *x@ = an operand
 403  *
 404  * Returns:     ---
 405  *
 406  * Use:         Set @z = -x@.
 407  */
 408
 409 void fgoldi_neg(fgoldi *z, const fgoldi *x)
 410 {
 411   unsigned i;
 412   for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
 413 }
 414
 415 /*----- Constant-time utilities -------------------------------------------*/
 416
 417 /* --- @fgoldi_pick2@ --- *
 418  *
 419  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 420  *              @const fgoldi *x, *y@ = two operands
 421  *              @uint32 m@ = a mask
 422  *
 423  * Returns:     ---
 424  *
 425  * Use:         If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
 426  *              @z = x@.  If @m@ has some other value, then scramble @z@ in
 427  *              an unhelpful way.
 428  */
 429
 430 void fgoldi_pick2(fgoldi *z, const fgoldi *x, const fgoldi *y, uint32 m)
 431 {
 432   mask32 mm = FIX_MASK32(m);
 433   unsigned i;
 434   for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
 435 }
 436
 437 /* --- @fgoldi_pickn@ --- *
 438  *
 439  * Arguments:   @fgoldi *z@ = where to put the result
 440  *              @const fgoldi *v@ = a table of entries
 441  *              @size_t n@ = the number of entries in @v@
 442  *              @size_t i@ = an index
 443  *
 444  * Returns:     ---
 445  *
 446  * Use:         If @0 <= i < n < 32@ then set @z = v[i]@.  If @n >= 32@ then
 447  *              do something unhelpful; otherwise, if @i >= n@ then set @z@
 448  *              to zero.
 449  */
 450
 451 void fgoldi_pickn(fgoldi *z, const fgoldi *v, size_t n, size_t i)
 452 {
 453   uint32 b = (uint32)1 << (31 - i);
 454   mask32 m;
 455   unsigned j;
 456
 457   for (j = 0; j < NPIECE; j++) z->P[j] = 0;
 458   while (n--) {
 459     m = SIGN(b);
 460     for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
 461     v++; b <<= 1;
 462   }
 463 }
 464
 465 /* --- @fgoldi_condswap@ --- *
 466  *
 467  * Arguments:   @fgoldi *x, *y@ = two operands
 468  *              @uint32 m@ = a mask
 469  *
 470  * Returns:     ---
 471  *
 472  * Use:         If @m@ is zero, do nothing; if @m@ is all-bits-set, then
 473  *              exchange @x@ and @y@.  If @m@ has some other value, then
 474  *              scramble @x@ and @y@ in an unhelpful way.
 475  */
 476
 477 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
 478 {
 479   unsigned i;
 480   mask32 mm = FIX_MASK32(m);
 481
 482   for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
 483 }
 484
 485 /* --- @fgoldi_condneg@ --- *
 486  *
 487  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 488  *              @const fgoldi *x@ = an operand
 489  *              @uint32 m@ = a mask
 490  *
 491  * Returns:     ---
 492  *
 493  * Use:         If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
 494  *              @z = -x@.  If @m@ has some other value then scramble @z@ in
 495  *              an unhelpful way.
 496  */
 497
 498 void fgoldi_condneg(fgoldi *z, const fgoldi *x, uint32 m)
 499 {
 500 #ifdef NEG_TWOC
 501   mask32 m_xor = FIX_MASK32(m);
 502   piece m_add = m&1;
 503 # define CONDNEG(x) (((x) ^ m_xor) + m_add)
 504 #else
 505   int s = PICK2(-1, +1, m);
 506 # define CONDNEG(x) (s*(x))
 507 #endif
 508
 509   unsigned i;
 510   for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
 511
 512 #undef CONDNEG
 513 }
 514
 515 /*----- Multiplication ----------------------------------------------------*/
 516
 517 #if FGOLDI_IMPL == 28
 518
 519 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
 520  * represented in a double-precision piece.  On entry, it must be the case
 521  * that |X_i| <= M <= B - 2^27 for some M.  If this is the case, then, on
 522  * exit, we will have |Z_i| <= 2^27 + M/2^27.
 523  */
 524 #define CARRY_REDUCE(z, x) do {                                         \
 525   dblpiece _t[NPIECE], _c;                                              \
 526   unsigned _i;                                                          \
 527                                                                         \
 528   /* Bias the input pieces.  This keeps the carries and so on centred   \
 529    * around zero rather than biased positive.                           \
 530    */                                                                   \
 531   for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27;               \
 532                                                                         \
 533   /* Calculate the reduced pieces.  Careful with the bithacking. */     \
 534   _c = ASR(dblpiece, _t[15], 28);                                       \
 535   (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c;                 \
 536   for (_i = 1; _i < NPIECE; _i++) {                                     \
 537     (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 +                 \
 538       ASR(dblpiece, _t[_i - 1], 28);                                    \
 539   }                                                                     \
 540   (z)[8] += _c;                                                         \
 541 } while (0)
 542
 543 #elif FGOLDI_IMPL == 12
 544
 545 static void carry_reduce(dblpiece x[NPIECE])
 546 {
 547   /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
 548
 549   unsigned i, j;
 550   dblpiece c;
 551
 552   /* The result is nearly canonical, because we do sequential carry
 553    * propagation, because smaller processors are more likely to prefer the
 554    * smaller working set than the instruction-level parallelism.
 555    *
 556    * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
 557    * Truncate x_38 to 10 bits, and add the carry onto x_39.  Truncate x_39 to
 558    * 10 bits, and add the carry onto x_0 and x_20.  And so on.
 559    *
 560    * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
 561    * The carry into x_37 is at most 2^21; so the carry out into x_38 has
 562    * magnitude at most 2^10.  In turn, |x_38| <= 2^10 before the carry, so is
 563    * now no more than 2^11 in magnitude, and the carry out into x_39 is at
 564    * most 1.  This leaves |x_39| <= 2^10 + 1 after carry propagation.
 565    *
 566    * Be careful with the bit hacking because the quantities involved are
 567    * signed.
 568    */
 569
 570   /* For each piece, we bias it so that floor division (as done by an
 571    * arithmetic right shift) and modulus (as done by bitwise-AND) does the
 572    * right thing.
 573    */
 574 #define CARRY(i, wd, b, m) do {                                         \
 575   x[i] += (b);                                                          \
 576   c = ASR(dblpiece, x[i], (wd));                                        \
 577   x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b);                         \
 578 } while (0)
 579
 580                              {             CARRY(37, 11, B10, M11);      }
 581                              { x[38] += c; CARRY(38, 11, B10, M11);      }
 582                              { x[39] += c; CARRY(39, 11, B10, M11);      }
 583                                x[20] += c;
 584   for (i = 0; i < 35; ) {
 585                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 586     for (j = i + 4; i < j; ) {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 587   }
 588                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 589   while (i < 39)             {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 590   x[39] += c;
 591 }
 592
 593 #endif
 594
 595 /* --- @fgoldi_mulconst@ --- *
 596  *
 597  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 598  *              @const fgoldi *x@ = an operand
 599  *              @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
 600  *
 601  * Returns:     ---
 602  *
 603  * Use:         Set @z@ to the product %$a x$%.
 604  */
 605
 606 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
 607 {
 608   unsigned i;
 609   dblpiece zz[NPIECE], aa = a;
 610
 611   for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
 612 #if FGOLDI_IMPL == 28
 613   CARRY_REDUCE(z->P, zz);
 614 #elif FGOLDI_IMPL == 12
 615   carry_reduce(zz);
 616   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 617 #endif
 618 }
 619
 620 /* --- @fgoldi_mul@ --- *
 621  *
 622  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 623  *              @const fgoldi *x, *y@ = two operands
 624  *
 625  * Returns:     ---
 626  *
 627  * Use:         Set @z@ to the product %$x y$%.
 628  */
 629
 630 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
 631 {
 632   dblpiece zz[NPIECE], u[NPIECE];
 633   piece ab[NPIECE/2], cd[NPIECE/2];
 634   const piece
 635     *a = x->P + NPIECE/2, *b = x->P,
 636     *c = y->P + NPIECE/2, *d = y->P;
 637   unsigned i, j;
 638
 639 #if FGOLDI_IMPL == 28
 640
 641 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 642
 643 #elif FGOLDI_IMPL == 12
 644
 645   static const unsigned short off[39] = {
 646       0,  12,  23,  34,  45,  56,  68,  79,  90, 101,
 647     112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
 648     224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
 649     336, 348, 359, 370, 381, 392, 404, 415, 426
 650   };
 651
 652 #define M(x,i, y,j)                                                     \
 653   (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
 654
 655 #endif
 656
 657   /* Behold the magic.
 658    *
 659    * Write x = a φ + b, and y = c φ + d.  Then x y = a c φ^2 +
 660    * (a d + b c) φ + b d.  Karatsuba and Ofman observed that a d + b c =
 661    * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
 662    * the prime p so that φ^2 = φ + 1.  So
 663    *
 664    *    x y = ((a + b) (c + d) - b d) φ + a c + b d
 665    */
 666
 667   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 668
 669   /* Our first job will be to calculate (1 - φ) b d, and write the result
 670    * into z.  As we do this, an interesting thing will happen.  Write
 671    * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
 672    * So, what we do is to write the product end-swapped and negated, and then
 673    * we'll subtract the (negated, remember) high half from the low half.
 674    */
 675   for (i = 0; i < NPIECE/2; i++) {
 676     for (j = 0; j < NPIECE/2 - i; j++)
 677       zz[i + j + NPIECE/2] -= M(b,i, d,j);
 678     for (; j < NPIECE/2; j++)
 679       zz[i + j - NPIECE/2] -= M(b,i, d,j);
 680   }
 681   for (i = 0; i < NPIECE/2; i++)
 682     zz[i] -= zz[i + NPIECE/2];
 683
 684   /* Next, we add on a c.  There are no surprises here. */
 685   for (i = 0; i < NPIECE/2; i++)
 686     for (j = 0; j < NPIECE/2; j++)
 687       zz[i + j] += M(a,i, c,j);
 688
 689   /* Now, calculate a + b and c + d. */
 690   for (i = 0; i < NPIECE/2; i++)
 691     { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
 692
 693   /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
 694    * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
 695    * v φ + (1 + φ) u.  We'll store u in a temporary place and add it on
 696    * twice.
 697    */
 698   for (i = 0; i < NPIECE; i++) u[i] = 0;
 699   for (i = 0; i < NPIECE/2; i++) {
 700     for (j = 0; j < NPIECE/2 - i; j++)
 701       zz[i + j + NPIECE/2] += M(ab,i, cd,j);
 702     for (; j < NPIECE/2; j++)
 703       u[i + j - NPIECE/2] += M(ab,i, cd,j);
 704   }
 705   for (i = 0; i < NPIECE/2; i++)
 706     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 707
 708 #undef M
 709
 710 #if FGOLDI_IMPL == 28
 711   /* That wraps it up for the multiplication.  Let's figure out some bounds.
 712    * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
 713    * end up the way they'd be if we'd done the thing the easy way, which
 714    * simplifies the analysis.  Suppose we started with |x_i|, |y_i| <= 9/5
 715    * 2^28.  The overheads in the result are given by the coefficients of
 716    *
 717    *    ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
 718    *
 719    * the greatest of which is 38.  So |z_i| <= 38*81/25*2^56 < 2^63.
 720    *
 721    * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
 722    * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
 723    */
 724   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 725   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 726 #elif FGOLDI_IMPL == 12
 727   carry_reduce(zz);
 728   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 729 #endif
 730 }
 731
 732 /* --- @fgoldi_sqr@ --- *
 733  *
 734  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 735  *              @const fgoldi *x@ = an operand
 736  *
 737  * Returns:     ---
 738  *
 739  * Use:         Set @z@ to the square %$x^2$%.
 740  */
 741
 742 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
 743 {
 744 #if FGOLDI_IMPL == 28
 745
 746   dblpiece zz[NPIECE], u[NPIECE];
 747   piece ab[NPIECE];
 748   const piece *a = x->P + NPIECE/2, *b = x->P;
 749   unsigned i, j;
 750
 751 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 752
 753   /* The magic is basically the same as `fgoldi_mul' above.  We write
 754    * x = a φ + b and use Karatsuba and the special prime shape.  This time,
 755    * we have
 756    *
 757    *    x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
 758    */
 759
 760   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 761
 762   /* Our first job will be to calculate (1 - φ) b^2, and write the result
 763    * into z.  Again, this interacts pleasantly with the prime shape.
 764    */
 765   for (i = 0; i < NPIECE/4; i++) {
 766     zz[2*i + NPIECE/2] -= M(b,i, b,i);
 767     for (j = i + 1; j < NPIECE/2 - i; j++)
 768       zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
 769     for (; j < NPIECE/2; j++)
 770       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 771   }
 772   for (; i < NPIECE/2; i++) {
 773     zz[2*i - NPIECE/2] -= M(b,i, b,i);
 774     for (j = i + 1; j < NPIECE/2; j++)
 775       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 776   }
 777   for (i = 0; i < NPIECE/2; i++)
 778     zz[i] -= zz[i + NPIECE/2];
 779
 780   /* Next, we add on a^2.  There are no surprises here. */
 781   for (i = 0; i < NPIECE/2; i++) {
 782     zz[2*i] += M(a,i, a,i);
 783     for (j = i + 1; j < NPIECE/2; j++)
 784       zz[i + j] += 2*M(a,i, a,j);
 785   }
 786
 787   /* Now, calculate a + b. */
 788   for (i = 0; i < NPIECE/2; i++)
 789     ab[i] = a[i] + b[i];
 790
 791   /* Finally (for the multiplication) we must add on (a + b)^2 φ.
 792    * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u.  We'll
 793    * store u in a temporary place and add it on twice.
 794    */
 795   for (i = 0; i < NPIECE; i++) u[i] = 0;
 796   for (i = 0; i < NPIECE/4; i++) {
 797     zz[2*i + NPIECE/2] += M(ab,i, ab,i);
 798     for (j = i + 1; j < NPIECE/2 - i; j++)
 799       zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
 800     for (; j < NPIECE/2; j++)
 801       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 802   }
 803   for (; i < NPIECE/2; i++) {
 804     u[2*i - NPIECE/2] += M(ab,i, ab,i);
 805     for (j = i + 1; j < NPIECE/2; j++)
 806       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 807   }
 808   for (i = 0; i < NPIECE/2; i++)
 809     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 810
 811 #undef M
 812
 813   /* Finally, carrying. */
 814   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 815   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 816
 817 #elif FGOLDI_IMPL == 12
 818   fgoldi_mul(z, x, x);
 819 #endif
 820 }
 821
 822 /*----- More advanced operations ------------------------------------------*/
 823
 824 /* --- @fgoldi_inv@ --- *
 825  *
 826  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 827  *              @const fgoldi *x@ = an operand
 828  *
 829  * Returns:     ---
 830  *
 831  * Use:         Stores in @z@ the multiplicative inverse %$x^{-1}$%.  If
 832  *              %$x = 0$% then @z@ is set to zero.  This is considered a
 833  *              feature.
 834  */
 835
 836 void fgoldi_inv(fgoldi *z, const fgoldi *x)
 837 {
 838   fgoldi t, u;
 839   unsigned i;
 840
 841 #define SQRN(z, x, n) do {                                              \
 842   fgoldi_sqr((z), (x));                                                 \
 843   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 844 } while (0)
 845
 846   /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
 847    * x = 0 as intended.  The addition chain is home-made.
 848    */                                   /* step | value */
 849   fgoldi_sqr(&u, x);                    /*    1 | 2 */
 850   fgoldi_mul(&t, &u, x);                /*    2 | 3 */
 851   SQRN(&u, &t, 2);                      /*    4 | 12 */
 852   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 853   SQRN(&u, &t, 4);                      /*    9 | 240 */
 854   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 855   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 856   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 857   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 858   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 859   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 860   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 861   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 862   fgoldi_mul(&t, &t, x);                /*   43 | 2^37 - 1 */
 863   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 864   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 865   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 866   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 867   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 868   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 869   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 870   fgoldi_mul(&u, &u, x);                /*  233 | 2^223 - 1 */
 871   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 872   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 873   SQRN(&t, &t, 2);                      /*  459 | 2^448 - 2^224 - 4 */
 874   fgoldi_mul(z, &t, x);                 /*  460 | 2^448 - 2^224 - 3 */
 875
 876 #undef SQRN
 877 }
 878
 879 /* --- @fgoldi_quosqrt@ --- *
 880  *
 881  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 882  *              @const fgoldi *x, *y@ = two operands
 883  *
 884  * Returns:     Zero if successful, @-1@ if %$x/y$% is not a square.
 885  *
 886  * Use:         Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
 887  *              If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
 888  *              \ne 0$% then the operation fails.  If you wanted a specific
 889  *              square root then you'll have to pick it yourself.
 890  */
 891
 892 int fgoldi_quosqrt(fgoldi *z, const fgoldi *x, const fgoldi *y)
 893 {
 894   fgoldi t, u, v;
 895   octet xb[56], b0[56];
 896   int32 rc = -1;
 897   mask32 m;
 898   unsigned i;
 899
 900 #define SQRN(z, x, n) do {                                              \
 901   fgoldi_sqr((z), (x));                                                 \
 902   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 903 } while (0)
 904
 905   /* This is, fortunately, significantly easier than the equivalent problem
 906    * in GF(2^255 - 19), since p == 3 (mod 4).
 907    *
 908    * If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
 909    * order r = (p - 1)/2.  Let w = v^{(p-3)/4}.  Then w^2 = v^{(p-3)/2} =
 910    * u^{r-1} = 1/v = 1/x y.  Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
 911    * is one of the square roots we seek.
 912    *
 913    * The addition chain, then, is a prefix of the previous one.
 914    */
 915   fgoldi_mul(&v, x, y);
 916
 917   fgoldi_sqr(&u, &v);                   /*    1 | 2 */
 918   fgoldi_mul(&t, &u, &v);               /*    2 | 3 */
 919   SQRN(&u, &t, 2);                      /*    4 | 12 */
 920   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 921   SQRN(&u, &t, 4);                      /*    9 | 240 */
 922   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 923   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 924   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 925   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 926   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 927   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 928   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 929   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 930   fgoldi_mul(&t, &t, &v);               /*   43 | 2^37 - 1 */
 931   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 932   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 933   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 934   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 935   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 936   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 937   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 938   fgoldi_mul(&u, &u, &v);               /*  233 | 2^223 - 1 */
 939   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 940   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 941
 942 #undef SQRN
 943
 944   /* Now we must decide whether the answer was right.  We should have z^2 =
 945    * x/y, so y z^2 = x.
 946    *
 947    * The easiest way to compare is to encode.  This isn't as wasteful as it
 948    * sounds: the hard part is normalizing the representations, which we have
 949    * to do anyway.
 950    */
 951   fgoldi_mul(z, x, &t);
 952   fgoldi_sqr(&t, z);
 953   fgoldi_mul(&t, &t, y);
 954   fgoldi_store(xb, x);
 955   fgoldi_store(b0, &t);
 956   m = -ct_memeq(xb, b0, 56);
 957   rc = PICK2(0, rc, m);
 958   return (rc);
 959 }
 960
 961 /*----- Test rig ----------------------------------------------------------*/
 962
 963 #ifdef TEST_RIG
 964
 965 #include <mLib/macros.h>
 966 #include <mLib/report.h>
 967 #include <mLib/str.h>
 968 #include <mLib/testrig.h>
 969
 970 static void fixdstr(dstr *d)
 971 {
 972   if (d->len > 56)
 973     die(1, "invalid length for fgoldi");
 974   else if (d->len < 56) {
 975     dstr_ensure(d, 56);
 976     memset(d->buf + d->len, 0, 56 - d->len);
 977     d->len = 56;
 978   }
 979 }
 980
 981 static void cvt_fgoldi(const char *buf, dstr *d)
 982 {
 983   dstr dd = DSTR_INIT;
 984
 985   type_hex.cvt(buf, &dd); fixdstr(&dd);
 986   dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
 987   fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
 988   dstr_destroy(&dd);
 989 }
 990
 991 static void dump_fgoldi(dstr *d, FILE *fp)
 992   { fdump(stderr, "???", (const piece *)d->buf); }
 993
 994 static void cvt_fgoldi_ref(const char *buf, dstr *d)
 995   { type_hex.cvt(buf, d); fixdstr(d); }
 996
 997 static void dump_fgoldi_ref(dstr *d, FILE *fp)
 998 {
 999   fgoldi x;
1000
1001   fgoldi_load(&x, (const octet *)d->buf);
1002   fdump(stderr, "???", x.P);
1003 }
1004
1005 static int eq(const fgoldi *x, dstr *d)
1006   { octet b[56]; fgoldi_store(b, x); return (MEMCMP(b, ==, d->buf, 56)); }
1007
1008 static const test_type
1009   type_fgoldi = { cvt_fgoldi, dump_fgoldi },
1010   type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
1011
1012 #define TEST_UNOP(op)                                                   \
1013   static int vrf_##op(dstr dv[])                                        \
1014   {                                                                     \
1015     fgoldi *x = (fgoldi *)dv[0].buf;                                    \
1016     fgoldi z, zz;                                                       \
1017     int ok = 1;                                                         \
1018                                                                         \
1019     fgoldi_##op(&z, x);                                                 \
1020     if (!eq(&z, &dv[1])) {                                              \
1021       ok = 0;                                                           \
1022       fprintf(stderr, "failed!\n");                                     \
1023       fdump(stderr, "x", x->P);                                         \
1024       fdump(stderr, "calc", z.P);                                       \
1025       fgoldi_load(&zz, (const octet *)dv[1].buf);                       \
1026       fdump(stderr, "z", zz.P);                                         \
1027     }                                                                   \
1028                                                                         \
1029     return (ok);                                                        \
1030   }
1031
1032 TEST_UNOP(sqr)
1033 TEST_UNOP(inv)
1034 TEST_UNOP(neg)
1035
1036 #define TEST_BINOP(op)                                                  \
1037   static int vrf_##op(dstr dv[])                                        \
1038   {                                                                     \
1039     fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;          \
1040     fgoldi z, zz;                                                       \
1041     int ok = 1;                                                         \
1042                                                                         \
1043     fgoldi_##op(&z, x, y);                                              \
1044     if (!eq(&z, &dv[2])) {                                              \
1045       ok = 0;                                                           \
1046       fprintf(stderr, "failed!\n");                                     \
1047       fdump(stderr, "x", x->P);                                         \
1048       fdump(stderr, "y", y->P);                                         \
1049       fdump(stderr, "calc", z.P);                                       \
1050       fgoldi_load(&zz, (const octet *)dv[2].buf);                       \
1051       fdump(stderr, "z", zz.P);                                         \
1052     }                                                                   \
1053                                                                         \
1054     return (ok);                                                        \
1055   }
1056
1057 TEST_BINOP(add)
1058 TEST_BINOP(sub)
1059 TEST_BINOP(mul)
1060
1061 static int vrf_mulc(dstr dv[])
1062 {
1063   fgoldi *x = (fgoldi *)dv[0].buf;
1064   long a = *(const long *)dv[1].buf;
1065   fgoldi z, zz;
1066   int ok = 1;
1067
1068   fgoldi_mulconst(&z, x, a);
1069   if (!eq(&z, &dv[2])) {
1070     ok = 0;
1071     fprintf(stderr, "failed!\n");
1072     fdump(stderr, "x", x->P);
1073     fprintf(stderr, "a = %ld\n", a);
1074     fdump(stderr, "calc", z.P);
1075     fgoldi_load(&zz, (const octet *)dv[2].buf);
1076     fdump(stderr, "z", zz.P);
1077   }
1078
1079   return (ok);
1080 }
1081
1082 static int vrf_condneg(dstr dv[])
1083 {
1084   fgoldi *x = (fgoldi *)dv[0].buf;
1085   uint32 m = *(uint32 *)dv[1].buf;
1086   fgoldi z;
1087   int ok = 1;
1088
1089   fgoldi_condneg(&z, x, m);
1090   if (!eq(&z, &dv[2])) {
1091     ok = 0;
1092     fprintf(stderr, "failed!\n");
1093     fdump(stderr, "x", x->P);
1094     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1095     fdump(stderr, "calc z", z.P);
1096     fgoldi_load(&z, (const octet *)dv[1].buf);
1097     fdump(stderr, "want z", z.P);
1098   }
1099
1100   return (ok);
1101 }
1102
1103 static int vrf_pick2(dstr dv[])
1104 {
1105   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1106   uint32 m = *(uint32 *)dv[2].buf;
1107   fgoldi z;
1108   int ok = 1;
1109
1110   fgoldi_pick2(&z, x, y, m);
1111   if (!eq(&z, &dv[3])) {
1112     ok = 0;
1113     fprintf(stderr, "failed!\n");
1114     fdump(stderr, "x", x->P);
1115     fdump(stderr, "y", y->P);
1116     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1117     fdump(stderr, "calc z", z.P);
1118     fgoldi_load(&z, (const octet *)dv[3].buf);
1119     fdump(stderr, "want z", z.P);
1120   }
1121
1122   return (ok);
1123 }
1124
1125 static int vrf_pickn(dstr dv[])
1126 {
1127   dstr d = DSTR_INIT;
1128   fgoldi v[32], z;
1129   size_t i = *(uint32 *)dv[1].buf, j, n;
1130   const char *p;
1131   char *q;
1132   int ok = 1;
1133
1134   for (q = dv[0].buf, n = 0; (p = str_qword(&q, 0)) != 0; n++)
1135     { cvt_fgoldi(p, &d); v[n] = *(fgoldi *)d.buf; }
1136
1137   fgoldi_pickn(&z, v, n, i);
1138   if (!eq(&z, &dv[2])) {
1139     ok = 0;
1140     fprintf(stderr, "failed!\n");
1141     for (j = 0; j < n; j++) {
1142       fprintf(stderr, "v[%2u]", (unsigned)j);
1143       fdump(stderr, "", v[j].P);
1144     }
1145     fprintf(stderr, "i = %u\n", (unsigned)i);
1146     fdump(stderr, "calc z", z.P);
1147     fgoldi_load(&z, (const octet *)dv[2].buf);
1148     fdump(stderr, "want z", z.P);
1149   }
1150
1151   dstr_destroy(&d);
1152   return (ok);
1153 }
1154
1155 static int vrf_condswap(dstr dv[])
1156 {
1157   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1158   fgoldi xx = *x, yy = *y;
1159   uint32 m = *(uint32 *)dv[2].buf;
1160   int ok = 1;
1161
1162   fgoldi_condswap(&xx, &yy, m);
1163   if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
1164     ok = 0;
1165     fprintf(stderr, "failed!\n");
1166     fdump(stderr, "x", x->P);
1167     fdump(stderr, "y", y->P);
1168     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1169     fdump(stderr, "calc xx", xx.P);
1170     fdump(stderr, "calc yy", yy.P);
1171     fgoldi_load(&xx, (const octet *)dv[3].buf);
1172     fgoldi_load(&yy, (const octet *)dv[4].buf);
1173     fdump(stderr, "want xx", xx.P);
1174     fdump(stderr, "want yy", yy.P);
1175   }
1176
1177   return (ok);
1178 }
1179
1180 static int vrf_quosqrt(dstr dv[])
1181 {
1182   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1183   fgoldi z, zz;
1184   int rc;
1185   int ok = 1;
1186
1187   if (dv[2].len) { fixdstr(&dv[2]); fixdstr(&dv[3]); }
1188   rc = fgoldi_quosqrt(&z, x, y);
1189   if (!dv[2].len ? !rc : (rc || (!eq(&z, &dv[2]) && !eq(&z, &dv[3])))) {
1190     ok = 0;
1191     fprintf(stderr, "failed!\n");
1192     fdump(stderr, "x", x->P);
1193     fdump(stderr, "y", y->P);
1194     if (rc) fprintf(stderr, "calc: FAIL\n");
1195     else fdump(stderr, "calc", z.P);
1196     if (!dv[2].len)
1197       fprintf(stderr, "exp: FAIL\n");
1198     else {
1199       fgoldi_load(&zz, (const octet *)dv[2].buf);
1200       fdump(stderr, "z", zz.P);
1201       fgoldi_load(&zz, (const octet *)dv[3].buf);
1202       fdump(stderr, "z'", zz.P);
1203     }
1204   }
1205
1206   return (ok);
1207 }
1208
1209 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
1210 {
1211   fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
1212     *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
1213     *y = (fgoldi *)dv[5].buf;
1214   long a = *(const long *)dv[2].buf;
1215   fgoldi umv, aumv, wpaumv, xmy, z, zz;
1216   int ok = 1;
1217
1218   fgoldi_sub(&umv, u, v);
1219   fgoldi_mulconst(&aumv, &umv, a);
1220   fgoldi_add(&wpaumv, w, &aumv);
1221   fgoldi_sub(&xmy, x, y);
1222   fgoldi_mul(&z, &wpaumv, &xmy);
1223
1224   if (!eq(&z, &dv[6])) {
1225     ok = 0;
1226     fprintf(stderr, "failed!\n");
1227     fdump(stderr, "u", u->P);
1228     fdump(stderr, "v", v->P);
1229     fdump(stderr, "u - v", umv.P);
1230     fprintf(stderr, "a = %ld\n", a);
1231     fdump(stderr, "a (u - v)", aumv.P);
1232     fdump(stderr, "w + a (u - v)", wpaumv.P);
1233     fdump(stderr, "x", x->P);
1234     fdump(stderr, "y", y->P);
1235     fdump(stderr, "x - y", xmy.P);
1236     fdump(stderr, "(x - y) (w + a (u - v))", z.P);
1237     fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
1238   }
1239
1240   return (ok);
1241 }
1242
1243 static test_chunk tests[] = {
1244   { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1245   { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1246   { "neg", vrf_neg, { &type_fgoldi, &type_fgoldi_ref } },
1247   { "condneg", vrf_condneg,
1248     { &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1249   { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1250   { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
1251   { "pick2", vrf_pick2,
1252     { &type_fgoldi, &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1253   { "pickn", vrf_pickn,
1254     { &type_string, &type_uint32, &type_fgoldi_ref } },
1255   { "condswap", vrf_condswap,
1256     { &type_fgoldi, &type_fgoldi, &type_uint32,
1257       &type_fgoldi_ref, &type_fgoldi_ref } },
1258   { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
1259   { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
1260   { "quosqrt", vrf_quosqrt,
1261     { &type_fgoldi, &type_fgoldi, &type_hex, &type_hex } },
1262   { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
1263     { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
1264       &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1265   { 0, 0, { 0 } }
1266 };
1267
1268 int main(int argc, char *argv[])
1269 {
1270   test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
1271   return (0);
1272 }
1273
1274 #endif
1275
1276 /*----- That's all, folks -------------------------------------------------*/