chiark - git - mdw - catacomb/blob - math/fgoldi.c

   1 /* -*-c-*-
   2  *
   3  * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include "ct.h"
  33 #include "fgoldi.h"
  34
  35 /*----- Basic setup -------------------------------------------------------*
  36  *
  37  * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
  38  * (hence the name).
  39  */
  40
  41 typedef fgoldi_piece piece;
  42
  43 #if FGOLDI_IMPL == 28
  44 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
  45  * x = SUM_{0<=i<16} x_i 2^(28i).
  46  */
  47
  48                        typedef  int64  dblpiece;
  49 typedef uint32 upiece; typedef uint64 udblpiece;
  50 #define PIECEWD(i) 28
  51 #define NPIECE 16
  52 #define P p28
  53
  54 #define B28 0x10000000u
  55 #define B27 0x08000000u
  56 #define M28 0x0fffffffu
  57 #define M27 0x07ffffffu
  58 #define M32 0xffffffffu
  59
  60 #elif FGOLDI_IMPL == 12
  61 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
  62  * SUM_{0<=i<40} x_i 2^ceil(224i/20).  Pieces i with i == 0 (mod 5) are 12
  63  * bits wide; the others are 11 bits wide, so they form eight groups of 56
  64  * bits.
  65  */
  66
  67                        typedef  int32  dblpiece;
  68 typedef uint16 upiece; typedef uint32 udblpiece;
  69 #define PIECEWD(i) ((i)%5 ? 11 : 12)
  70 #define NPIECE 40
  71 #define P p12
  72
  73 #define B12 0x1000u
  74 #define B11 0x0800u
  75 #define B10 0x0400u
  76 #define M12 0xfffu
  77 #define M11 0x7ffu
  78 #define M10 0x3ffu
  79 #define M8 0xffu
  80
  81 #endif
  82
  83 /*----- Debugging machinery -----------------------------------------------*/
  84
  85 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
  86
  87 #include <stdio.h>
  88
  89 #include "mp.h"
  90 #include "mptext.h"
  91
  92 static mp *get_pgoldi(void)
  93 {
  94   mp *p = MP_NEW, *t = MP_NEW;
  95
  96   p = mp_setbit(p, MP_ZERO, 448);
  97   t = mp_setbit(t, MP_ZERO, 224);
  98   p = mp_sub(p, p, t);
  99   p = mp_sub(p, p, MP_ONE);
 100   mp_drop(t);
 101   return (p);
 102 }
 103
 104 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
 105
 106 #endif
 107
 108 /*----- Loading and storing -----------------------------------------------*/
 109
 110 /* --- @fgoldi_load@ --- *
 111  *
 112  * Arguments:   @fgoldi *z@ = where to store the result
 113  *              @const octet xv[56]@ = source to read
 114  *
 115  * Returns:     ---
 116  *
 117  * Use:         Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
 118  *              external representation from @xv@ and stores it in @z@.
 119  *
 120  *              External representation is little-endian base-256.  Some
 121  *              elements have multiple encodings, which are not produced by
 122  *              correct software; use of noncanonical encodings is not an
 123  *              error, and toleration of them is considered a performance
 124  *              feature.
 125  */
 126
 127 void fgoldi_load(fgoldi *z, const octet xv[56])
 128 {
 129 #if FGOLDI_IMPL == 28
 130
 131   unsigned i;
 132   uint32 xw[14];
 133   piece b, c;
 134
 135   /* First, read the input value as words. */
 136   for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
 137
 138   /* Extract unsigned 28-bit pieces from the words. */
 139   z->P[ 0] = (xw[ 0] >> 0)&M28;
 140   z->P[ 7] = (xw[ 6] >> 4)&M28;
 141   z->P[ 8] = (xw[ 7] >> 0)&M28;
 142   z->P[15] = (xw[13] >> 4)&M28;
 143   for (i = 1; i < 7; i++) {
 144     z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
 145     z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
 146   }
 147
 148   /* Convert the nonnegative pieces into a balanced signed representation, so
 149    * each piece ends up in the interval |z_i| <= 2^27.  For each piece, if
 150    * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
 151    * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
 152    * φ^2 = φ + 1.  We delay this carry until after all of the pieces have
 153    * been balanced.  If we don't do this, then we have to do a more expensive
 154    * test for nonzeroness to decide whether to lend a bit leftwards rather
 155    * than just testing a single bit.
 156    *
 157    * Note that we don't try for a canonical representation here: both upper
 158    * and lower bounds are achievable.
 159    */
 160   b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
 161   for (i = NPIECE - 1; i--; )
 162     { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
 163   z->P[0] += c; z->P[8] += c;
 164
 165 #elif FGOLDI_IMPL == 12
 166
 167   unsigned i, j, n, w, b;
 168   uint32 a;
 169   int c;
 170
 171   /* First, convert the bytes into nonnegative pieces. */
 172   for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
 173     a |= (uint32)xv[i] << n; n += 8;
 174     if (n >= w) {
 175       z->P[j++] = a&MASK(w);
 176       a >>= w; n -= w; w = PIECEWD(j);
 177     }
 178   }
 179
 180   /* Convert the nonnegative pieces into a balanced signed representation, so
 181    * each piece ends up in the interval |z_i| <= 2^11 + 1.
 182    */
 183   b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
 184   for (i = NPIECE - 1; i--; ) {
 185     w = PIECEWD(i) - 1;
 186     b = z->P[i]&BIT(w);
 187     z->P[i] -= b << 1;
 188     z->P[i + 1] += b >> w;
 189   }
 190   z->P[0] += c; z->P[20] += c;
 191
 192 #endif
 193 }
 194
 195 /* --- @fgoldi_store@ --- *
 196  *
 197  * Arguments:   @octet zv[56]@ = where to write the result
 198  *              @const fgoldi *x@ = the field element to write
 199  *
 200  * Returns:     ---
 201  *
 202  * Use:         Stores a field element in the given octet vector in external
 203  *              representation.  A canonical encoding is always stored.
 204  */
 205
 206 void fgoldi_store(octet zv[56], const fgoldi *x)
 207 {
 208 #if FGOLDI_IMPL == 28
 209
 210   piece y[NPIECE], yy[NPIECE], c, d;
 211   uint32 u, v;
 212   mask32 m;
 213   unsigned i;
 214
 215   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 216
 217   /* First, propagate the carries.  By the end of this, we'll have all of the
 218    * the pieces canonically sized and positive, and maybe there'll be
 219    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 220    * value will be in the half-open interval [0, φ^2).  The whole represented
 221    * value is then y + φ^2 c.
 222    *
 223    * Assume that we start out with |y_i| <= 2^30.  We start off by cutting
 224    * off and reducing the carry c_15 from the topmost piece, y_15.  This
 225    * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4.  We'll add this
 226    * onto y_0 and y_8, and propagate the carries.  It's very clear that we'll
 227    * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
 228    *
 229    * Here, the y_i are signed, so we must be cautious about bithacking them.
 230    */
 231   c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
 232   for (i = 0; i < NPIECE; i++)
 233     { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
 234
 235   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 236    * y >= p, then we should subtract p from the whole value; if c = -1 then
 237    * we should add p; and otherwise we should do nothing.
 238    *
 239    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 240    *
 241    * The first job is to sort out what we wanted to do.  If c = -1 then we
 242    * want to (a) invert the constant addend and (b) feed in a carry-in;
 243    * otherwise, we don't.
 244    */
 245   m = SIGN(c)&M28;
 246   d = m&1;
 247
 248   /* Now do the addition/subtraction.  Remember that all of the y_i are
 249    * nonnegative, so shifting and masking are safe and easy.
 250    */
 251       d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
 252   for (i = 1; i < 8; i++)
 253     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 254       d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
 255   for (i = 9; i < 16; i++)
 256     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 257
 258   /* The final carry-out is in d; since we only did addition, and the y_i are
 259    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 260    * if (a) c /= 0 (in which case we know that the old value was
 261    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 262    * the subtraction didn't cause a borrow, so we must be in the case where
 263    * p <= y < φ^2.
 264    */
 265   m = NONZEROP(c) | ~NONZEROP(d - 1);
 266   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 267
 268   /* Extract 32-bit words from the value. */
 269   for (i = 0; i < 7; i++) {
 270     u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
 271     v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
 272     STORE32_L(zv + 4*i,      u);
 273     STORE32_L(zv + 4*i + 28, v);
 274   }
 275
 276 #elif FGOLDI_IMPL == 12
 277
 278   piece y[NPIECE], yy[NPIECE], c, d;
 279   uint32 a;
 280   mask32 m, mm;
 281   unsigned i, j, n, w;
 282
 283   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 284
 285   /* First, propagate the carries.  By the end of this, we'll have all of the
 286    * the pieces canonically sized and positive, and maybe there'll be
 287    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 288    * value will be in the half-open interval [0, φ^2).  The whole represented
 289    * value is then y + φ^2 c.
 290    *
 291    * Assume that we start out with |y_i| <= 2^14.  We start off by cutting
 292    * off and reducing the carry c_39 from the topmost piece, y_39.  This
 293    * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16.  We'll add this
 294    * onto y_0 and y_20, and propagate the carries.  It's very clear that
 295    * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
 296    *
 297    * Here, the y_i are signed, so we must be cautious about bithacking them.
 298    */
 299   c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
 300   for (i = 0; i < NPIECE; i++) {
 301     w = PIECEWD(i); m = (1 << w) - 1;
 302     y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
 303   }
 304
 305   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 306    * y >= p, then we should subtract p from the whole value; if c = -1 then
 307    * we should add p; and otherwise we should do nothing.
 308    *
 309    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 310    *
 311    * The first job is to sort out what we wanted to do.  If c = -1 then we
 312    * want to (a) invert the constant addend and (b) feed in a carry-in;
 313    * otherwise, we don't.
 314    */
 315   mm = SIGN(c);
 316   d = m&1;
 317
 318   /* Now do the addition/subtraction.  Remember that all of the y_i are
 319    * nonnegative, so shifting and masking are safe and easy.
 320    */
 321     d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
 322   for (i = 1; i < 20; i++) {
 323     w = PIECEWD(i); m = MASK(w);
 324     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 325   }
 326     d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
 327   for (i = 21; i < 40; i++) {
 328     w = PIECEWD(i); m = MASK(w);
 329     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 330   }
 331
 332   /* The final carry-out is in d; since we only did addition, and the y_i are
 333    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 334    * if (a) c /= 0 (in which case we know that the old value was
 335    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 336    * the subtraction didn't cause a borrow, so we must be in the case where
 337    * p <= y < φ^2.
 338    */
 339   m = NONZEROP(c) | ~NONZEROP(d - 1);
 340   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 341
 342   /* Convert that back into octets. */
 343   for (i = j = a = n = 0; i < NPIECE; i++) {
 344     a |= (uint32)y[i] << n; n += PIECEWD(i);
 345     while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
 346   }
 347
 348 #endif
 349 }
 350
 351 /* --- @fgoldi_set@ --- *
 352  *
 353  * Arguments:   @fgoldi *z@ = where to write the result
 354  *              @int a@ = a small-ish constant
 355  *
 356  * Returns:     ---
 357  *
 358  * Use:         Sets @z@ to equal @a@.
 359  */
 360
 361 void fgoldi_set(fgoldi *x, int a)
 362 {
 363   unsigned i;
 364
 365   x->P[0] = a;
 366   for (i = 1; i < NPIECE; i++) x->P[i] = 0;
 367 }
 368
 369 /*----- Basic arithmetic --------------------------------------------------*/
 370
 371 /* --- @fgoldi_add@ --- *
 372  *
 373  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 374  *              @const fgoldi *x, *y@ = two operands
 375  *
 376  * Returns:     ---
 377  *
 378  * Use:         Set @z@ to the sum %$x + y$%.
 379  */
 380
 381 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
 382 {
 383   unsigned i;
 384   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
 385 }
 386
 387 /* --- @fgoldi_sub@ --- *
 388  *
 389  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 390  *              @const fgoldi *x, *y@ = two operands
 391  *
 392  * Returns:     ---
 393  *
 394  * Use:         Set @z@ to the difference %$x - y$%.
 395  */
 396
 397 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
 398 {
 399   unsigned i;
 400   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
 401 }
 402
 403 /* --- @fgoldi_neg@ --- *
 404  *
 405  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 406  *              @const fgoldi *x@ = an operand
 407  *
 408  * Returns:     ---
 409  *
 410  * Use:         Set @z = -x@.
 411  */
 412
 413 void fgoldi_neg(fgoldi *z, const fgoldi *x)
 414 {
 415   unsigned i;
 416   for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
 417 }
 418
 419 /*----- Constant-time utilities -------------------------------------------*/
 420
 421 /* --- @fgoldi_pick2@ --- *
 422  *
 423  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 424  *              @const fgoldi *x, *y@ = two operands
 425  *              @uint32 m@ = a mask
 426  *
 427  * Returns:     ---
 428  *
 429  * Use:         If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
 430  *              @z = x@.  If @m@ has some other value, then scramble @z@ in
 431  *              an unhelpful way.
 432  */
 433
 434 void fgoldi_pick2(fgoldi *z, const fgoldi *x, const fgoldi *y, uint32 m)
 435 {
 436   mask32 mm = FIX_MASK32(m);
 437   unsigned i;
 438   for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
 439 }
 440
 441 /* --- @fgoldi_pickn@ --- *
 442  *
 443  * Arguments:   @fgoldi *z@ = where to put the result
 444  *              @const fgoldi *v@ = a table of entries
 445  *              @size_t n@ = the number of entries in @v@
 446  *              @size_t i@ = an index
 447  *
 448  * Returns:     ---
 449  *
 450  * Use:         If @0 <= i < n < 32@ then set @z = v[i]@.  If @n >= 32@ then
 451  *              do something unhelpful; otherwise, if @i >= n@ then set @z@
 452  *              to zero.
 453  */
 454
 455 void fgoldi_pickn(fgoldi *z, const fgoldi *v, size_t n, size_t i)
 456 {
 457   uint32 b = (uint32)1 << (31 - i);
 458   mask32 m;
 459   unsigned j;
 460
 461   for (j = 0; j < NPIECE; j++) z->P[j] = 0;
 462   while (n--) {
 463     m = SIGN(b);
 464     for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
 465     v++; b <<= 1;
 466   }
 467 }
 468
 469 /* --- @fgoldi_condswap@ --- *
 470  *
 471  * Arguments:   @fgoldi *x, *y@ = two operands
 472  *              @uint32 m@ = a mask
 473  *
 474  * Returns:     ---
 475  *
 476  * Use:         If @m@ is zero, do nothing; if @m@ is all-bits-set, then
 477  *              exchange @x@ and @y@.  If @m@ has some other value, then
 478  *              scramble @x@ and @y@ in an unhelpful way.
 479  */
 480
 481 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
 482 {
 483   unsigned i;
 484   mask32 mm = FIX_MASK32(m);
 485
 486   for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
 487 }
 488
 489 /* --- @fgoldi_condneg@ --- *
 490  *
 491  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 492  *              @const fgoldi *x@ = an operand
 493  *              @uint32 m@ = a mask
 494  *
 495  * Returns:     ---
 496  *
 497  * Use:         If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
 498  *              @z = -x@.  If @m@ has some other value then scramble @z@ in
 499  *              an unhelpful way.
 500  */
 501
 502 void fgoldi_condneg(fgoldi *z, const fgoldi *x, uint32 m)
 503 {
 504 #ifdef NEG_TWOC
 505   mask32 m_xor = FIX_MASK32(m);
 506   piece m_add = m&1;
 507 # define CONDNEG(x) (((x) ^ m_xor) + m_add)
 508 #else
 509   int s = PICK2(-1, +1, m);
 510 # define CONDNEG(x) (s*(x))
 511 #endif
 512
 513   unsigned i;
 514   for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
 515
 516 #undef CONDNEG
 517 }
 518
 519 /*----- Multiplication ----------------------------------------------------*/
 520
 521 #if FGOLDI_IMPL == 28
 522
 523 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
 524  * represented in a double-precision piece.  On entry, it must be the case
 525  * that |X_i| <= M <= B - 2^27 for some M.  If this is the case, then, on
 526  * exit, we will have |Z_i| <= 2^27 + M/2^27.
 527  */
 528 #define CARRY_REDUCE(z, x) do {                                         \
 529   dblpiece _t[NPIECE], _c;                                              \
 530   unsigned _i;                                                          \
 531                                                                         \
 532   /* Bias the input pieces.  This keeps the carries and so on centred   \
 533    * around zero rather than biased positive.                           \
 534    */                                                                   \
 535   for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27;               \
 536                                                                         \
 537   /* Calculate the reduced pieces.  Careful with the bithacking. */     \
 538   _c = ASR(dblpiece, _t[15], 28);                                       \
 539   (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c;                 \
 540   for (_i = 1; _i < NPIECE; _i++) {                                     \
 541     (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 +                 \
 542       ASR(dblpiece, _t[_i - 1], 28);                                    \
 543   }                                                                     \
 544   (z)[8] += _c;                                                         \
 545 } while (0)
 546
 547 #elif FGOLDI_IMPL == 12
 548
 549 static void carry_reduce(dblpiece x[NPIECE])
 550 {
 551   /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
 552
 553   unsigned i, j;
 554   dblpiece c;
 555
 556   /* The result is nearly canonical, because we do sequential carry
 557    * propagation, because smaller processors are more likely to prefer the
 558    * smaller working set than the instruction-level parallelism.
 559    *
 560    * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
 561    * Truncate x_38 to 10 bits, and add the carry onto x_39.  Truncate x_39 to
 562    * 10 bits, and add the carry onto x_0 and x_20.  And so on.
 563    *
 564    * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
 565    * The carry into x_37 is at most 2^21; so the carry out into x_38 has
 566    * magnitude at most 2^10.  In turn, |x_38| <= 2^10 before the carry, so is
 567    * now no more than 2^11 in magnitude, and the carry out into x_39 is at
 568    * most 1.  This leaves |x_39| <= 2^10 + 1 after carry propagation.
 569    *
 570    * Be careful with the bit hacking because the quantities involved are
 571    * signed.
 572    */
 573
 574   /* For each piece, we bias it so that floor division (as done by an
 575    * arithmetic right shift) and modulus (as done by bitwise-AND) does the
 576    * right thing.
 577    */
 578 #define CARRY(i, wd, b, m) do {                                         \
 579   x[i] += (b);                                                          \
 580   c = ASR(dblpiece, x[i], (wd));                                        \
 581   x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b);                         \
 582 } while (0)
 583
 584                              {             CARRY(37, 11, B10, M11);      }
 585                              { x[38] += c; CARRY(38, 11, B10, M11);      }
 586                              { x[39] += c; CARRY(39, 11, B10, M11);      }
 587                                x[20] += c;
 588   for (i = 0; i < 35; ) {
 589                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 590     for (j = i + 4; i < j; ) {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 591   }
 592                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 593   while (i < 39)             {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 594   x[39] += c;
 595 }
 596
 597 #endif
 598
 599 /* --- @fgoldi_mulconst@ --- *
 600  *
 601  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 602  *              @const fgoldi *x@ = an operand
 603  *              @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
 604  *
 605  * Returns:     ---
 606  *
 607  * Use:         Set @z@ to the product %$a x$%.
 608  */
 609
 610 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
 611 {
 612   unsigned i;
 613   dblpiece zz[NPIECE], aa = a;
 614
 615   for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
 616 #if FGOLDI_IMPL == 28
 617   CARRY_REDUCE(z->P, zz);
 618 #elif FGOLDI_IMPL == 12
 619   carry_reduce(zz);
 620   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 621 #endif
 622 }
 623
 624 /* --- @fgoldi_mul@ --- *
 625  *
 626  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 627  *              @const fgoldi *x, *y@ = two operands
 628  *
 629  * Returns:     ---
 630  *
 631  * Use:         Set @z@ to the product %$x y$%.
 632  */
 633
 634 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
 635 {
 636   dblpiece zz[NPIECE], u[NPIECE];
 637   piece ab[NPIECE/2], cd[NPIECE/2];
 638   const piece
 639     *a = x->P + NPIECE/2, *b = x->P,
 640     *c = y->P + NPIECE/2, *d = y->P;
 641   unsigned i, j;
 642
 643 #if FGOLDI_IMPL == 28
 644
 645 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 646
 647 #elif FGOLDI_IMPL == 12
 648
 649   static const unsigned short off[39] = {
 650       0,  12,  23,  34,  45,  56,  68,  79,  90, 101,
 651     112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
 652     224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
 653     336, 348, 359, 370, 381, 392, 404, 415, 426
 654   };
 655
 656 #define M(x,i, y,j)                                                     \
 657   (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
 658
 659 #endif
 660
 661   /* Behold the magic.
 662    *
 663    * Write x = a φ + b, and y = c φ + d.  Then x y = a c φ^2 +
 664    * (a d + b c) φ + b d.  Karatsuba and Ofman observed that a d + b c =
 665    * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
 666    * the prime p so that φ^2 = φ + 1.  So
 667    *
 668    *    x y = ((a + b) (c + d) - b d) φ + a c + b d
 669    */
 670
 671   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 672
 673   /* Our first job will be to calculate (1 - φ) b d, and write the result
 674    * into z.  As we do this, an interesting thing will happen.  Write
 675    * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
 676    * So, what we do is to write the product end-swapped and negated, and then
 677    * we'll subtract the (negated, remember) high half from the low half.
 678    */
 679   for (i = 0; i < NPIECE/2; i++) {
 680     for (j = 0; j < NPIECE/2 - i; j++)
 681       zz[i + j + NPIECE/2] -= M(b,i, d,j);
 682     for (; j < NPIECE/2; j++)
 683       zz[i + j - NPIECE/2] -= M(b,i, d,j);
 684   }
 685   for (i = 0; i < NPIECE/2; i++)
 686     zz[i] -= zz[i + NPIECE/2];
 687
 688   /* Next, we add on a c.  There are no surprises here. */
 689   for (i = 0; i < NPIECE/2; i++)
 690     for (j = 0; j < NPIECE/2; j++)
 691       zz[i + j] += M(a,i, c,j);
 692
 693   /* Now, calculate a + b and c + d. */
 694   for (i = 0; i < NPIECE/2; i++)
 695     { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
 696
 697   /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
 698    * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
 699    * v φ + (1 + φ) u.  We'll store u in a temporary place and add it on
 700    * twice.
 701    */
 702   for (i = 0; i < NPIECE; i++) u[i] = 0;
 703   for (i = 0; i < NPIECE/2; i++) {
 704     for (j = 0; j < NPIECE/2 - i; j++)
 705       zz[i + j + NPIECE/2] += M(ab,i, cd,j);
 706     for (; j < NPIECE/2; j++)
 707       u[i + j - NPIECE/2] += M(ab,i, cd,j);
 708   }
 709   for (i = 0; i < NPIECE/2; i++)
 710     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 711
 712 #undef M
 713
 714 #if FGOLDI_IMPL == 28
 715   /* That wraps it up for the multiplication.  Let's figure out some bounds.
 716    * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
 717    * end up the way they'd be if we'd done the thing the easy way, which
 718    * simplifies the analysis.  Suppose we started with |x_i|, |y_i| <= 9/5
 719    * 2^28.  The overheads in the result are given by the coefficients of
 720    *
 721    *    ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
 722    *
 723    * the greatest of which is 38.  So |z_i| <= 38*81/25*2^56 < 2^63.
 724    *
 725    * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
 726    * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
 727    */
 728   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 729   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 730 #elif FGOLDI_IMPL == 12
 731   carry_reduce(zz);
 732   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 733 #endif
 734 }
 735
 736 /* --- @fgoldi_sqr@ --- *
 737  *
 738  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 739  *              @const fgoldi *x@ = an operand
 740  *
 741  * Returns:     ---
 742  *
 743  * Use:         Set @z@ to the square %$x^2$%.
 744  */
 745
 746 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
 747 {
 748 #if FGOLDI_IMPL == 28
 749
 750   dblpiece zz[NPIECE], u[NPIECE];
 751   piece ab[NPIECE];
 752   const piece *a = x->P + NPIECE/2, *b = x->P;
 753   unsigned i, j;
 754
 755 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 756
 757   /* The magic is basically the same as `fgoldi_mul' above.  We write
 758    * x = a φ + b and use Karatsuba and the special prime shape.  This time,
 759    * we have
 760    *
 761    *    x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
 762    */
 763
 764   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 765
 766   /* Our first job will be to calculate (1 - φ) b^2, and write the result
 767    * into z.  Again, this interacts pleasantly with the prime shape.
 768    */
 769   for (i = 0; i < NPIECE/4; i++) {
 770     zz[2*i + NPIECE/2] -= M(b,i, b,i);
 771     for (j = i + 1; j < NPIECE/2 - i; j++)
 772       zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
 773     for (; j < NPIECE/2; j++)
 774       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 775   }
 776   for (; i < NPIECE/2; i++) {
 777     zz[2*i - NPIECE/2] -= M(b,i, b,i);
 778     for (j = i + 1; j < NPIECE/2; j++)
 779       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 780   }
 781   for (i = 0; i < NPIECE/2; i++)
 782     zz[i] -= zz[i + NPIECE/2];
 783
 784   /* Next, we add on a^2.  There are no surprises here. */
 785   for (i = 0; i < NPIECE/2; i++) {
 786     zz[2*i] += M(a,i, a,i);
 787     for (j = i + 1; j < NPIECE/2; j++)
 788       zz[i + j] += 2*M(a,i, a,j);
 789   }
 790
 791   /* Now, calculate a + b. */
 792   for (i = 0; i < NPIECE/2; i++)
 793     ab[i] = a[i] + b[i];
 794
 795   /* Finally (for the multiplication) we must add on (a + b)^2 φ.
 796    * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u.  We'll
 797    * store u in a temporary place and add it on twice.
 798    */
 799   for (i = 0; i < NPIECE; i++) u[i] = 0;
 800   for (i = 0; i < NPIECE/4; i++) {
 801     zz[2*i + NPIECE/2] += M(ab,i, ab,i);
 802     for (j = i + 1; j < NPIECE/2 - i; j++)
 803       zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
 804     for (; j < NPIECE/2; j++)
 805       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 806   }
 807   for (; i < NPIECE/2; i++) {
 808     u[2*i - NPIECE/2] += M(ab,i, ab,i);
 809     for (j = i + 1; j < NPIECE/2; j++)
 810       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 811   }
 812   for (i = 0; i < NPIECE/2; i++)
 813     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 814
 815 #undef M
 816
 817   /* Finally, carrying. */
 818   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 819   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 820
 821 #elif FGOLDI_IMPL == 12
 822   fgoldi_mul(z, x, x);
 823 #endif
 824 }
 825
 826 /*----- More advanced operations ------------------------------------------*/
 827
 828 /* --- @fgoldi_inv@ --- *
 829  *
 830  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 831  *              @const fgoldi *x@ = an operand
 832  *
 833  * Returns:     ---
 834  *
 835  * Use:         Stores in @z@ the multiplicative inverse %$x^{-1}$%.  If
 836  *              %$x = 0$% then @z@ is set to zero.  This is considered a
 837  *              feature.
 838  */
 839
 840 void fgoldi_inv(fgoldi *z, const fgoldi *x)
 841 {
 842   fgoldi t, u;
 843   unsigned i;
 844
 845 #define SQRN(z, x, n) do {                                              \
 846   fgoldi_sqr((z), (x));                                                 \
 847   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 848 } while (0)
 849
 850   /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
 851    * x = 0 as intended.  The addition chain is home-made.
 852    */                                   /* step | value */
 853   fgoldi_sqr(&u, x);                    /*    1 | 2 */
 854   fgoldi_mul(&t, &u, x);                /*    2 | 3 */
 855   SQRN(&u, &t, 2);                      /*    4 | 12 */
 856   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 857   SQRN(&u, &t, 4);                      /*    9 | 240 */
 858   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 859   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 860   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 861   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 862   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 863   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 864   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 865   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 866   fgoldi_mul(&t, &t, x);                /*   43 | 2^37 - 1 */
 867   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 868   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 869   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 870   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 871   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 872   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 873   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 874   fgoldi_mul(&u, &u, x);                /*  233 | 2^223 - 1 */
 875   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 876   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 877   SQRN(&t, &t, 2);                      /*  459 | 2^448 - 2^224 - 4 */
 878   fgoldi_mul(z, &t, x);                 /*  460 | 2^448 - 2^224 - 3 */
 879
 880 #undef SQRN
 881 }
 882
 883 /* --- @fgoldi_quosqrt@ --- *
 884  *
 885  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 886  *              @const fgoldi *x, *y@ = two operands
 887  *
 888  * Returns:     Zero if successful, @-1@ if %$x/y$% is not a square.
 889  *
 890  * Use:         Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
 891  *              If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
 892  *              \ne 0$% then the operation fails.  If you wanted a specific
 893  *              square root then you'll have to pick it yourself.
 894  */
 895
 896 int fgoldi_quosqrt(fgoldi *z, const fgoldi *x, const fgoldi *y)
 897 {
 898   fgoldi t, u, v;
 899   octet xb[56], b0[56];
 900   int32 rc = -1;
 901   mask32 m;
 902   unsigned i;
 903
 904 #define SQRN(z, x, n) do {                                              \
 905   fgoldi_sqr((z), (x));                                                 \
 906   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 907 } while (0)
 908
 909   /* This is, fortunately, significantly easier than the equivalent problem
 910    * in GF(2^255 - 19), since p == 3 (mod 4).
 911    *
 912    * If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
 913    * order r = (p - 1)/2.  Let w = v^{(p-3)/4}.  Then w^2 = v^{(p-3)/2} =
 914    * u^{r-1} = 1/v = 1/x y.  Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
 915    * is one of the square roots we seek.
 916    *
 917    * The addition chain, then, is a prefix of the previous one.
 918    */
 919   fgoldi_mul(&v, x, y);
 920
 921   fgoldi_sqr(&u, &v);                   /*    1 | 2 */
 922   fgoldi_mul(&t, &u, &v);               /*    2 | 3 */
 923   SQRN(&u, &t, 2);                      /*    4 | 12 */
 924   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 925   SQRN(&u, &t, 4);                      /*    9 | 240 */
 926   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 927   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 928   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 929   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 930   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 931   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 932   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 933   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 934   fgoldi_mul(&t, &t, &v);               /*   43 | 2^37 - 1 */
 935   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 936   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 937   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 938   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 939   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 940   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 941   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 942   fgoldi_mul(&u, &u, &v);               /*  233 | 2^223 - 1 */
 943   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 944   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 945
 946 #undef SQRN
 947
 948   /* Now we must decide whether the answer was right.  We should have z^2 =
 949    * x/y, so y z^2 = x.
 950    *
 951    * The easiest way to compare is to encode.  This isn't as wasteful as it
 952    * sounds: the hard part is normalizing the representations, which we have
 953    * to do anyway.
 954    */
 955   fgoldi_mul(z, x, &t);
 956   fgoldi_sqr(&t, z);
 957   fgoldi_mul(&t, &t, y);
 958   fgoldi_store(xb, x);
 959   fgoldi_store(b0, &t);
 960   m = -ct_memeq(xb, b0, 56);
 961   rc = PICK2(0, rc, m);
 962   return (rc);
 963 }
 964
 965 /*----- Test rig ----------------------------------------------------------*/
 966
 967 #ifdef TEST_RIG
 968
 969 #include <mLib/report.h>
 970 #include <mLib/str.h>
 971 #include <mLib/testrig.h>
 972
 973 static void fixdstr(dstr *d)
 974 {
 975   if (d->len > 56)
 976     die(1, "invalid length for fgoldi");
 977   else if (d->len < 56) {
 978     dstr_ensure(d, 56);
 979     memset(d->buf + d->len, 0, 56 - d->len);
 980     d->len = 56;
 981   }
 982 }
 983
 984 static void cvt_fgoldi(const char *buf, dstr *d)
 985 {
 986   dstr dd = DSTR_INIT;
 987
 988   type_hex.cvt(buf, &dd); fixdstr(&dd);
 989   dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
 990   fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
 991   dstr_destroy(&dd);
 992 }
 993
 994 static void dump_fgoldi(dstr *d, FILE *fp)
 995   { fdump(stderr, "???", (const piece *)d->buf); }
 996
 997 static void cvt_fgoldi_ref(const char *buf, dstr *d)
 998   { type_hex.cvt(buf, d); fixdstr(d); }
 999
1000 static void dump_fgoldi_ref(dstr *d, FILE *fp)
1001 {
1002   fgoldi x;
1003
1004   fgoldi_load(&x, (const octet *)d->buf);
1005   fdump(stderr, "???", x.P);
1006 }
1007
1008 static int eq(const fgoldi *x, dstr *d)
1009   { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
1010
1011 static const test_type
1012   type_fgoldi = { cvt_fgoldi, dump_fgoldi },
1013   type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
1014
1015 #define TEST_UNOP(op)                                                   \
1016   static int vrf_##op(dstr dv[])                                        \
1017   {                                                                     \
1018     fgoldi *x = (fgoldi *)dv[0].buf;                                    \
1019     fgoldi z, zz;                                                       \
1020     int ok = 1;                                                         \
1021                                                                         \
1022     fgoldi_##op(&z, x);                                                 \
1023     if (!eq(&z, &dv[1])) {                                              \
1024       ok = 0;                                                           \
1025       fprintf(stderr, "failed!\n");                                     \
1026       fdump(stderr, "x", x->P);                                         \
1027       fdump(stderr, "calc", z.P);                                       \
1028       fgoldi_load(&zz, (const octet *)dv[1].buf);                       \
1029       fdump(stderr, "z", zz.P);                                         \
1030     }                                                                   \
1031                                                                         \
1032     return (ok);                                                        \
1033   }
1034
1035 TEST_UNOP(sqr)
1036 TEST_UNOP(inv)
1037 TEST_UNOP(neg)
1038
1039 #define TEST_BINOP(op)                                                  \
1040   static int vrf_##op(dstr dv[])                                        \
1041   {                                                                     \
1042     fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;          \
1043     fgoldi z, zz;                                                       \
1044     int ok = 1;                                                         \
1045                                                                         \
1046     fgoldi_##op(&z, x, y);                                              \
1047     if (!eq(&z, &dv[2])) {                                              \
1048       ok = 0;                                                           \
1049       fprintf(stderr, "failed!\n");                                     \
1050       fdump(stderr, "x", x->P);                                         \
1051       fdump(stderr, "y", y->P);                                         \
1052       fdump(stderr, "calc", z.P);                                       \
1053       fgoldi_load(&zz, (const octet *)dv[2].buf);                       \
1054       fdump(stderr, "z", zz.P);                                         \
1055     }                                                                   \
1056                                                                         \
1057     return (ok);                                                        \
1058   }
1059
1060 TEST_BINOP(add)
1061 TEST_BINOP(sub)
1062 TEST_BINOP(mul)
1063
1064 static int vrf_mulc(dstr dv[])
1065 {
1066   fgoldi *x = (fgoldi *)dv[0].buf;
1067   long a = *(const long *)dv[1].buf;
1068   fgoldi z, zz;
1069   int ok = 1;
1070
1071   fgoldi_mulconst(&z, x, a);
1072   if (!eq(&z, &dv[2])) {
1073     ok = 0;
1074     fprintf(stderr, "failed!\n");
1075     fdump(stderr, "x", x->P);
1076     fprintf(stderr, "a = %ld\n", a);
1077     fdump(stderr, "calc", z.P);
1078     fgoldi_load(&zz, (const octet *)dv[2].buf);
1079     fdump(stderr, "z", zz.P);
1080   }
1081
1082   return (ok);
1083 }
1084
1085 static int vrf_condneg(dstr dv[])
1086 {
1087   fgoldi *x = (fgoldi *)dv[0].buf;
1088   uint32 m = *(uint32 *)dv[1].buf;
1089   fgoldi z;
1090   int ok = 1;
1091
1092   fgoldi_condneg(&z, x, m);
1093   if (!eq(&z, &dv[2])) {
1094     ok = 0;
1095     fprintf(stderr, "failed!\n");
1096     fdump(stderr, "x", x->P);
1097     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1098     fdump(stderr, "calc z", z.P);
1099     fgoldi_load(&z, (const octet *)dv[1].buf);
1100     fdump(stderr, "want z", z.P);
1101   }
1102
1103   return (ok);
1104 }
1105
1106 static int vrf_pick2(dstr dv[])
1107 {
1108   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1109   uint32 m = *(uint32 *)dv[2].buf;
1110   fgoldi z;
1111   int ok = 1;
1112
1113   fgoldi_pick2(&z, x, y, m);
1114   if (!eq(&z, &dv[3])) {
1115     ok = 0;
1116     fprintf(stderr, "failed!\n");
1117     fdump(stderr, "x", x->P);
1118     fdump(stderr, "y", y->P);
1119     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1120     fdump(stderr, "calc z", z.P);
1121     fgoldi_load(&z, (const octet *)dv[3].buf);
1122     fdump(stderr, "want z", z.P);
1123   }
1124
1125   return (ok);
1126 }
1127
1128 static int vrf_pickn(dstr dv[])
1129 {
1130   dstr d = DSTR_INIT;
1131   fgoldi v[32], z;
1132   size_t i = *(uint32 *)dv[1].buf, j, n;
1133   const char *p;
1134   char *q;
1135   int ok = 1;
1136
1137   for (q = dv[0].buf, n = 0; (p = str_qword(&q, 0)) != 0; n++)
1138     { cvt_fgoldi(p, &d); v[n] = *(fgoldi *)d.buf; }
1139
1140   fgoldi_pickn(&z, v, n, i);
1141   if (!eq(&z, &dv[2])) {
1142     ok = 0;
1143     fprintf(stderr, "failed!\n");
1144     for (j = 0; j < n; j++) {
1145       fprintf(stderr, "v[%2u]", (unsigned)j);
1146       fdump(stderr, "", v[j].P);
1147     }
1148     fprintf(stderr, "i = %u\n", (unsigned)i);
1149     fdump(stderr, "calc z", z.P);
1150     fgoldi_load(&z, (const octet *)dv[2].buf);
1151     fdump(stderr, "want z", z.P);
1152   }
1153
1154   dstr_destroy(&d);
1155   return (ok);
1156 }
1157
1158 static int vrf_condswap(dstr dv[])
1159 {
1160   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1161   fgoldi xx = *x, yy = *y;
1162   uint32 m = *(uint32 *)dv[2].buf;
1163   int ok = 1;
1164
1165   fgoldi_condswap(&xx, &yy, m);
1166   if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
1167     ok = 0;
1168     fprintf(stderr, "failed!\n");
1169     fdump(stderr, "x", x->P);
1170     fdump(stderr, "y", y->P);
1171     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1172     fdump(stderr, "calc xx", xx.P);
1173     fdump(stderr, "calc yy", yy.P);
1174     fgoldi_load(&xx, (const octet *)dv[3].buf);
1175     fgoldi_load(&yy, (const octet *)dv[4].buf);
1176     fdump(stderr, "want xx", xx.P);
1177     fdump(stderr, "want yy", yy.P);
1178   }
1179
1180   return (ok);
1181 }
1182
1183 static int vrf_quosqrt(dstr dv[])
1184 {
1185   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1186   fgoldi z, zz;
1187   int rc;
1188   int ok = 1;
1189
1190   if (dv[2].len) { fixdstr(&dv[2]); fixdstr(&dv[3]); }
1191   rc = fgoldi_quosqrt(&z, x, y);
1192   if (!dv[2].len ? !rc : (rc || (!eq(&z, &dv[2]) && !eq(&z, &dv[3])))) {
1193     ok = 0;
1194     fprintf(stderr, "failed!\n");
1195     fdump(stderr, "x", x->P);
1196     fdump(stderr, "y", y->P);
1197     if (rc) fprintf(stderr, "calc: FAIL\n");
1198     else fdump(stderr, "calc", z.P);
1199     if (!dv[2].len)
1200       fprintf(stderr, "exp: FAIL\n");
1201     else {
1202       fgoldi_load(&zz, (const octet *)dv[2].buf);
1203       fdump(stderr, "z", zz.P);
1204       fgoldi_load(&zz, (const octet *)dv[3].buf);
1205       fdump(stderr, "z'", zz.P);
1206     }
1207   }
1208
1209   return (ok);
1210 }
1211
1212 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
1213 {
1214   fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
1215     *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
1216     *y = (fgoldi *)dv[5].buf;
1217   long a = *(const long *)dv[2].buf;
1218   fgoldi umv, aumv, wpaumv, xmy, z, zz;
1219   int ok = 1;
1220
1221   fgoldi_sub(&umv, u, v);
1222   fgoldi_mulconst(&aumv, &umv, a);
1223   fgoldi_add(&wpaumv, w, &aumv);
1224   fgoldi_sub(&xmy, x, y);
1225   fgoldi_mul(&z, &wpaumv, &xmy);
1226
1227   if (!eq(&z, &dv[6])) {
1228     ok = 0;
1229     fprintf(stderr, "failed!\n");
1230     fdump(stderr, "u", u->P);
1231     fdump(stderr, "v", v->P);
1232     fdump(stderr, "u - v", umv.P);
1233     fprintf(stderr, "a = %ld\n", a);
1234     fdump(stderr, "a (u - v)", aumv.P);
1235     fdump(stderr, "w + a (u - v)", wpaumv.P);
1236     fdump(stderr, "x", x->P);
1237     fdump(stderr, "y", y->P);
1238     fdump(stderr, "x - y", xmy.P);
1239     fdump(stderr, "(x - y) (w + a (u - v))", z.P);
1240     fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
1241   }
1242
1243   return (ok);
1244 }
1245
1246 static test_chunk tests[] = {
1247   { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1248   { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1249   { "neg", vrf_neg, { &type_fgoldi, &type_fgoldi_ref } },
1250   { "condneg", vrf_condneg,
1251     { &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1252   { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1253   { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
1254   { "pick2", vrf_pick2,
1255     { &type_fgoldi, &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1256   { "pickn", vrf_pickn,
1257     { &type_string, &type_uint32, &type_fgoldi_ref } },
1258   { "condswap", vrf_condswap,
1259     { &type_fgoldi, &type_fgoldi, &type_uint32,
1260       &type_fgoldi_ref, &type_fgoldi_ref } },
1261   { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
1262   { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
1263   { "quosqrt", vrf_quosqrt,
1264     { &type_fgoldi, &type_fgoldi, &type_hex, &type_hex } },
1265   { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
1266     { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
1267       &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1268   { 0, 0, { 0 } }
1269 };
1270
1271 int main(int argc, char *argv[])
1272 {
1273   test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
1274   return (0);
1275 }
1276
1277 #endif
1278
1279 /*----- That's all, folks -------------------------------------------------*/