3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
28 /*----- Header files ------------------------------------------------------*/
35 /*----- Basic setup -------------------------------------------------------*
37 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
41 typedef fgoldi_piece piece;
44 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
45 * x = SUM_{0<=i<16} x_i 2^(28i).
48 typedef int64 dblpiece;
49 typedef uint32 upiece; typedef uint64 udblpiece;
54 #define B28 0x10000000u
55 #define B27 0x08000000u
56 #define M28 0x0fffffffu
57 #define M27 0x07ffffffu
58 #define M32 0xffffffffu
60 #elif FGOLDI_IMPL == 12
61 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
62 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
63 * bits wide; the others are 11 bits wide, so they form eight groups of 56
67 typedef int32 dblpiece;
68 typedef uint16 upiece; typedef uint32 udblpiece;
69 #define PIECEWD(i) ((i)%5 ? 11 : 12)
83 /*----- Debugging machinery -----------------------------------------------*/
85 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
92 static mp *get_pgoldi(void)
94 mp *p = MP_NEW, *t = MP_NEW;
96 p = mp_setbit(p, MP_ZERO, 448);
97 t = mp_setbit(t, MP_ZERO, 224);
99 p = mp_sub(p, p, MP_ONE);
104 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
108 /*----- Loading and storing -----------------------------------------------*/
110 /* --- @fgoldi_load@ --- *
112 * Arguments: @fgoldi *z@ = where to store the result
113 * @const octet xv[56]@ = source to read
117 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
118 * external representation from @xv@ and stores it in @z@.
120 * External representation is little-endian base-256. Some
121 * elements have multiple encodings, which are not produced by
122 * correct software; use of noncanonical encodings is not an
123 * error, and toleration of them is considered a performance
127 void fgoldi_load(fgoldi *z, const octet xv[56])
129 #if FGOLDI_IMPL == 28
135 /* First, read the input value as words. */
136 for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
138 /* Extract unsigned 28-bit pieces from the words. */
139 z->P[ 0] = (xw[ 0] >> 0)&M28;
140 z->P[ 7] = (xw[ 6] >> 4)&M28;
141 z->P[ 8] = (xw[ 7] >> 0)&M28;
142 z->P[15] = (xw[13] >> 4)&M28;
143 for (i = 1; i < 7; i++) {
144 z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
145 z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
148 /* Convert the nonnegative pieces into a balanced signed representation, so
149 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
150 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
151 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
152 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
153 * been balanced. If we don't do this, then we have to do a more expensive
154 * test for nonzeroness to decide whether to lend a bit leftwards rather
155 * than just testing a single bit.
157 * Note that we don't try for a canonical representation here: both upper
158 * and lower bounds are achievable.
160 b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
161 for (i = NPIECE - 1; i--; )
162 { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
163 z->P[0] += c; z->P[8] += c;
165 #elif FGOLDI_IMPL == 12
167 unsigned i, j, n, w, b;
171 /* First, convert the bytes into nonnegative pieces. */
172 for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
173 a |= (uint32)xv[i] << n; n += 8;
175 z->P[j++] = a&MASK(w);
176 a >>= w; n -= w; w = PIECEWD(j);
180 /* Convert the nonnegative pieces into a balanced signed representation, so
181 * each piece ends up in the interval |z_i| <= 2^11 + 1.
183 b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
184 for (i = NPIECE - 1; i--; ) {
188 z->P[i + 1] += b >> w;
190 z->P[0] += c; z->P[20] += c;
195 /* --- @fgoldi_store@ --- *
197 * Arguments: @octet zv[56]@ = where to write the result
198 * @const fgoldi *x@ = the field element to write
202 * Use: Stores a field element in the given octet vector in external
203 * representation. A canonical encoding is always stored.
206 void fgoldi_store(octet zv[56], const fgoldi *x)
208 #if FGOLDI_IMPL == 28
210 piece y[NPIECE], yy[NPIECE], c, d;
215 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
217 /* First, propagate the carries. By the end of this, we'll have all of the
218 * the pieces canonically sized and positive, and maybe there'll be
219 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
220 * value will be in the half-open interval [0, φ^2). The whole represented
221 * value is then y + φ^2 c.
223 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
224 * off and reducing the carry c_15 from the topmost piece, y_15. This
225 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
226 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
227 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
229 * Here, the y_i are signed, so we must be cautious about bithacking them.
231 c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
232 for (i = 0; i < NPIECE; i++)
233 { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
235 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
236 * y >= p, then we should subtract p from the whole value; if c = -1 then
237 * we should add p; and otherwise we should do nothing.
239 * But conditional behaviour is bad, m'kay. So here's what we do instead.
241 * The first job is to sort out what we wanted to do. If c = -1 then we
242 * want to (a) invert the constant addend and (b) feed in a carry-in;
243 * otherwise, we don't.
248 /* Now do the addition/subtraction. Remember that all of the y_i are
249 * nonnegative, so shifting and masking are safe and easy.
251 d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
252 for (i = 1; i < 8; i++)
253 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
254 d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
255 for (i = 9; i < 16; i++)
256 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
258 /* The final carry-out is in d; since we only did addition, and the y_i are
259 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
260 * if (a) c /= 0 (in which case we know that the old value was
261 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
262 * the subtraction didn't cause a borrow, so we must be in the case where
265 m = NONZEROP(c) | ~NONZEROP(d - 1);
266 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
268 /* Extract 32-bit words from the value. */
269 for (i = 0; i < 7; i++) {
270 u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
271 v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
272 STORE32_L(zv + 4*i, u);
273 STORE32_L(zv + 4*i + 28, v);
276 #elif FGOLDI_IMPL == 12
278 piece y[NPIECE], yy[NPIECE], c, d;
283 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
285 /* First, propagate the carries. By the end of this, we'll have all of the
286 * the pieces canonically sized and positive, and maybe there'll be
287 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
288 * value will be in the half-open interval [0, φ^2). The whole represented
289 * value is then y + φ^2 c.
291 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
292 * off and reducing the carry c_39 from the topmost piece, y_39. This
293 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
294 * onto y_0 and y_20, and propagate the carries. It's very clear that
295 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
297 * Here, the y_i are signed, so we must be cautious about bithacking them.
299 c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
300 for (i = 0; i < NPIECE; i++) {
301 w = PIECEWD(i); m = (1 << w) - 1;
302 y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
305 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
306 * y >= p, then we should subtract p from the whole value; if c = -1 then
307 * we should add p; and otherwise we should do nothing.
309 * But conditional behaviour is bad, m'kay. So here's what we do instead.
311 * The first job is to sort out what we wanted to do. If c = -1 then we
312 * want to (a) invert the constant addend and (b) feed in a carry-in;
313 * otherwise, we don't.
318 /* Now do the addition/subtraction. Remember that all of the y_i are
319 * nonnegative, so shifting and masking are safe and easy.
321 d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
322 for (i = 1; i < 20; i++) {
323 w = PIECEWD(i); m = MASK(w);
324 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
326 d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
327 for (i = 21; i < 40; i++) {
328 w = PIECEWD(i); m = MASK(w);
329 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
332 /* The final carry-out is in d; since we only did addition, and the y_i are
333 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
334 * if (a) c /= 0 (in which case we know that the old value was
335 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
336 * the subtraction didn't cause a borrow, so we must be in the case where
339 m = NONZEROP(c) | ~NONZEROP(d - 1);
340 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
342 /* Convert that back into octets. */
343 for (i = j = a = n = 0; i < NPIECE; i++) {
344 a |= (uint32)y[i] << n; n += PIECEWD(i);
345 while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
351 /* --- @fgoldi_set@ --- *
353 * Arguments: @fgoldi *z@ = where to write the result
354 * @int a@ = a small-ish constant
358 * Use: Sets @z@ to equal @a@.
361 void fgoldi_set(fgoldi *x, int a)
366 for (i = 1; i < NPIECE; i++) x->P[i] = 0;
369 /*----- Basic arithmetic --------------------------------------------------*/
371 /* --- @fgoldi_add@ --- *
373 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
374 * @const fgoldi *x, *y@ = two operands
378 * Use: Set @z@ to the sum %$x + y$%.
381 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
384 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
387 /* --- @fgoldi_sub@ --- *
389 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
390 * @const fgoldi *x, *y@ = two operands
394 * Use: Set @z@ to the difference %$x - y$%.
397 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
400 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
403 /* --- @fgoldi_neg@ --- *
405 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
406 * @const fgoldi *x@ = an operand
413 void fgoldi_neg(fgoldi *z, const fgoldi *x)
416 for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
419 /*----- Constant-time utilities -------------------------------------------*/
421 /* --- @fgoldi_pick2@ --- *
423 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
424 * @const fgoldi *x, *y@ = two operands
425 * @uint32 m@ = a mask
429 * Use: If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
430 * @z = x@. If @m@ has some other value, then scramble @z@ in
434 void fgoldi_pick2(fgoldi *z, const fgoldi *x, const fgoldi *y, uint32 m)
436 mask32 mm = FIX_MASK32(m);
438 for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
441 /* --- @fgoldi_pickn@ --- *
443 * Arguments: @fgoldi *z@ = where to put the result
444 * @const fgoldi *v@ = a table of entries
445 * @size_t n@ = the number of entries in @v@
446 * @size_t i@ = an index
450 * Use: If @0 <= i < n < 32@ then set @z = v[i]@. If @n >= 32@ then
451 * do something unhelpful; otherwise, if @i >= n@ then set @z@
455 void fgoldi_pickn(fgoldi *z, const fgoldi *v, size_t n, size_t i)
457 uint32 b = (uint32)1 << (31 - i);
461 for (j = 0; j < NPIECE; j++) z->P[j] = 0;
464 for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
469 /* --- @fgoldi_condswap@ --- *
471 * Arguments: @fgoldi *x, *y@ = two operands
472 * @uint32 m@ = a mask
476 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
477 * exchange @x@ and @y@. If @m@ has some other value, then
478 * scramble @x@ and @y@ in an unhelpful way.
481 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
484 mask32 mm = FIX_MASK32(m);
486 for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
489 /* --- @fgoldi_condneg@ --- *
491 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
492 * @const fgoldi *x@ = an operand
493 * @uint32 m@ = a mask
497 * Use: If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
498 * @z = -x@. If @m@ has some other value then scramble @z@ in
502 void fgoldi_condneg(fgoldi *z, const fgoldi *x, uint32 m)
505 mask32 m_xor = FIX_MASK32(m);
507 # define CONDNEG(x) (((x) ^ m_xor) + m_add)
509 int s = PICK2(-1, +1, m);
510 # define CONDNEG(x) (s*(x))
514 for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
519 /*----- Multiplication ----------------------------------------------------*/
521 #if FGOLDI_IMPL == 28
523 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
524 * represented in a double-precision piece. On entry, it must be the case
525 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
526 * exit, we will have |Z_i| <= 2^27 + M/2^27.
528 #define CARRY_REDUCE(z, x) do { \
529 dblpiece _t[NPIECE], _c; \
532 /* Bias the input pieces. This keeps the carries and so on centred \
533 * around zero rather than biased positive. \
535 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
537 /* Calculate the reduced pieces. Careful with the bithacking. */ \
538 _c = ASR(dblpiece, _t[15], 28); \
539 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
540 for (_i = 1; _i < NPIECE; _i++) { \
541 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
542 ASR(dblpiece, _t[_i - 1], 28); \
547 #elif FGOLDI_IMPL == 12
549 static void carry_reduce(dblpiece x[NPIECE])
551 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
556 /* The result is nearly canonical, because we do sequential carry
557 * propagation, because smaller processors are more likely to prefer the
558 * smaller working set than the instruction-level parallelism.
560 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
561 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
562 * 10 bits, and add the carry onto x_0 and x_20. And so on.
564 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
565 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
566 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
567 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
568 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
570 * Be careful with the bit hacking because the quantities involved are
574 /* For each piece, we bias it so that floor division (as done by an
575 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
578 #define CARRY(i, wd, b, m) do { \
580 c = ASR(dblpiece, x[i], (wd)); \
581 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
584 { CARRY(37, 11, B10, M11); }
585 { x[38] += c; CARRY(38, 11, B10, M11); }
586 { x[39] += c; CARRY(39, 11, B10, M11); }
588 for (i = 0; i < 35; ) {
589 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
590 for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
592 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
593 while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
599 /* --- @fgoldi_mulconst@ --- *
601 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
602 * @const fgoldi *x@ = an operand
603 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
607 * Use: Set @z@ to the product %$a x$%.
610 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
613 dblpiece zz[NPIECE], aa = a;
615 for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
616 #if FGOLDI_IMPL == 28
617 CARRY_REDUCE(z->P, zz);
618 #elif FGOLDI_IMPL == 12
620 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
624 /* --- @fgoldi_mul@ --- *
626 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
627 * @const fgoldi *x, *y@ = two operands
631 * Use: Set @z@ to the product %$x y$%.
634 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
636 dblpiece zz[NPIECE], u[NPIECE];
637 piece ab[NPIECE/2], cd[NPIECE/2];
639 *a = x->P + NPIECE/2, *b = x->P,
640 *c = y->P + NPIECE/2, *d = y->P;
643 #if FGOLDI_IMPL == 28
645 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
647 #elif FGOLDI_IMPL == 12
649 static const unsigned short off[39] = {
650 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
651 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
652 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
653 336, 348, 359, 370, 381, 392, 404, 415, 426
656 #define M(x,i, y,j) \
657 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
663 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
664 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
665 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
666 * the prime p so that φ^2 = φ + 1. So
668 * x y = ((a + b) (c + d) - b d) φ + a c + b d
671 for (i = 0; i < NPIECE; i++) zz[i] = 0;
673 /* Our first job will be to calculate (1 - φ) b d, and write the result
674 * into z. As we do this, an interesting thing will happen. Write
675 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
676 * So, what we do is to write the product end-swapped and negated, and then
677 * we'll subtract the (negated, remember) high half from the low half.
679 for (i = 0; i < NPIECE/2; i++) {
680 for (j = 0; j < NPIECE/2 - i; j++)
681 zz[i + j + NPIECE/2] -= M(b,i, d,j);
682 for (; j < NPIECE/2; j++)
683 zz[i + j - NPIECE/2] -= M(b,i, d,j);
685 for (i = 0; i < NPIECE/2; i++)
686 zz[i] -= zz[i + NPIECE/2];
688 /* Next, we add on a c. There are no surprises here. */
689 for (i = 0; i < NPIECE/2; i++)
690 for (j = 0; j < NPIECE/2; j++)
691 zz[i + j] += M(a,i, c,j);
693 /* Now, calculate a + b and c + d. */
694 for (i = 0; i < NPIECE/2; i++)
695 { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
697 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
698 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
699 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
702 for (i = 0; i < NPIECE; i++) u[i] = 0;
703 for (i = 0; i < NPIECE/2; i++) {
704 for (j = 0; j < NPIECE/2 - i; j++)
705 zz[i + j + NPIECE/2] += M(ab,i, cd,j);
706 for (; j < NPIECE/2; j++)
707 u[i + j - NPIECE/2] += M(ab,i, cd,j);
709 for (i = 0; i < NPIECE/2; i++)
710 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
714 #if FGOLDI_IMPL == 28
715 /* That wraps it up for the multiplication. Let's figure out some bounds.
716 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
717 * end up the way they'd be if we'd done the thing the easy way, which
718 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
719 * 2^28. The overheads in the result are given by the coefficients of
721 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
723 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
725 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
726 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
728 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
729 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
730 #elif FGOLDI_IMPL == 12
732 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
736 /* --- @fgoldi_sqr@ --- *
738 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
739 * @const fgoldi *x@ = an operand
743 * Use: Set @z@ to the square %$x^2$%.
746 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
748 #if FGOLDI_IMPL == 28
750 dblpiece zz[NPIECE], u[NPIECE];
752 const piece *a = x->P + NPIECE/2, *b = x->P;
755 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
757 /* The magic is basically the same as `fgoldi_mul' above. We write
758 * x = a φ + b and use Karatsuba and the special prime shape. This time,
761 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
764 for (i = 0; i < NPIECE; i++) zz[i] = 0;
766 /* Our first job will be to calculate (1 - φ) b^2, and write the result
767 * into z. Again, this interacts pleasantly with the prime shape.
769 for (i = 0; i < NPIECE/4; i++) {
770 zz[2*i + NPIECE/2] -= M(b,i, b,i);
771 for (j = i + 1; j < NPIECE/2 - i; j++)
772 zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
773 for (; j < NPIECE/2; j++)
774 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
776 for (; i < NPIECE/2; i++) {
777 zz[2*i - NPIECE/2] -= M(b,i, b,i);
778 for (j = i + 1; j < NPIECE/2; j++)
779 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
781 for (i = 0; i < NPIECE/2; i++)
782 zz[i] -= zz[i + NPIECE/2];
784 /* Next, we add on a^2. There are no surprises here. */
785 for (i = 0; i < NPIECE/2; i++) {
786 zz[2*i] += M(a,i, a,i);
787 for (j = i + 1; j < NPIECE/2; j++)
788 zz[i + j] += 2*M(a,i, a,j);
791 /* Now, calculate a + b. */
792 for (i = 0; i < NPIECE/2; i++)
795 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
796 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
797 * store u in a temporary place and add it on twice.
799 for (i = 0; i < NPIECE; i++) u[i] = 0;
800 for (i = 0; i < NPIECE/4; i++) {
801 zz[2*i + NPIECE/2] += M(ab,i, ab,i);
802 for (j = i + 1; j < NPIECE/2 - i; j++)
803 zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
804 for (; j < NPIECE/2; j++)
805 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
807 for (; i < NPIECE/2; i++) {
808 u[2*i - NPIECE/2] += M(ab,i, ab,i);
809 for (j = i + 1; j < NPIECE/2; j++)
810 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
812 for (i = 0; i < NPIECE/2; i++)
813 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
817 /* Finally, carrying. */
818 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
819 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
821 #elif FGOLDI_IMPL == 12
826 /*----- More advanced operations ------------------------------------------*/
828 /* --- @fgoldi_inv@ --- *
830 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
831 * @const fgoldi *x@ = an operand
835 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
836 * %$x = 0$% then @z@ is set to zero. This is considered a
840 void fgoldi_inv(fgoldi *z, const fgoldi *x)
845 #define SQRN(z, x, n) do { \
846 fgoldi_sqr((z), (x)); \
847 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
850 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
851 * x = 0 as intended. The addition chain is home-made.
852 */ /* step | value */
853 fgoldi_sqr(&u, x); /* 1 | 2 */
854 fgoldi_mul(&t, &u, x); /* 2 | 3 */
855 SQRN(&u, &t, 2); /* 4 | 12 */
856 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
857 SQRN(&u, &t, 4); /* 9 | 240 */
858 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
859 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
860 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
861 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
862 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
863 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
864 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
865 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
866 fgoldi_mul(&t, &t, x); /* 43 | 2^37 - 1 */
867 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
868 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
869 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
870 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
871 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
872 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
873 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
874 fgoldi_mul(&u, &u, x); /* 233 | 2^223 - 1 */
875 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
876 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
877 SQRN(&t, &t, 2); /* 459 | 2^448 - 2^224 - 4 */
878 fgoldi_mul(z, &t, x); /* 460 | 2^448 - 2^224 - 3 */
883 /* --- @fgoldi_quosqrt@ --- *
885 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
886 * @const fgoldi *x, *y@ = two operands
888 * Returns: Zero if successful, @-1@ if %$x/y$% is not a square.
890 * Use: Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
891 * If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
892 * \ne 0$% then the operation fails. If you wanted a specific
893 * square root then you'll have to pick it yourself.
896 int fgoldi_quosqrt(fgoldi *z, const fgoldi *x, const fgoldi *y)
899 octet xb[56], b0[56];
904 #define SQRN(z, x, n) do { \
905 fgoldi_sqr((z), (x)); \
906 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
909 /* This is, fortunately, significantly easier than the equivalent problem
910 * in GF(2^255 - 19), since p == 3 (mod 4).
912 * If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
913 * order r = (p - 1)/2. Let w = v^{(p-3)/4}. Then w^2 = v^{(p-3)/2} =
914 * u^{r-1} = 1/v = 1/x y. Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
915 * is one of the square roots we seek.
917 * The addition chain, then, is a prefix of the previous one.
919 fgoldi_mul(&v, x, y);
921 fgoldi_sqr(&u, &v); /* 1 | 2 */
922 fgoldi_mul(&t, &u, &v); /* 2 | 3 */
923 SQRN(&u, &t, 2); /* 4 | 12 */
924 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
925 SQRN(&u, &t, 4); /* 9 | 240 */
926 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
927 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
928 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
929 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
930 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
931 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
932 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
933 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
934 fgoldi_mul(&t, &t, &v); /* 43 | 2^37 - 1 */
935 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
936 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
937 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
938 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
939 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
940 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
941 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
942 fgoldi_mul(&u, &u, &v); /* 233 | 2^223 - 1 */
943 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
944 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
948 /* Now we must decide whether the answer was right. We should have z^2 =
951 * The easiest way to compare is to encode. This isn't as wasteful as it
952 * sounds: the hard part is normalizing the representations, which we have
955 fgoldi_mul(z, x, &t);
957 fgoldi_mul(&t, &t, y);
959 fgoldi_store(b0, &t);
960 m = -ct_memeq(xb, b0, 56);
961 rc = PICK2(0, rc, m);
965 /*----- Test rig ----------------------------------------------------------*/
969 #include <mLib/report.h>
970 #include <mLib/str.h>
971 #include <mLib/testrig.h>
973 static void fixdstr(dstr *d)
976 die(1, "invalid length for fgoldi");
977 else if (d->len < 56) {
979 memset(d->buf + d->len, 0, 56 - d->len);
984 static void cvt_fgoldi(const char *buf, dstr *d)
988 type_hex.cvt(buf, &dd); fixdstr(&dd);
989 dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
990 fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
994 static void dump_fgoldi(dstr *d, FILE *fp)
995 { fdump(stderr, "???", (const piece *)d->buf); }
997 static void cvt_fgoldi_ref(const char *buf, dstr *d)
998 { type_hex.cvt(buf, d); fixdstr(d); }
1000 static void dump_fgoldi_ref(dstr *d, FILE *fp)
1004 fgoldi_load(&x, (const octet *)d->buf);
1005 fdump(stderr, "???", x.P);
1008 static int eq(const fgoldi *x, dstr *d)
1009 { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
1011 static const test_type
1012 type_fgoldi = { cvt_fgoldi, dump_fgoldi },
1013 type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
1015 #define TEST_UNOP(op) \
1016 static int vrf_##op(dstr dv[]) \
1018 fgoldi *x = (fgoldi *)dv[0].buf; \
1022 fgoldi_##op(&z, x); \
1023 if (!eq(&z, &dv[1])) { \
1025 fprintf(stderr, "failed!\n"); \
1026 fdump(stderr, "x", x->P); \
1027 fdump(stderr, "calc", z.P); \
1028 fgoldi_load(&zz, (const octet *)dv[1].buf); \
1029 fdump(stderr, "z", zz.P); \
1039 #define TEST_BINOP(op) \
1040 static int vrf_##op(dstr dv[]) \
1042 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
1046 fgoldi_##op(&z, x, y); \
1047 if (!eq(&z, &dv[2])) { \
1049 fprintf(stderr, "failed!\n"); \
1050 fdump(stderr, "x", x->P); \
1051 fdump(stderr, "y", y->P); \
1052 fdump(stderr, "calc", z.P); \
1053 fgoldi_load(&zz, (const octet *)dv[2].buf); \
1054 fdump(stderr, "z", zz.P); \
1064 static int vrf_mulc(dstr dv[])
1066 fgoldi *x = (fgoldi *)dv[0].buf;
1067 long a = *(const long *)dv[1].buf;
1071 fgoldi_mulconst(&z, x, a);
1072 if (!eq(&z, &dv[2])) {
1074 fprintf(stderr, "failed!\n");
1075 fdump(stderr, "x", x->P);
1076 fprintf(stderr, "a = %ld\n", a);
1077 fdump(stderr, "calc", z.P);
1078 fgoldi_load(&zz, (const octet *)dv[2].buf);
1079 fdump(stderr, "z", zz.P);
1085 static int vrf_condneg(dstr dv[])
1087 fgoldi *x = (fgoldi *)dv[0].buf;
1088 uint32 m = *(uint32 *)dv[1].buf;
1092 fgoldi_condneg(&z, x, m);
1093 if (!eq(&z, &dv[2])) {
1095 fprintf(stderr, "failed!\n");
1096 fdump(stderr, "x", x->P);
1097 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1098 fdump(stderr, "calc z", z.P);
1099 fgoldi_load(&z, (const octet *)dv[1].buf);
1100 fdump(stderr, "want z", z.P);
1106 static int vrf_pick2(dstr dv[])
1108 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1109 uint32 m = *(uint32 *)dv[2].buf;
1113 fgoldi_pick2(&z, x, y, m);
1114 if (!eq(&z, &dv[3])) {
1116 fprintf(stderr, "failed!\n");
1117 fdump(stderr, "x", x->P);
1118 fdump(stderr, "y", y->P);
1119 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1120 fdump(stderr, "calc z", z.P);
1121 fgoldi_load(&z, (const octet *)dv[3].buf);
1122 fdump(stderr, "want z", z.P);
1128 static int vrf_pickn(dstr dv[])
1132 size_t i = *(uint32 *)dv[1].buf, j, n;
1137 for (q = dv[0].buf, n = 0; (p = str_qword(&q, 0)) != 0; n++)
1138 { cvt_fgoldi(p, &d); v[n] = *(fgoldi *)d.buf; }
1140 fgoldi_pickn(&z, v, n, i);
1141 if (!eq(&z, &dv[2])) {
1143 fprintf(stderr, "failed!\n");
1144 for (j = 0; j < n; j++) {
1145 fprintf(stderr, "v[%2u]", (unsigned)j);
1146 fdump(stderr, "", v[j].P);
1148 fprintf(stderr, "i = %u\n", (unsigned)i);
1149 fdump(stderr, "calc z", z.P);
1150 fgoldi_load(&z, (const octet *)dv[2].buf);
1151 fdump(stderr, "want z", z.P);
1158 static int vrf_condswap(dstr dv[])
1160 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1161 fgoldi xx = *x, yy = *y;
1162 uint32 m = *(uint32 *)dv[2].buf;
1165 fgoldi_condswap(&xx, &yy, m);
1166 if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
1168 fprintf(stderr, "failed!\n");
1169 fdump(stderr, "x", x->P);
1170 fdump(stderr, "y", y->P);
1171 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1172 fdump(stderr, "calc xx", xx.P);
1173 fdump(stderr, "calc yy", yy.P);
1174 fgoldi_load(&xx, (const octet *)dv[3].buf);
1175 fgoldi_load(&yy, (const octet *)dv[4].buf);
1176 fdump(stderr, "want xx", xx.P);
1177 fdump(stderr, "want yy", yy.P);
1183 static int vrf_quosqrt(dstr dv[])
1185 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1190 if (dv[2].len) { fixdstr(&dv[2]); fixdstr(&dv[3]); }
1191 rc = fgoldi_quosqrt(&z, x, y);
1192 if (!dv[2].len ? !rc : (rc || (!eq(&z, &dv[2]) && !eq(&z, &dv[3])))) {
1194 fprintf(stderr, "failed!\n");
1195 fdump(stderr, "x", x->P);
1196 fdump(stderr, "y", y->P);
1197 if (rc) fprintf(stderr, "calc: FAIL\n");
1198 else fdump(stderr, "calc", z.P);
1200 fprintf(stderr, "exp: FAIL\n");
1202 fgoldi_load(&zz, (const octet *)dv[2].buf);
1203 fdump(stderr, "z", zz.P);
1204 fgoldi_load(&zz, (const octet *)dv[3].buf);
1205 fdump(stderr, "z'", zz.P);
1212 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
1214 fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
1215 *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
1216 *y = (fgoldi *)dv[5].buf;
1217 long a = *(const long *)dv[2].buf;
1218 fgoldi umv, aumv, wpaumv, xmy, z, zz;
1221 fgoldi_sub(&umv, u, v);
1222 fgoldi_mulconst(&aumv, &umv, a);
1223 fgoldi_add(&wpaumv, w, &aumv);
1224 fgoldi_sub(&xmy, x, y);
1225 fgoldi_mul(&z, &wpaumv, &xmy);
1227 if (!eq(&z, &dv[6])) {
1229 fprintf(stderr, "failed!\n");
1230 fdump(stderr, "u", u->P);
1231 fdump(stderr, "v", v->P);
1232 fdump(stderr, "u - v", umv.P);
1233 fprintf(stderr, "a = %ld\n", a);
1234 fdump(stderr, "a (u - v)", aumv.P);
1235 fdump(stderr, "w + a (u - v)", wpaumv.P);
1236 fdump(stderr, "x", x->P);
1237 fdump(stderr, "y", y->P);
1238 fdump(stderr, "x - y", xmy.P);
1239 fdump(stderr, "(x - y) (w + a (u - v))", z.P);
1240 fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
1246 static test_chunk tests[] = {
1247 { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1248 { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1249 { "neg", vrf_neg, { &type_fgoldi, &type_fgoldi_ref } },
1250 { "condneg", vrf_condneg,
1251 { &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1252 { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1253 { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
1254 { "pick2", vrf_pick2,
1255 { &type_fgoldi, &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1256 { "pickn", vrf_pickn,
1257 { &type_string, &type_uint32, &type_fgoldi_ref } },
1258 { "condswap", vrf_condswap,
1259 { &type_fgoldi, &type_fgoldi, &type_uint32,
1260 &type_fgoldi_ref, &type_fgoldi_ref } },
1261 { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
1262 { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
1263 { "quosqrt", vrf_quosqrt,
1264 { &type_fgoldi, &type_fgoldi, &type_hex, &type_hex } },
1265 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
1266 { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
1267 &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1271 int main(int argc, char *argv[])
1273 test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
1279 /*----- That's all, folks -------------------------------------------------*/