1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// External definitions.
31 #include "asm-common.h"
33 ///--------------------------------------------------------------------------
39 ///--------------------------------------------------------------------------
42 /// We define a number of primitive fixed-size multipliers from which we can
43 /// construct more general variable-length multipliers.
45 /// The basic trick is the same throughout. In an operand-scanning
46 /// multiplication, the inner multiplication loop multiplies a
47 /// multiple-precision operand by a single precision factor, and adds the
48 /// result, appropriately shifted, to the result. A `finely integrated
49 /// operand scanning' implementation of Montgomery multiplication also adds
50 /// the product of a single-precision `Montgomery factor' and the modulus,
51 /// calculated in the same pass. The more common `coarsely integrated
52 /// operand scanning' alternates main multiplication and Montgomery passes,
53 /// which requires additional carry propagation.
55 /// Throughout both plain-multiplication and Montgomery stages, then, one of
56 /// the factors remains constant throughout the operation, so we can afford
57 /// to take a little time to preprocess it. The transformation we perform is
58 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
59 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
60 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
61 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
62 /// operands, as follows.
65 /// 0 v'_0 v'_1 v''_0 v''_1
66 /// 16 v'_2 v'_3 v''_2 v''_3
68 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
69 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
70 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
71 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
72 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
73 /// many products together before we must deal with carrying; it also allows
74 /// for some calculations to be performed on the above expanded form.
78 /// We maintain four `carry' registers accumulating intermediate results.
79 /// The registers' precise roles rotate during the computation; we name them
80 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
85 /// lanes of its vector) and an operand in the expanded form above produces a
86 /// result which can be added directly to the appropriate carry register.
87 /// Following a pass of four multiplications, we perform some limited carry
88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
91 /// zeroed, becomes c3.
93 ///--------------------------------------------------------------------------
94 /// Macro definitions.
96 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
97 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
98 // of the product in registers D0, D1, D2, D3.
// D1--D3 default to `nil': callers below invoke this macro with fewer
// destination registers to compute only the pieces they need.
// NOTE(review): the conditional-assembly guards for the `nil' cases and
// the closing `.endm' are not visible in this extract -- confirm against
// the full source before editing.
99 pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
101 movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1)
104 movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3)
107 psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
110 movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
113 psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
116 pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
119 pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
122 pmuludq \d2, \shi // (r_i s'_2, r_i s''_2)
124 pmuludq \d0, \slo // (r_i s'_0, r_i s''_0)
127 .macro accum c0, c1=nil, c2=nil, c3=nil
128 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
129 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
130 // updating that register.
// NOTE(review): the macro body (orig lines 131--142; presumably `paddq'
// of XMM0--XMM3 into C0--C3 under `nil' guards) is not visible in this
// extract.
143 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
144 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
145 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
146 // contains zero, but needs clearing; in practice, we store the
147 // product directly rather than attempting to add. On completion,
148 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
// Z3P = `t' arm: write the top product piece straight into C3, so the
// stale C3 contents never need zeroing.
150 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
// NOTE(review): the two `mulcore' lines above and below are evidently
// the two arms of a conditional on \z3p (`.ifeqs "\z3p", "t"' ...
// `.else' ... `.endif'); the conditional directives themselves, and the
// `accum' call for the `t' arm, are not visible in this extract.
153 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
154 accum \c0, \c1, \c2, \c3
158 .macro propout d, pos, c, cc=nil
159 // Calculate an output word from C, and store it at POS in D;
160 // propagate carries out from C to CC in preparation for a rotation
161 // of the carry registers. D is an XMM register; the POS is either
162 // `lo' or `hi' according to whether the output word should be in
163 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
164 // completion, XMM3 is clobbered. If CC is `nil', then the
165 // contribution which would have been added to it is left in C.
166 pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
167 psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
168 pslldq xmm3, 2 // (t b, 0)
169 paddq \c, xmm3 // (c' + t b, c'')
// NOTE(review): the instructions which actually deposit the output
// word z = d mod B at POS in D (orig lines 170--174) are not visible in
// this extract.
175 psrlq \c, 32 // floor(c/B)
177 paddq \cc, \c // propagate up
181 .macro endprop d, pos, c, t
182 // On entry, C contains a carry register. On exit, the low 32 bits
183 // of the value represented in C are written at POS in D, and the
184 // remaining bits are left at the bottom of T.
// NOTE(review): T is used before being written here, so an initial
// copy (presumably `movdqa \t, \c', orig line 185) must precede this
// in the full source; likewise the store of the low 32 bits at POS in
// D (orig lines 190--194) is not visible in this extract.
186 psllq \t, 16 // (?, c'' b)
187 pslldq \c, 8 // (0, c')
188 paddq \t, \c // (?, c' + c'' b)
189 psrldq \t, 8 // c' + c'' b
195 psrldq \t, 4 // floor((c' + c'' b)/B)
198 .macro expand z, a, b, c=nil, d=nil
199 // On entry, A and C hold packed 128-bit values, and Z is zero. On
200 // exit, A:B and C:D together hold the same values in expanded
201 // form. If C is `nil', then only expand A to A:B.
// The punpck{l,h}wd steps interleave 16-bit halves with zeros from Z;
// the pshufd steps then gather primed and double-primed pieces into
// the layout described at the top of the file.
// NOTE(review): the `nil' guards around the C/D instructions are not
// visible in this extract.
202 movdqa \b, \a // (a_0, a_1, a_2, a_3)
204 movdqa \d, \c // (c_0, c_1, c_2, c_3)
206 punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
207 punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
209 punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
210 punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
212 pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
213 pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
215 pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
216 pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
220 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
221 // On entry, C0, C1, C2, C3 are carry registers representing a value
222 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
223 // C3, T, and U are clobbered; and the high bits of Y are stored in
224 // HI, if this is not `nil'.
226 // The first step is to eliminate the `double-prime' pieces -- i.e.,
227 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
228 // them into the 32-bit-aligned pieces above and below. But before
229 // we can do that, we must gather them together.
// NOTE(review): T and U are read before being written below, so
// initial copies of C0 and C1 (orig lines 230--231) must precede this
// in the full source.
232 punpcklqdq \t, \c2 // (y'_0, y'_2)
233 punpckhqdq \c0, \c2 // (y''_0, y''_2)
234 punpcklqdq \u, \c3 // (y'_1, y'_3)
235 punpckhqdq \c1, \c3 // (y''_1, y''_3)
237 // Now split the double-prime pieces. The high (up to) 48 bits will
238 // go up; the low 16 bits go down.
// NOTE(review): the copies of the double-prime pieces into C2/C3 and
// the first-stage shifts (orig lines 239--242) are not visible in this
// extract; the comments on the shifts below describe the combined
// effect.
243 psrlq \c0, 16 // high parts of (y''_0, y''_2)
244 psrlq \c1, 16 // high parts of (y''_1, y''_3)
245 psrlq \c2, 32 // low parts of (y''_0, y''_2)
246 psrlq \c3, 32 // low parts of (y''_1, y''_3)
// NOTE(review): orig lines 247--249 (further repositioning of the
// split pieces) are not visible in this extract.
250 pslldq \c1, 8 // high part of (0, y''_1)
252 paddq \t, \c2 // propagate down
254 paddq \t, \c1 // and up: (y_0, y_2)
255 paddq \u, \c0 // (y_1, y_3)
// NOTE(review): HI is read before being written here; its initial
// load (orig line 256) is not visible in this extract.
257 psrldq \hi, 8 // high part of (y''_3, 0)
260 // Finally extract the answer. This complicated dance is better than
261 // storing to memory and loading, because the piecemeal stores
262 // inhibit store forwarding.
263 movdqa \c3, \t // (y_0, y_1)
264 movdqa \lo, \t // (y^*_0, ?, ?, ?)
265 psrldq \t, 8 // (y_2, 0)
266 psrlq \c3, 32 // (floor(y_0/B), ?)
267 paddq \c3, \u // (y_1 + floor(y_0/B), ?)
268 movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
269 psrldq \u, 8 // (y_3, 0)
270 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
271 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
272 punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
273 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
274 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
// NOTE(review): orig lines 275--278 (handling of the final carry into
// HI) are not visible in this extract.
279 punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
281 psrlq \t, 32 // very high bits of y
283 punpcklqdq \hi, \u // carry up
285 punpckldq \lo, \c1 // y mod B^4
289 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
290 // hold the incoming carry registers c0, c1, and c2 representing a
293 // On exit, the carry registers, including XMM15, are updated to hold
294 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
295 // registers are preserved.
296 movd xmm0, [rdi + 0] // (a_0, 0)
297 movd xmm1, [rdi + 4] // (a_1, 0)
298 movd xmm2, [rdi + 8] // (a_2, 0)
299 movd xmm15, [rdi + 12] // (a_3, 0)
300 paddq xmm12, xmm0 // (c'_0 + a_0, c''_0)
301 paddq xmm13, xmm1 // (c'_1 + a_1, c''_1)
302 paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
305 ///--------------------------------------------------------------------------
306 /// Primitive multipliers and related utilities.
309 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
310 // expanded form. Store the low 128 bits of the represented carry to
311 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
312 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
313 // XMM14 are clobbered.
316 propout xmm0, lo, xmm12, xmm13
317 propout xmm1, lo, xmm13, xmm14
318 propout xmm0, hi, xmm14, nil
319 endprop xmm1, hi, xmm14, xmm12
328 // On entry, RDI points to the destination buffer; RAX and RBX point
329 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
330 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
331 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
333 // On exit, we write the low 128 bits of the sum C + U V + X Y to
334 // [RDI], and update the carry registers with the carry out. The
335 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
336 // registers are preserved.
342 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
343 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
344 propout xmm6, lo, xmm12, xmm13
346 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
347 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
348 propout xmm7, lo, xmm13, xmm14
350 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
351 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
352 propout xmm6, hi, xmm14, xmm15
354 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
355 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
356 propout xmm7, hi, xmm15, xmm12
366 // On entry, RDI points to the destination buffer, which also
367 // contains an addend A to accumulate; RAX and RBX point to the
368 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
369 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
370 // incoming carry registers c0, c1, and c2 representing a carry-in C;
371 // c3 is assumed to be zero.
373 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
383 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
384 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
385 propout xmm6, lo, xmm12, xmm13
387 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
388 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
389 propout xmm7, lo, xmm13, xmm14
391 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
392 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
393 propout xmm6, hi, xmm14, xmm15
395 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
396 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
397 propout xmm7, hi, xmm15, xmm12
407 // On entry, RDI points to the destination buffer; RBX points to a
408 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
410 // On exit, we write the low 128 bits of the product X Y to [RDI],
411 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
412 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
413 // general-purpose registers are preserved.
418 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
419 propout xmm6, lo, xmm12, xmm13
421 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
422 propout xmm7, lo, xmm13, xmm14
424 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
425 propout xmm6, hi, xmm14, xmm15
427 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
428 propout xmm7, hi, xmm15, xmm12
438 // On entry, RDI points to the destination buffer; RBX points to a
439 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
440 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
441 // c2, representing a carry-in C; c3 is assumed to be zero.
443 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
444 // and update the carry registers with the carry out. The registers
445 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
446 // general-purpose registers are preserved.
451 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
452 propout xmm6, lo, xmm12, xmm13
454 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
455 propout xmm7, lo, xmm13, xmm14
457 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
458 propout xmm6, hi, xmm14, xmm15
460 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
461 propout xmm7, hi, xmm15, xmm12
471 // On entry, RDI points to the destination buffer, which also
472 // contains an addend A to accumulate; RBX points to a packed operand
473 // X; and XMM10/XMM11 hold an expanded operand Y.
475 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
476 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
477 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
478 // general-purpose registers are preserved.
482 movd xmm12, [rdi + 0]
483 movd xmm13, [rdi + 4]
484 movd xmm14, [rdi + 8]
485 movd xmm15, [rdi + 12]
487 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
488 propout xmm6, lo, xmm12, xmm13
490 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
491 propout xmm7, lo, xmm13, xmm14
493 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
494 propout xmm6, hi, xmm14, xmm15
496 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
497 propout xmm7, hi, xmm15, xmm12
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
510 // XMM14 hold the incoming carry registers c0, c1, and c2,
511 // representing a carry-in C; c3 is assumed to be zero.
513 // On exit, we write the low 128 bits of the sum A + C + X Y to
514 // [RDI], and update the carry registers with the carry out. The
515 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
516 // general-purpose registers are preserved.
522 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
523 propout xmm6, lo, xmm12, xmm13
525 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
526 propout xmm7, lo, xmm13, xmm14
528 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
529 propout xmm6, hi, xmm14, xmm15
531 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
532 propout xmm7, hi, xmm15, xmm12
542 // On entry, RDI points to the destination buffer; RAX and RBX point
543 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
544 // the expanded operands V and M. The stack pointer must be 8 modulo 16
545 // (as usual for AMD64 ABIs).
547 // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
548 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
549 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
550 // XMM15 are clobbered; the general-purpose registers are preserved.
553 stalloc 48 + 8 // space for the carries
557 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
559 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
560 propout xmm7, lo, xmm12, xmm13
566 // On entry, RDI points to the destination buffer, which also
567 // contains an addend A to accumulate; RAX and RBX point to the
568 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
569 // expanded operands V and M. The stack pointer must be 8 modulo 16
570 // (as usual for AMD64 ABIs).
572 // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
573 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
574 // remaining carry in XMM12, XMM13, and XMM14. The registers
575 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
579 stalloc 48 + 8 // space for the carries
580 # define STKTMP(i) [rsp + i]
583 # define STKTMP(i) [rsp + i - 48 - 8] // use red zone
587 movd xmm12, [rdi + 0]
588 movd xmm13, [rdi + 4]
589 movd xmm14, [rdi + 8]
590 movd xmm15, [rdi + 12]
592 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
594 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
595 propout xmm7, lo, xmm12, xmm13
597 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
598 propout xmm6, lo, xmm13, xmm14
600 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
601 propout xmm7, hi, xmm14, xmm15
603 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
604 propout xmm6, hi, xmm15, xmm12
606 // Prepare W, and stash carries for later.
608 movdqa STKTMP( 0), xmm12
609 movdqa STKTMP(16), xmm13
610 movdqa STKTMP(32), xmm14
612 // Calculate Y = W M. We just about have enough spare registers to
614 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
616 // Start expanding W back into the main carry registers...
621 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
622 accum xmm4, xmm5, xmm6
624 punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
625 punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
627 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
634 mulcore xmm7, 3, xmm10, xmm11, xmm0
637 punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
638 punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
639 punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
640 punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
642 // That's lots of pieces. Now we have to assemble the answer.
643 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
647 expand xmm2, xmm10, xmm11
649 // Finish the calculation by adding the Montgomery product.
650 mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
651 propout xmm6, lo, xmm12, xmm13
653 mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
654 propout xmm7, lo, xmm13, xmm14
656 mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
657 propout xmm6, hi, xmm14, xmm15
659 mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
660 propout xmm7, hi, xmm15, xmm12
664 // And add on the carry we calculated earlier.
665 paddq xmm12, STKTMP( 0)
666 paddq xmm13, STKTMP(16)
667 paddq xmm14, STKTMP(32)
669 // And, with that, we're done.
681 // On entry, RDI points to the destination buffer holding a packed
682 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
683 // expanded operand M.
685 // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
686 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
687 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
688 // and XMM15 are clobbered; the general-purpose registers are
694 // Calculate Y = W M. Avoid the standard carry registers, because
695 // we're setting something else up there.
696 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
698 // Start expanding W back into the main carry registers...
703 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
704 accum xmm4, xmm5, xmm6
706 punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
707 punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
709 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
716 mulcore xmm7, 3, xmm8, xmm9, xmm0
719 punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
720 punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
721 punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
722 punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
724 // That's lots of pieces. Now we have to assemble the answer.
725 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
729 expand xmm2, xmm10, xmm11
731 // Finish the calculation by adding the Montgomery product.
732 mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
733 propout xmm6, lo, xmm12, xmm13
735 mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
736 propout xmm7, lo, xmm13, xmm14
738 mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
739 propout xmm6, hi, xmm14, xmm15
741 mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
742 propout xmm7, hi, xmm15, xmm12
746 // And, with that, we're done.
752 ///--------------------------------------------------------------------------
753 /// Bulk multipliers.
755 FUNC(mpx_umul4_amd64_sse2)
756 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
757 // const mpw *bv, const mpw *bvl);
759 // Establish the arguments and do initial setup.
762 // inner loop dv rdi rdi*
763 // inner loop av rbx* rbx*
764 // outer loop dv r10 rcx
765 // outer loop bv rcx r9
813 // Prepare for the first iteration.
815 movdqu xmm10, [BV] // bv[0]
819 expand xmm0, xmm10, xmm11
823 cmp rbx, AVL // all done?
827 // Continue with the first iteration.
831 cmp rbx, AVL // all done?
834 // Write out the leftover carry. There can be no tail here.
836 cmp BV, BVL // more passes to do?
840 // Set up for the next pass.
841 1: movdqu xmm10, [BV] // bv[i]
842 mov rdi, DV // -> dv[i]
844 expand xmm0, xmm10, xmm11
845 mov rbx, AV // -> av[0]
851 cmp rbx, AVL // done yet?
862 // Finish off this pass. There was no tail on the previous pass, and
863 // there can be none on this pass.
904 FUNC(mpxmont_mul4_amd64_sse2)
905 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
906 // const mpw *nv, size_t n, const mpw *mi);
908 // Establish the arguments and do initial setup.
911 // inner loop dv rdi rdi*
912 // inner loop av rax rax
913 // inner loop nv rbx* rbx*
915 // outer loop dv r10 rcx
916 // outer loop bv rdx r8
974 // Establish the expanded operands.
976 movdqu xmm8, [BV] // bv[0]
977 movdqu xmm10, [MI] // mi
978 expand xmm0, xmm8, xmm9, xmm10, xmm11
980 // Set up the outer loop state and prepare for the first iteration.
981 mov rax, AV // -> U = av[0]
982 mov rbx, NV // -> X = nv[0]
983 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
984 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
991 cmp rax, AVL // done already?
995 // Complete the first inner loop.
1000 cmp rax, AVL // done yet?
1003 // Still have carries left to propagate.
1005 movd [rdi + 16], xmm12
1008 // Embark on the next iteration. (There must be one. If n = 1, then
1009 // we would have bailed above, to label 8. Similarly, the subsequent
1010 // iterations can fall into the inner loop immediately.)
1012 movdqu xmm8, [BV] // bv[i]
1013 movdqu xmm10, [MI] // mi
1014 mov rdi, DV // -> Z = dv[i]
1015 mov rax, AV // -> U = av[0]
1016 mov rbx, NV // -> X = nv[0]
1017 expand xmm0, xmm8, xmm9, xmm10, xmm11
1026 // Complete the next inner loop.
1034 // Still have carries left to propagate, and they overlap the
1035 // previous iteration's final tail, so read that in and add it.
1039 movd [rdi + 16], xmm12
1041 // Back again, maybe.
1074 // First iteration was short. Write out the carries and we're done.
1075 // (This could be folded into the main loop structure, but that would
1076 // penalize small numbers more.)
1078 movd [rdi + 16], xmm12
1098 FUNC(mpxmont_redc4_amd64_sse2)
1099 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1100 // size_t n, const mpw *mi);
1102 // Establish the arguments and do initial setup.
1105 // inner loop dv rdi rdi*
1107 // blocks-of-4 dv limit rsi rdx
1108 // inner loop nv rbx* rbx*
1110 // outer loop dv r10 rcx
1111 // outer loop dv limit r11 r11
1171 // Establish the expanded operands and the blocks-of-4 dv limit.
1173 mov DVL, DVL4 // -> dv[n] = dv limit
1174 sub DVL4, DV // length of dv in bytes
1175 movdqu xmm8, [MI] // mi
1176 and DVL4, ~15 // mask off the tail end
1177 expand xmm0, xmm8, xmm9
1178 add DVL4, DV // find limit
1180 // Set up the outer loop state and prepare for the first iteration.
1181 mov rbx, NV // -> X = nv[0]
1182 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1183 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1188 cmp rbx, NVL // done already?
1192 // Complete the first inner loop.
1196 cmp rbx, NVL // done yet?
1199 // Still have carries left to propagate.
1211 // Continue carry propagation until the end of the buffer.
1213 mov C, 0 // preserves flags
1222 // Deal with the tail end.
1224 mov C, 0 // preserves flags
1230 // All done for this iteration. Start the next. (This must have at
1231 // least one follow-on iteration, or we'd not have started this outer
1233 8: mov rdi, DV // -> Z = dv[i]
1234 mov rbx, NV // -> X = nv[0]
1235 cmp rdi, DVLO // all done yet?
1284 ///--------------------------------------------------------------------------
1285 /// Testing and performance measurement.
1296 # define ARG6 STKARG(0)
1297 # define ARG7 STKARG(1)
1298 # define ARG8 STKARG(2)
1299 # define STKARG_OFFSET 16
1306 # define ARG4 STKARG(0)
1307 # define ARG5 STKARG(1)
1308 # define ARG6 STKARG(2)
1309 # define ARG7 STKARG(3)
1310 # define ARG8 STKARG(4)
1311 # define STKARG_OFFSET 40
1313 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1316 // dmul smul mmul mont dmul smul mmul mont
1319 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1320 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1321 // y r10 -- -- rdx rdx -- -- r8 r8
1322 // u r11 rdx -- rcx -- r8 -- r9 --
1323 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1324 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1325 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1326 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1327 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1333 mov [\v + 8*\n - 8], rax
1340 sub rax, [\v + 8*\n - 8]
1341 mov [\v + 8*\n - 8], rax
1345 .macro testprologue mode
1349 .ifeqs "\mode", "dmul"
1358 .ifeqs "\mode", "smul"
1363 .ifeqs "\mode", "mmul"
1374 .ifeqs "\mode", "mont"
1397 .ifeqs "\mode", "dmul"
1409 .ifeqs "\mode", "smul"
1417 .ifeqs "\mode", "mmul"
1430 .ifeqs "\mode", "mont"
1443 .ifeqs "\mode", "dmul"
1444 expand xmm0, xmm8, xmm9, xmm10, xmm11
1446 .ifeqs "\mode", "smul"
1447 expand xmm0, xmm10, xmm11
1449 .ifeqs "\mode", "mmul"
1450 expand xmm0, xmm8, xmm9, xmm10, xmm11
1452 .ifeqs "\mode", "mont"
1453 expand xmm0, xmm8, xmm9
1477 movdqu xmm12, [rcx + 0] // (c'_0, c''_0)
1478 movdqu xmm13, [rcx + 16] // (c'_1, c''_1)
1479 movdqu xmm14, [rcx + 32] // (c'_2, c''_2)
1482 .macro testtop u=nil
1497 movdqu [rcx + 0], xmm12
1498 movdqu [rcx + 16], xmm13
1499 movdqu [rcx + 32], xmm14
1547 movdqu [r10 + 0], xmm10
1548 movdqu [r10 + 16], xmm11
1558 movdqu [r10 + 0], xmm10
1559 movdqu [r10 + 16], xmm11
1569 movdqu [r10 + 0], xmm10
1570 movdqu [r10 + 16], xmm11
1577 ///----- That's all, folks --------------------------------------------------