/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
///--------------------------------------------------------------------------
/// Macro definitions.
-.macro mulcore r, s, d0, d1, d2, d3
+.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
- pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
.else
- pmuludqd \d2, [\s + 16]
+ pmuludq \d2, [\s + 16]
.endif
.endif
- pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
.endif
.endm
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p=nil
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
- accum \c0, \c1, \c2, nil
+ accum \c0, \c1, \c2
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
-.macro propout d, c, cc
+.macro propout d, c, cc=nil
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
-.macro expand a, b, c, d, z
+.macro expand z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
.endif
.endm
-.macro squash c0, c1, c2, c3, h, t, u
+.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // HI, if this is not `nil'.
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\hi", "nil"
+ movdqa \hi, \c1
.endif
pslldq \c1, 8 // high part of (0, y''_1)
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h, 8 // high part of (y''_3, 0)
+ .ifnes "\hi", "nil"
+ psrldq \hi, 8 // high part of (y''_3, 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
- movdqa \c0, \t // (y^*_0, ?, ?, ?)
+ movdqa \lo, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- pslldq \c0, 12 // (0, 0, 0, y^*_0)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
	psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
	paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
- pslldq \c1, 12 // (0, 0, 0, y^*_1)
- psrldq \c0, 12 // (y^*_0, 0, 0, 0)
- movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
	psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
- pslldq \c2, 12 // (0, 0, 0, y^*_2)
- psrldq \c1, 8 // (0, y^*_1, 0, 0)
- psrldq \c2, 4 // (0, 0, y^*_2, 0)
- .ifnes "\h", "nil"
+ .ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- pslldq \c3, 12 // (0, 0, 0, y^*_3)
- por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
- por \c2, \c3 // (0, 0, y^*_2, y^*_3)
- por \c0, \c2 // y mod B^4
- .ifnes "\h", "nil"
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
- paddq \h, \t
- punpcklqdq \h, \u // carry up
+ paddq \hi, \t
+ punpcklqdq \hi, \u // carry up
.endif
+ punpckldq \lo, \c1 // y mod B^4
.endm
.macro carryadd
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
carryadd
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
carryadd
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
- accum xmm5, xmm6, xmm7, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
- accum xmm6, xmm7, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0, nil, nil, nil
- accum xmm7, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
- expand xmm4, xmm1, nil, nil, xmm2
+ expand xmm2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
- accum xmm5, xmm6, xmm7, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
- accum xmm6, xmm7, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0, nil, nil, nil
- accum xmm7, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
- expand xmm4, xmm1, nil, nil, xmm2
+ expand xmm2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
- expand xmm0, xmm1, xmm2, xmm3, xmm7
+ expand xmm7, xmm0, xmm1, xmm2, xmm3
movdqa [esp + 12], xmm0 // bv[0] expanded low
movdqa [esp + 28], xmm1 // bv[0] expanded high
movdqa [esp + 44], xmm2 // mi expanded low
mov ebx, [ebp + 32] // -> X = nv[0]
lea esi, [esp + 44] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
movdqa [esp + 12], xmm0 // bv[i] expanded low
movdqa [esp + 28], xmm1 // bv[i] expanded high
call mmla4
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
-.macro testexpand v, y
+.macro testexpand v=nil, y=nil
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
movdqa [esp + 12], xmm0
movdqa [esp + 28], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
- expand xmm2, xmm3, nil, nil, xmm7
+ expand xmm7, xmm2, xmm3
movdqa [esp + 44], xmm2
movdqa [esp + 60], xmm3
.endif
.endm
-.macro testtop u, x, mode
+.macro testtop u=nil, x=nil, mode=nil
.p2align 4
0:
.ifnes "\u", "nil"