///--------------------------------------------------------------------------
/// Macro definitions.
-.macro mulcore r, s, d0, d1, d2, d3
+.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
.endif
.endm
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p=nil
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
// rather than clearing C3 and then adding to it, C3 is handed to
// `mulcore' as the destination for the top piece of the product, and
// only C0, C1, C2 are accumulated. If Z3P is omitted, or is anything
// other than `t', all four pieces are added into C0--C3.
//
// XMM0, XMM1 and XMM2 are passed to `mulcore' as destinations in both
// cases, so their previous contents are lost; XMM3 is additionally used
// as a destination when Z3P is not `t'.
.ifeqs "\z3p", "t"
// C3 is the fourth destination here, so the product piece replaces
// whatever C3 held before.
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
// C3 must not be accumulated into: it has just been written above.
- accum \c0, \c1, \c2, nil
+ accum \c0, \c1, \c2
.else
// General case: product pieces go to scratch registers XMM0--XMM3 and
// are then added into the corresponding carry registers.
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
-.macro propout d, c, cc
+.macro propout d, c, cc=nil
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
-.macro expand a, b, c, d, z
+.macro expand z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
.endif
.endm
-.macro squash lo, hi, c0, c1, c2, c3, t, u
+.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
// On entry, C0, C1, C2, C3 are carry registers representing a value
// Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
carryadd
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
carryadd
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
- accum xmm5, xmm6, xmm7, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
- accum xmm6, xmm7, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0, nil, nil, nil
- accum xmm7, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
- expand xmm4, xmm1, nil, nil, xmm2
+ expand xmm2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
- accum xmm5, xmm6, xmm7, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
- accum xmm6, xmm7, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0, nil, nil, nil
- accum xmm7, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
- expand xmm4, xmm1, nil, nil, xmm2
+ expand xmm2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
- expand xmm0, xmm1, xmm2, xmm3, xmm7
+ expand xmm7, xmm0, xmm1, xmm2, xmm3
movdqa [esp + 12], xmm0 // bv[0] expanded low
movdqa [esp + 28], xmm1 // bv[0] expanded high
movdqa [esp + 44], xmm2 // mi expanded low
mov ebx, [ebp + 32] // -> X = nv[0]
lea esi, [esp + 44] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
movdqa [esp + 12], xmm0 // bv[i] expanded low
movdqa [esp + 28], xmm1 // bv[i] expanded high
call mmla4
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
-.macro testexpand v, y
+.macro testexpand v=nil, y=nil
// Optionally fetch up to two packed 128-bit operands through pointers
// and leave them, in expanded form, in fixed slots of the stack frame.
//
// V: operand from which the address of the first value is loaded, or
// `nil' to skip it. When given, ECX is set to that address and
// the expanded value is stored at [ESP + 12] (low half) and
// [ESP + 28] (high half); XMM0 and XMM1 are overwritten.
// Y: likewise for the second value, using EDX, XMM2 and XMM3, with
// the result stored at [ESP + 44] and [ESP + 60].
//
// XMM7 is always zeroed, whether or not V and Y are given.
//
// NOTE(review): the stores use `movdqa', which needs 16-byte-aligned
// addresses, so ESP + 12 is presumably 16-byte aligned at every use of
// this macro -- confirm against the code which sets up the frame.
// `expand' needs a zero register; XMM7 serves for both expansions.
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
// Unaligned load: no alignment is assumed for the caller's data.
movdqu xmm0, [ecx]
// Expand XMM0 into the pair XMM0:XMM1.
- expand xmm0, xmm1, nil, nil, xmm7
+ expand xmm7, xmm0, xmm1
movdqa [esp + 12], xmm0
movdqa [esp + 28], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
// Unaligned load, as for V.
movdqu xmm2, [edx]
// Expand XMM2 into the pair XMM2:XMM3.
- expand xmm2, xmm3, nil, nil, xmm7
+ expand xmm7, xmm2, xmm3
movdqa [esp + 44], xmm2
movdqa [esp + 60], xmm3
.endif
.endm
-.macro testtop u, x, mode
+.macro testtop u=nil, x=nil, mode=nil
.p2align 4
0:
.ifnes "\u", "nil"