I'd muddled up my macro languages and misremembered that GNU as handles
omitted macro arguments sensibly. So use default argument values
throughout. Some of the macro arguments have been reordered to make
defaulting work better. No functional change.
///--------------------------------------------------------------------------
/// Macro definitions.
///--------------------------------------------------------------------------
-.macro mulcore r, s, d0, d1, d2, d3
+.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0, 0, 0)
	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+	// carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+	// updating that register.
	paddq	\c0, xmm0
	.ifnes "\c1", "nil"
	paddq	\c1, xmm1
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// is not `t'.
	.ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
-	accum	\c0, \c1, \c2, nil
+	accum	\c0, \c1, \c2
	.else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
	.endif
.endm
+.macro propout d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers. On completion, XMM3 is clobbered. If CC is
	psrldq	\t, 4			// floor((c' + c'' b)/B)
.endm
-.macro expand a, b, c, d, z
+.macro expand z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero. On
	// exit, A:B and C:D together hold the same values in expanded
	// form. If C is `nil', then only expand A to A:B.
-.macro squash lo, hi, c0, c1, c2, c3, t, u
+.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	endprologue
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
-	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6
	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
-	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7
	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4
	ret
-	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
-	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6
	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
-	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7
	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4
	ret
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]
-	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
5:	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7
-	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2, nil
-	accum	xmm5, xmm6, xmm7, nil
+	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
+	accum	xmm5, xmm6, xmm7
-	mulcore	[edi + 8], esi, xmm0, xmm1, nil, nil
-	accum	xmm6, xmm7, nil, nil
+	mulcore	[edi + 8], esi, xmm0, xmm1
+	accum	xmm6, xmm7
-	mulcore	[edi + 12], esi, xmm0, nil, nil, nil
-	accum	xmm7, nil, nil, nil
+	mulcore	[edi + 12], esi, xmm0
+	accum	xmm7
	// That's lots of pieces. Now we have to assemble the answer.
-	squash	xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
	// Expand it.
	pxor	xmm2, xmm2
-	expand	xmm4, xmm1, nil, nil, xmm2
+	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1
	movd	xmm7, [edi + 12]
	// Finish the calculation by adding the Montgomery product.
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7
-	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2, nil
-	accum	xmm5, xmm6, xmm7, nil
+	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
+	accum	xmm5, xmm6, xmm7
-	mulcore	[edi + 8], esi, xmm0, xmm1, nil, nil
-	accum	xmm6, xmm7, nil, nil
+	mulcore	[edi + 8], esi, xmm0, xmm1
+	accum	xmm6, xmm7
-	mulcore	[edi + 12], esi, xmm0, nil, nil, nil
-	accum	xmm7, nil, nil, nil
+	mulcore	[edi + 12], esi, xmm0
+	accum	xmm7
	// That's lots of pieces. Now we have to assemble the answer.
-	squash	xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
	// Expand it.
	pxor	xmm2, xmm2
-	expand	xmm4, xmm1, nil, nil, xmm2
+	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1
	movd	xmm7, [edi + 12]
	// Finish the calculation by adding the Montgomery product.
-	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
-	expand	xmm0, xmm1, xmm2, xmm3, xmm7
+	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[esp + 12], xmm0	// bv[0] expanded low
	movdqa	[esp + 28], xmm1	// bv[0] expanded high
	movdqa	[esp + 44], xmm2	// mi expanded low
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 44]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0	// bv[i] expanded low
	movdqa	[esp + 28], xmm1	// bv[i] expanded high
	call	mmla4
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	movdqu	xmm6, [ecx + 32]	// (c'_2, c''_2)
.endm
+.macro testexpand v=nil, y=nil
	pxor	xmm7, xmm7
	.ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 12], xmm0
	movdqa	[esp + 28], xmm1
	.endif
	.ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
-	expand	xmm2, xmm3, nil, nil, xmm7
+	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 44], xmm2
	movdqa	[esp + 60], xmm3
	.endif
.endm
-.macro testtop u, x, mode
+.macro testtop u=nil, x=nil, mode=nil
	.p2align 4
0:
	.ifnes "\u", "nil"