From 71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1 Mon Sep 17 00:00:00 2001 Message-Id: <71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1.1716365649.git.mdw@distorted.org.uk> From: Mark Wooding Date: Thu, 29 Dec 2016 15:24:26 +0000 Subject: [PATCH] math/mpx-mul4-x86-sse2.S: Use default arguments for macros. Organization: Straylight/Edgeware From: Mark Wooding I'd muddled up my macro languages and misremembered that GNU as handles omitted macro arguments sensibly. So use default argument values throughout. Some of the macro arguments have been reordered to make defaulting work better. No functional change. --- math/mpx-mul4-x86-sse2.S | 95 +++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 0b57dbfd..5d5714da 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -93,7 +93,7 @@ ///-------------------------------------------------------------------------- /// Macro definitions. -.macro mulcore r, s, d0, d1, d2, d3 +.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. movd \d0, \r // (r_i, 0, 0, 0) @@ -133,7 +133,10 @@ pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0) .endm -.macro accum c0, c1, c2, c3 +.macro accum c0, c1=nil, c2=nil, c3=nil + // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding + // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip + // updating that register. paddq \c0, xmm0 .ifnes "\c1", "nil" paddq \c1, xmm1 @@ -146,7 +149,7 @@ .endif .endm -.macro mulacc r, s, c0, c1, c2, c3, z3p +.macro mulacc r, s, c0, c1, c2, c3, z3p=nil // Load a word r_i from R, multiply by the expanded operand [S], // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' // then C3 notionally contains zero, but needs clearing; in practice, @@ -155,14 +158,14 @@ // is not `t'. .ifeqs "\z3p", "t" mulcore \r, \s, xmm0, xmm1, xmm2, \c3 - accum \c0, \c1, \c2, nil + accum \c0, \c1, \c2 .else mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 accum \c0, \c1, \c2, \c3 .endif .endm -.macro propout d, c, cc +.macro propout d, c, cc=nil // Calculate an output word from C, and store it in D; propagate // carries out from C to CC in preparation for a rotation of the // carry registers. On completion, XMM3 is clobbered. If CC is @@ -192,7 +195,7 @@ psrldq \t, 4 // floor((c' + c'' b)/B) .endm -.macro expand a, b, c, d, z +.macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. @@ -214,7 +217,7 @@ .endif .endm -.macro squash lo, hi, c0, c1, c2, c3, t, u +.macro squash c0, c1, c2, c3, t, u, lo, hi=nil // On entry, C0, C1, C2, C3 are carry registers representing a value // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in @@ -331,19 +334,19 @@ INTFUNC(dmul4) endprologue mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret @@ -366,20 +369,20 @@ INTFUNC(dmla4) carryadd - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret @@ -456,7 +459,7 @@ INTFUNC(mla4zc) movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -487,7 +490,7 @@ INTFUNC(mla4) carryadd - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -547,7 +550,7 @@ INTFUNC(mmla4) movd xmm5, [edi + 4] movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t @@ -566,21 +569,21 @@ INTFUNC(mmla4) // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -591,7 +594,7 @@ INTFUNC(mmla4) movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -629,21 +632,21 @@ INTFUNC(mont4) // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -654,7 +657,7 @@ INTFUNC(mont4) movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -706,7 +709,7 @@ FUNC(mpx_umul4_x86_sse2) movdqu xmm0, [esi] // bv[0] mov edi, [ebp + 20] // -> dv[0] mov ecx, edi // outer loop dv cursor - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] mov eax, [ebp + 28] // -> av[m] = av limit mov edx, esp // -> expanded Y = bv[0] @@ -738,7 +741,7 @@ FUNC(mpx_umul4_x86_sse2) 1: movdqu xmm0, [esi] // bv[i] mov edi, ecx // -> dv[i] pxor xmm7, xmm7 - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] movdqa [esp + 0], xmm0 // bv[i] expanded low movdqa [esp + 16], xmm1 // bv[i] expanded high @@ -814,7 +817,7 @@ FUNC(mpxmont_mul4_x86_sse2) mov edx, [ebp + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi - expand xmm0, xmm1, xmm2, xmm3, xmm7 + expand xmm7, xmm0, xmm1, xmm2, xmm3 movdqa [esp + 12], xmm0 // bv[0] expanded low movdqa [esp + 28], xmm1 // bv[0] expanded high movdqa [esp + 44], xmm2 // mi expanded low @@ -871,7 +874,7 @@ FUNC(mpxmont_mul4_x86_sse2) mov ebx, [ebp + 32] // -> X = nv[0] lea esi, [esp + 44] // -> expanded M = mi mov eax, [ebp + 24] // -> U = av[0] - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 movdqa [esp + 12], xmm0 // bv[i] expanded low movdqa [esp + 28], xmm1 // bv[i] expanded high call mmla4 @@ -954,7 +957,7 @@ FUNC(mpxmont_redc4_x86_sse2) mov edx, [ebp + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 add eax, edi // find limit movdqa [esp + 12], xmm0 // mi expanded low movdqa [esp + 28], xmm1 // mi expanded high @@ -1097,25 +1100,25 @@ ENDFUNC movdqu xmm6, [ecx + 32] // (c'_2, c''_2) .endm -.macro testexpand v, y +.macro testexpand v=nil, y=nil pxor xmm7, xmm7 .ifnes "\v", "nil" mov ecx, \v movdqu xmm0, [ecx] - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 movdqa [esp + 12], xmm0 movdqa [esp + 28], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] - expand xmm2, xmm3, nil, nil, xmm7 + expand xmm7, xmm2, xmm3 movdqa [esp + 44], xmm2 movdqa [esp + 60], xmm3 .endif .endm -.macro testtop u, x, mode +.macro testtop u=nil, x=nil, mode=nil .p2align 4 0: .ifnes "\u", "nil" -- [mdw]