X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/blobdiff_plain/2aaa07f8c724ca7230ea6c23e3ab8f337fd91999..71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 0b57dbfd..5d5714da 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -93,7 +93,7 @@ ///-------------------------------------------------------------------------- /// Macro definitions. -.macro mulcore r, s, d0, d1, d2, d3 +.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. movd \d0, \r // (r_i, 0, 0, 0) @@ -133,7 +133,10 @@ pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0) .endm -.macro accum c0, c1, c2, c3 +.macro accum c0, c1=nil, c2=nil, c3=nil + // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding + // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip + // updating that register. paddq \c0, xmm0 .ifnes "\c1", "nil" paddq \c1, xmm1 @@ -146,7 +149,7 @@ .endif .endm -.macro mulacc r, s, c0, c1, c2, c3, z3p +.macro mulacc r, s, c0, c1, c2, c3, z3p=nil // Load a word r_i from R, multiply by the expanded operand [S], // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' // then C3 notionally contains zero, but needs clearing; in practice, @@ -155,14 +158,14 @@ // is not `t'. .ifeqs "\z3p", "t" mulcore \r, \s, xmm0, xmm1, xmm2, \c3 - accum \c0, \c1, \c2, nil + accum \c0, \c1, \c2 .else mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 accum \c0, \c1, \c2, \c3 .endif .endm -.macro propout d, c, cc +.macro propout d, c, cc=nil // Calculate an output word from C, and store it in D; propagate // carries out from C to CC in preparation for a rotation of the // carry registers. On completion, XMM3 is clobbered. If CC is @@ -192,7 +195,7 @@ psrldq \t, 4 // floor((c' + c'' b)/B) .endm -.macro expand a, b, c, d, z +.macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. @@ -214,7 +217,7 @@ .endif .endm -.macro squash lo, hi, c0, c1, c2, c3, t, u +.macro squash c0, c1, c2, c3, t, u, lo, hi=nil // On entry, C0, C1, C2, C3 are carry registers representing a value // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in @@ -331,19 +334,19 @@ INTFUNC(dmul4) endprologue mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret @@ -366,20 +369,20 @@ INTFUNC(dmla4) carryadd - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret @@ -456,7 +459,7 @@ INTFUNC(mla4zc) movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -487,7 +490,7 @@ INTFUNC(mla4) carryadd - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -547,7 +550,7 @@ INTFUNC(mmla4) movd xmm5, [edi + 4] movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t @@ -566,21 +569,21 @@ INTFUNC(mmla4) // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -591,7 +594,7 @@ INTFUNC(mmla4) movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -629,21 +632,21 @@ INTFUNC(mont4) // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -654,7 +657,7 @@ INTFUNC(mont4) movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -706,7 +709,7 @@ FUNC(mpx_umul4_x86_sse2) movdqu xmm0, [esi] // bv[0] mov edi, [ebp + 20] // -> dv[0] mov ecx, edi // outer loop dv cursor - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] mov eax, [ebp + 28] // -> av[m] = av limit mov edx, esp // -> expanded Y = bv[0] @@ -738,7 +741,7 @@ FUNC(mpx_umul4_x86_sse2) 1: movdqu xmm0, [esi] // bv[i] mov edi, ecx // -> dv[i] pxor xmm7, xmm7 - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] movdqa [esp + 0], xmm0 // bv[i] expanded low movdqa [esp + 16], xmm1 // bv[i] expanded high @@ -814,7 +817,7 @@ FUNC(mpxmont_mul4_x86_sse2) mov edx, [ebp + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi - expand xmm0, xmm1, xmm2, xmm3, xmm7 + expand xmm7, xmm0, xmm1, xmm2, xmm3 movdqa [esp + 12], xmm0 // bv[0] expanded low movdqa [esp + 28], xmm1 // bv[0] expanded high movdqa [esp + 44], xmm2 // mi expanded low @@ -871,7 +874,7 @@ FUNC(mpxmont_mul4_x86_sse2) mov ebx, [ebp + 32] // -> X = nv[0] lea esi, [esp + 44] // -> expanded M = mi mov eax, [ebp + 24] // -> U = av[0] - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 movdqa [esp + 12], xmm0 // bv[i] expanded low movdqa [esp + 28], xmm1 // bv[i] expanded high call mmla4 @@ -954,7 +957,7 @@ FUNC(mpxmont_redc4_x86_sse2) mov edx, [ebp + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 add eax, edi // find limit movdqa [esp + 12], xmm0 // mi expanded low movdqa [esp + 28], xmm1 // mi expanded high @@ -1097,25 +1100,25 @@ ENDFUNC movdqu xmm6, [ecx + 32] // (c'_2, c''_2) .endm -.macro testexpand v, y +.macro testexpand v=nil, y=nil pxor xmm7, xmm7 .ifnes "\v", "nil" mov ecx, \v movdqu xmm0, [ecx] - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 movdqa [esp + 12], xmm0 movdqa [esp + 28], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] - expand xmm2, xmm3, nil, nil, xmm7 + expand xmm7, xmm2, xmm3 movdqa [esp + 44], xmm2 movdqa [esp + 60], xmm3 .endif .endm -.macro testtop u, x, mode +.macro testtop u=nil, x=nil, mode=nil .p2align 4 0: .ifnes "\u", "nil"