From 4b30aca5fd259dec2eef4869364866c496774608 Mon Sep 17 00:00:00 2001 Message-Id: <4b30aca5fd259dec2eef4869364866c496774608.1716389896.git.mdw@distorted.org.uk> From: Mark Wooding Date: Sat, 5 Nov 2016 21:28:22 +0000 Subject: [PATCH] math/mpx-mul4-x86-sse2.S: Give `squash' an explicit destination argument. Organization: Straylight/Edgeware From: Mark Wooding Also, rearrange the arguments so the destination(s) are at the start. --- math/mpx-mul4-x86-sse2.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 471b3f1f..5d8c5b97 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -214,11 +214,11 @@ .endif .endm -.macro squash c0, c1, c2, c3, h, t, u +.macro squash lo, hi, c0, c1, c2, c3, t, u // On entry, C0, C1, C2, C3 are carry registers representing a value - // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2, + // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in - // H, if this is not `nil'. + // HI, if this is not `nil'. // The first step is to eliminate the `double-prime' pieces -- i.e., // the ones offset by 16 bytes from a 32-bit boundary -- by carrying @@ -241,8 +241,8 @@ psrlq \c1, 16 // high parts of (y''_1, y''_3) psrlq \c2, 32 // low parts of (y''_0, y''_2) psrlq \c3, 32 // low parts of (y''_1, y''_3) - .ifnes "\h", "nil" - movdqa \h, \c1 + .ifnes "\hi", "nil" + movdqa \hi, \c1 .endif pslldq \c1, 8 // high part of (0, y''_1) @@ -250,15 +250,15 @@ paddq \u, \c3 paddq \t, \c1 // and up: (y_0, y_2) paddq \u, \c0 // (y_1, y_3) - .ifnes "\h", "nil" - psrldq \h, 8 // high part of (y''_3, 0) + .ifnes "\hi", "nil" + psrldq \hi, 8 // high part of (y''_3, 0) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. movdqa \c3, \t // (y_0, y_1) - movdqa \c0, \t // (y^*_0, ?, ?, ?) + movdqa \lo, \t // (y^*_0, ?, ?, ?) psrldq \t, 8 // (y_2, 0) psrlq \c3, 32 // (floor(y_0/B), ?) paddq \c3, \u // (y_1 + floor(y_0/B), ?) @@ -266,20 +266,20 @@ psrldq \u, 8 // (y_3, 0) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) - punpckldq \c0, \c3 // (y^*_0, y^*_2, ?, ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) - .ifnes "\h", "nil" + .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) - .ifnes "\h", "nil" + .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y - paddq \h, \t - punpcklqdq \h, \u // carry up + paddq \hi, \t + punpcklqdq \hi, \u // carry up .endif - punpckldq \c0, \c1 // y mod B^4 + punpckldq \lo, \c1 // y mod B^4 .endm .macro carryadd @@ -576,7 +576,7 @@ INTFUNC(mmla4) accum xmm7, nil, nil, nil // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 // Expand it. pxor xmm2, xmm2 @@ -639,7 +639,7 @@ INTFUNC(mont4) accum xmm7, nil, nil, nil // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 // Expand it. pxor xmm2, xmm2 -- [mdw]