From: Mark Wooding Date: Sat, 5 Nov 2016 21:28:22 +0000 (+0000) Subject: math/mpx-mul4-x86-sse2.S: Optimize `squash'. X-Git-Tag: 2.3.0~10 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/commitdiff_plain/bd685577211d2706258095a8d3cbb6dd6efd937e math/mpx-mul4-x86-sse2.S: Optimize `squash'. We can use `punpckldq' to assemble the 32-bit pieces, rather than a lot of shifting to clear bits and then `por'. --- diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 0e87ff58..471b3f1f 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -262,32 +262,24 @@ psrldq \t, 8 // (y_2, 0) psrlq \c3, 32 // (floor(y_0/B), ?) paddq \c3, \u // (y_1 + floor(y_0/B), ?) - pslldq \c0, 12 // (0, 0, 0, y^*_0) movdqa \c1, \c3 // (y^*_1, ?, ?, ?) psrldq \u, 8 // (y_3, 0) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) - pslldq \c1, 12 // (0, 0, 0, y^*_1) - psrldq \c0, 12 // (y^*_0, 0, 0, 0) - movdqa \c2, \c3 // (y^*_2, ?, ?, ?) + punpckldq \c0, \c3 // (y^*_0, y^*_2, ?, ?) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) - pslldq \c2, 12 // (0, 0, 0, y^*_2) - psrldq \c1, 8 // (0, y^*_1, 0, 0) - psrldq \c2, 4 // (0, 0, y^*_2, 0) .ifnes "\h", "nil" movdqa \t, \c3 pxor \u, \u .endif - pslldq \c3, 12 // (0, 0, 0, y^*_3) - por \c0, \c1 // (y^*_0, y^*_1, 0, 0) - por \c2, \c3 // (0, 0, y^*_2, y^*_3) - por \c0, \c2 // y mod B^4 + punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) .ifnes "\h", "nil" psrlq \t, 32 // very high bits of y paddq \h, \t punpcklqdq \h, \u // carry up .endif + punpckldq \c0, \c1 // y mod B^4 .endm .macro carryadd