From: Mark Wooding Date: Thu, 29 Dec 2016 14:36:12 +0000 (+0000) Subject: math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction. X-Git-Tag: 2.3.0~8 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/commitdiff_plain/2aaa07f8c724ca7230ea6c23e3ab8f337fd91999 math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction. Not sure why GNU as let me get away with that. --- diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 5d8c5b97..0b57dbfd 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -64,7 +64,7 @@ /// 0 v'_0 v'_1 v''_0 v''_1 /// 16 v'_2 v'_3 v''_2 v''_3 /// -/// A `pmuludqd' instruction ignores the odd positions in its operands; thus, +/// A `pmuludq' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can /// multiply such a vector by a full 32-bit scalar to produce two 48-bit @@ -81,7 +81,7 @@ /// the register c0, for example, holds c'_0 (low half) and c''_0 (high /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The -/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all +/// `pmuludq' instruction acting on a scalar operand (broadcast across all /// lanes of its vector) and an operand in the expanded form above produces a /// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry @@ -118,19 +118,19 @@ psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) .endif .ifnes "\d1", "nil" - pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1) + pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) .endif .ifnes "\d3", "nil" - pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3) + pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2) + pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2) .else - pmuludqd \d2, [\s + 16] + pmuludq \d2, [\s + 16] .endif .endif - pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0) + pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0) .endm .macro accum c0, c1, c2, c3