From: Mark Wooding <mdw@distorted.org.uk>
Date: Thu, 29 Dec 2016 14:36:12 +0000 (+0000)
Subject: math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction.
X-Git-Tag: 2.3.0~8
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/commitdiff_plain/2aaa07f8c724ca7230ea6c23e3ab8f337fd91999

math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction.

Not sure why GNU as let me get away with that.
---

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 5d8c5b97..0b57dbfd 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -64,7 +64,7 @@
 ///	   0	v'_0	v'_1	v''_0	v''_1
 ///	  16	v'_2	v'_3	v''_2	v''_3
 ///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@ -81,7 +81,7 @@
 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
 /// lanes of its vector) and an operand in the expanded form above produces a
 /// result which can be added directly to the appropriate carry register.
 /// Following a pass of four multiplications, we perform some limited carry
@@ -118,19 +118,19 @@
 	psrldq	\d3, 4			// (s'_3, s''_2, s''_3, 0)
   .endif
   .ifnes "\d1", "nil"
-	pmuludqd \d1, \d0		// (r_i s'_1, r_i s''_1)
+	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
   .endif
   .ifnes "\d3", "nil"
-	pmuludqd \d3, \d0		// (r_i s'_3, r_i s''_3)
+	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
   .endif
   .ifnes "\d2", "nil"
     .ifnes "\d3", "nil"
-	pmuludqd \d2, \d0		// (r_i s'_2, r_i s''_2)
+	pmuludq	\d2, \d0		// (r_i s'_2, r_i s''_2)
     .else
-	pmuludqd \d2, [\s + 16]
+	pmuludq	\d2, [\s + 16]
     .endif
   .endif
-	pmuludqd \d0, [\s]		// (r_i s'_0, r_i s''_0)
+	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
 .endm
 
 .macro	accum	c0, c1, c2, c3