math/mpx-mul4-x86-sse2.S: Use the correct vector-multiply instruction.

author Mark Wooding <mdw@distorted.org.uk>

Thu, 29 Dec 2016 14:36:12 +0000 (14:36 +0000)

committer Mark Wooding <mdw@distorted.org.uk>

Mon, 3 Apr 2017 09:12:32 +0000 (10:12 +0100)
author Mark Wooding <mdw@distorted.org.uk>
Thu, 29 Dec 2016 14:36:12 +0000 (14:36 +0000)
committer Mark Wooding <mdw@distorted.org.uk>
Mon, 3 Apr 2017 09:12:32 +0000 (10:12 +0100)
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index 5d8c5b97d2e72c4a1f1adde83432f232dbd4a401..0b57dbfd1e01a0fa6b0a3789cdddc71a9c62d230 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -64,7 +64,7 @@
  ///       0    v'_0    v'_1    v''_0   v''_1
  ///      16    v'_2    v'_3    v''_2   v''_3
  ///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@ -81,7 +81,7 @@
  /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
  /// lanes of its vector) and an operand in the expanded form above produces a
  /// result which can be added directly to the appropriate carry register.
  /// Following a pass of four multiplications, we perform some limited carry
@@ -118,19 +118,19 @@
         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
-       pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
+       pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
    .endif
    .ifnes "\d3", "nil"
-       pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
+       pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
    .endif
    .ifnes "\d2", "nil"
      .ifnes "\d3", "nil"
-       pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
+       pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
      .else
-       pmuludqd \d2, [\s + 16]
+       pmuludq \d2, [\s + 16]
      .endif
    .endif
-       pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
+       pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
  .endm
  
  .macro accum   c0, c1, c2, c3
author	Mark Wooding <mdw@distorted.org.uk>
	Thu, 29 Dec 2016 14:36:12 +0000 (14:36 +0000)
committer	Mark Wooding <mdw@distorted.org.uk>
	Mon, 3 Apr 2017 09:12:32 +0000 (10:12 +0100)