base/asm-common.h: Reverse the order of `SHUF' arguments.

author Mark Wooding <mdw@distorted.org.uk>

Mon, 12 Nov 2018 11:03:05 +0000 (11:03 +0000)

committer Mark Wooding <mdw@distorted.org.uk>

Sat, 24 Nov 2018 21:53:38 +0000 (21:53 +0000)
author Mark Wooding <mdw@distorted.org.uk>
Mon, 12 Nov 2018 11:03:05 +0000 (11:03 +0000)
committer Mark Wooding <mdw@distorted.org.uk>
Sat, 24 Nov 2018 21:53:38 +0000 (21:53 +0000)
diff --git a/base/asm-common.h b/base/asm-common.h

index 8e51ea39ef6993b61bdd4f1ae6588041fb376b1f..d6a8b01223ed17bee8a0c09292c35126f1948966 100644 (file)
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -217,11 +217,11 @@ name:
  #  define INTADDR__1(addr, got) addr
  #endif
  
-// Permutations for SIMD instructions.  SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions.  SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufps', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
  
  // Map register names to their individual pieces.
  
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S

index 8b8cd414bc1b8bfd8799ebc4b186b9db6a1126f1..64460ca9e8c6f9a0d552472dfd788ae4e059310a 100644 (file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -96,7 +96,7 @@
  .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
         // of the product in registers D0, D1, D2, D3.
-       pshufd  \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+       pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
    .ifnes "\d1", "nil"
         movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
    .endif
@@ -163,7 +163,7 @@
         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
         // completion, XMM3 is clobbered.  If CC is `nil', then the
         // contribution which would have been added to it is left in C.
-       pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+       pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
         pslldq  xmm3, 2                 // (t b; 0)
         paddq   \c, xmm3                // (c' + t b; c'')
@@ -209,11 +209,11 @@
         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
    .endif
-       pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-       pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+       pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+       pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
    .ifnes "\c", "nil"
-       pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-       pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+       pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+       pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
    .endif
  .endm
  
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index 591a7a8f2fec8b927160bd10085abd299bfe4230..11aadc9500fb6f11aa9eb7f8d159962b4328aee4 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -103,7 +103,7 @@
    .ifnes "\d3", "nil"
         movdqa  \d3, [\s + 16]          // (s'_2, s'_3; s''_2, s''_3)
    .endif
-       pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?)
+       pshufd  \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
    .ifnes "\d1", "nil"
         psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
    .endif
@@ -171,7 +171,7 @@
         // carry registers.  On completion, XMM3 is clobbered.  If CC is
         // `nil', then the contribution which would have been added to it is
         // left in C.
-       pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+       pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
         pslldq  xmm3, 2                 // (t b; 0)
         paddq   \c, xmm3                // (c' + t b; c'')
@@ -209,11 +209,11 @@
         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
    .endif
-       pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-       pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+       pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+       pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
    .ifnes "\c", "nil"
-       pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-       pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+       pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+       pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
    .endif
  .endm
  
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S

index b8f72d538b5a179fc50e83dcd4b26d36a75f6ae5..77047ebe3e5e6fa04e02c02e5d5930f2fd720f88 100644 (file)
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2)
  
         // c += d; b ^= c; b <<<=  7
         paddd   xmm2, xmm3
-        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+        pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
         pxor    xmm1, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm4, xmm1
         pslld   xmm1, 7
         psrld   xmm4, 25
@@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2)
         //
         // The shuffles have quite high latency, so they've mostly been
         // pushed upwards.  The remaining one can't be moved, though.
-       pshufd  xmm1, xmm1, SHUF(0, 3, 2, 1)
+       pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)
  
         // Apply the diagonal quarterround to each of the columns
         // simultaneously.
@@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2)
  
         // c += d; b ^= c; b <<<=  7
         paddd   xmm2, xmm3
-        pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+        pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
         pxor    xmm1, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm4, xmm1
         pslld   xmm1, 7
         psrld   xmm4, 25
@@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2)
         // Finally, finish off undoing the transpose, and we're done for this
         // doubleround.  Again, most of this was done above so we don't have
         // to wait for the shuffles.
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
  
         // Decrement the loop counter and see if we should go round again.
         sub     NR, 2
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S

index a7a1ece3d3823cbd7ad135b91ac1353d76d62b8b..dc80f4dbdb6fa2c18da94893b7bd941be6d9821d 100644 (file)
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -211,16 +211,16 @@ FUNC(rijndael_setup_x86ish_aesni)
         // Fourth word of the cycle, and seven or eight words of key.  Do a
         // byte substitution.
         movd    xmm0, eax
-       pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
+       pshufd  xmm0, xmm0, SHUF(3, 0, 1, 2)
         aeskeygenassist xmm1, xmm0, 0
         movd    eax, xmm1
         jmp     2f
  
         // First word of the cycle.  This is the complicated piece.
  1:     movd    xmm0, eax
-       pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
+       pshufd  xmm0, xmm0, SHUF(1, 2, 3, 0)
         aeskeygenassist xmm1, xmm0, 0
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
         movd    eax, xmm1
         xor     al, [RCON]
         inc     RCON
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index 76ac0ed96e27284007fe9e5decef636fafd6882b..ad4e322b21d4756cff7777de18df4537465f7079 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -180,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm1
-        pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+        pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -189,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm3
-        pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+        pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -235,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm3
-        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+        pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -244,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm1
-        pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+        pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -270,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2)
         // input.  This can be done by juggling values in registers, with the
         // following fancy footwork: some row rotations, a transpose, and
         // some more rotations.
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  3,  4,  9, 14
-       pshufd  xmm2, xmm2, SHUF(1, 0, 3, 2)    //  2,  7,  8, 13
-       pshufd  xmm3, xmm3, SHUF(0, 3, 2, 1)    //  1,  6, 11, 12
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  3,  4,  9, 14
+       pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)    //  2,  7,  8, 13
+       pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)    //  1,  6, 11, 12
  
         movdqa  xmm4, xmm0
         movdqa  xmm5, xmm3
@@ -288,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2)
         punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
         punpckhdq xmm2, xmm5                    // 15, 12, 13, 14
  
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  4,  5,  6,  7
-       pshufd  xmm4, xmm4, SHUF(1, 0, 3, 2)    //  8,  9, 10, 11
-       pshufd  xmm2, xmm2, SHUF(0, 3, 2, 1)    // 12, 13, 14, 15
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  4,  5,  6,  7
+       pshufd  xmm4, xmm4, SHUF(2, 3, 0, 1)    //  8,  9, 10, 11
+       pshufd  xmm2, xmm2, SHUF(1, 2, 3, 0)    // 12, 13, 14, 15
  
         // Finally we have to write out the result.
         movdqu  [OUT +  0], xmm0
author	Mark Wooding <mdw@distorted.org.uk>
	Mon, 12 Nov 2018 11:03:05 +0000 (11:03 +0000)
committer	Mark Wooding <mdw@distorted.org.uk>
	Sat, 24 Nov 2018 21:53:38 +0000 (21:53 +0000)
base/asm-common.h		patch \| blob \| blame \| history
math/mpx-mul4-amd64-sse2.S		patch \| blob \| blame \| history
math/mpx-mul4-x86-sse2.S		patch \| blob \| blame \| history
symm/chacha-x86ish-sse2.S		patch \| blob \| blame \| history
symm/rijndael-x86ish-aesni.S		patch \| blob \| blame \| history
symm/salsa20-x86ish-sse2.S		patch \| blob \| blame \| history