base/asm-common.h, *-x86ish-*.S: Centralize SSE shuffling constants.

[catacomb] / symm / salsa20-x86ish-sse2.S
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index a168d79a7b0cb91b64de7eb12df42f6006bae4ea..5fa5b1516d01c6adeaf9ad857d0d421fd5bbae67 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -30,19 +30,11 @@
  #include "config.h"
  #include "asm-common.h"
  
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
-
  ///--------------------------------------------------------------------------
  /// Main code.
  
         .arch pentium4
-       .section .text
+       .text
  
  FUNC(salsa20_core_x86ish_sse2)
  
@@ -99,8 +91,8 @@ FUNC(salsa20_core_x86ish_sse2)
         // registers, but we want more than we can use as scratch space.  Two
         // places we only need to save a copy of the input for the
         // feedforward at the end; but the other two we want for the final
-       // permutation, so save the old values on the stack (We need an extra
-       // 8 bytes to align the stack.)
+       // permutation, so save the old values on the stack.  (We need an
+       // extra 8 bytes to align the stack.)
  
  #  define NR ecx
  #  define IN rdx
@@ -111,8 +103,12 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define SAVE3 [rsp + 48]
  
         sub     rsp, 64 + 8
+         .seh_stackalloc 64 + 8
         movdqa  [rsp +  0], xmm6
+         .seh_savexmm xmm6, 0
         movdqa  [rsp + 16], xmm7
+         .seh_savexmm xmm7, 16
+  .seh_endprologue
  #endif
  
         // First job is to slurp the matrix into XMM registers.  The words
@@ -146,13 +142,13 @@ FUNC(salsa20_core_x86ish_sse2)
         movdqu  xmm2, [IN + 32]
         movdqu  xmm3, [IN + 48]
  
-       ## Take a copy for later.
+       // Take a copy for later.
         movdqa  SAVE0, xmm0
         movdqa  SAVE1, xmm1
         movdqa  SAVE2, xmm2
         movdqa  SAVE3, xmm3
  
-loop:
+0:
         // Apply a column quarterround to each of the columns simultaneously.
         // Alas, there doesn't seem to be a packed doubleword rotate, so we
         // have to synthesize it.
@@ -178,7 +174,7 @@ loop:
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm1
-       pshufd  xmm1, xmm1, ROTL
+        pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -187,9 +183,9 @@ loop:
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm3
-       pshufd  xmm3, xmm3, ROTR
+        pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
         paddd   xmm4, xmm2
-       pshufd  xmm2, xmm2, ROT2
+        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -233,7 +229,7 @@ loop:
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm3
-       pshufd  xmm3, xmm3, ROTL
+        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -242,9 +238,9 @@ loop:
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm1
-       pshufd  xmm1, xmm1, ROTR
+        pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
         paddd   xmm4, xmm2
-       pshufd  xmm2, xmm2, ROT2
+        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -256,7 +252,7 @@ loop:
         // Decrement the loop counter and see if we should go round again.
         // Later processors fuse this pair into a single uop.
         sub     NR, 2
-       ja      loop
+       ja      0b
  
         // Almost there.  Firstly, the feedforward addition, and then we have
         // to write out the result.  Here we have to undo the permutation
@@ -264,47 +260,46 @@ loop:
         // latency, so arrange to start a new shuffle into a temporary as
         // soon as we've written out the old value.
         paddd   xmm0, SAVE0
-       pshufd  xmm4, xmm0, 0x39
+        pshufd xmm4, xmm0, SHUF(0, 3, 2, 1)
         movd    [OUT +  0], xmm0
  
         paddd   xmm1, SAVE1
-       pshufd  xmm5, xmm1, ROTL
+        pshufd xmm5, xmm1, SHUF(2, 1, 0, 3)
         movd    [OUT + 16], xmm1
  
         paddd   xmm2, SAVE2
-       pshufd  xmm6, xmm2, ROT2
+        pshufd xmm6, xmm2, SHUF(1, 0, 3, 2)
         movd    [OUT + 32], xmm2
  
         paddd   xmm3, SAVE3
-       pshufd  xmm7, xmm3, ROTR
+        pshufd xmm7, xmm3, SHUF(0, 3, 2, 1)
         movd    [OUT + 48], xmm3
  
         movd    [OUT +  4], xmm7
-       pshufd  xmm7, xmm3, ROT2
+        pshufd xmm7, xmm3, SHUF(1, 0, 3, 2)
         movd    [OUT + 24], xmm7
-       pshufd  xmm3, xmm3, ROTL
+        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
         movd    [OUT + 44], xmm3
  
         movd    [OUT +  8], xmm6
-       pshufd  xmm6, xmm2, ROTL
+        pshufd xmm6, xmm2, SHUF(2, 1, 0, 3)
         movd    [OUT + 28], xmm6
-       pshufd  xmm2, xmm2, ROTR
+        pshufd xmm2, xmm2, SHUF(0, 3, 2, 1)
         movd    [OUT + 52], xmm2
  
         movd    [OUT + 12], xmm5
-       pshufd  xmm5, xmm1, ROTR
+        pshufd xmm5, xmm1, SHUF(0, 3, 2, 1)
         movd    [OUT + 36], xmm5
-       pshufd  xmm1, xmm1, ROT2
+        pshufd xmm1, xmm1, SHUF(1, 0, 3, 2)
         movd    [OUT + 56], xmm1
  
         movd    [OUT + 20], xmm4
-       pshufd  xmm4, xmm0, ROT2
+        pshufd xmm4, xmm0, SHUF(1, 0, 3, 2)
         movd    [OUT + 40], xmm4
-       pshufd  xmm0, xmm0, ROTL
+        pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
         movd    [OUT + 60], xmm0
  
         // Tidy things up.
-
  #if CPUFAM_X86
         mov     esp, ebp
         pop     ebp