symm/salsa20-x86ish-sse2.S: Cosmetic fixes; add SEH unwind annotations.

[catacomb] / symm / salsa20-x86ish-sse2.S
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index b3ebf90c3faa93f9d45f8c64bcd3ced13b03dda6..fbdfea72e84239428f681aaa9659385bf658a80d 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -42,7 +42,7 @@
  /// Main code.
  
         .arch pentium4
-       .section .text
+       .text
  
  FUNC(salsa20_core_x86ish_sse2)
  
@@ -99,8 +99,8 @@ FUNC(salsa20_core_x86ish_sse2)
         // registers, but we want more than we can use as scratch space.  Two
         // places we only need to save a copy of the input for the
         // feedforward at the end; but the other two we want for the final
-       // permutation, so save the old values on the stack (We need an extra
-       // 8 bytes to align the stack.)
+       // permutation, so save the old values on the stack.  (We need an
+       // extra 8 bytes to align the stack.)
  
  #  define NR ecx
  #  define IN rdx
@@ -111,8 +111,12 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define SAVE3 [rsp + 48]
  
         sub     rsp, 64 + 8
+         .seh_stackalloc 64 + 8
         movdqa  [rsp +  0], xmm6
+         .seh_savexmm xmm6, 0
         movdqa  [rsp + 16], xmm7
+         .seh_savexmm xmm7, 16
+  .seh_endprologue
  #endif
  
         // First job is to slurp the matrix into XMM registers.  The words
@@ -152,7 +156,7 @@ FUNC(salsa20_core_x86ish_sse2)
         movdqa  SAVE2, xmm2
         movdqa  SAVE3, xmm3
  
-loop:
+0:
         // Apply a column quarterround to each of the columns simultaneously.
         // Alas, there doesn't seem to be a packed doubleword rotate, so we
         // have to synthesize it.
@@ -256,7 +260,7 @@ loop:
         // Decrement the loop counter and see if we should go round again.
         // Later processors fuse this pair into a single uop.
         sub     NR, 2
-       ja      loop
+       ja      0b
  
         // Almost there.  Firstly, the feedforward addition, and then we have
         // to write out the result.  Here we have to undo the permutation
@@ -304,7 +308,6 @@ loop:
         movd    [OUT + 60], xmm0
  
         // Tidy things up.
-
  #if CPUFAM_X86
         mov     esp, ebp
         pop     ebp