/// Main code.
.arch pentium4
- .section .text
+ .text
FUNC(salsa20_core_x86ish_sse2)
// registers, but we want more than we can use as scratch space. Two
// places we only need to save a copy of the input for the
// feedforward at the end; but the other two we want for the final
- // permutation, so save the old values on the stack (We need an extra
- // 8 bytes to align the stack.)
+ // permutation, so save the old values on the stack. (We need an
+ // extra 8 bytes to align the stack.)
# define NR ecx
# define IN rdx
# define SAVE3 [rsp + 48]
sub rsp, 64 + 8
+ .seh_stackalloc 64 + 8
movdqa [rsp + 0], xmm6
+ .seh_savexmm xmm6, 0
movdqa [rsp + 16], xmm7
+ .seh_savexmm xmm7, 16
+ .seh_endprologue
#endif
// First job is to slurp the matrix into XMM registers. The words
movdqa SAVE2, xmm2
movdqa SAVE3, xmm3
-loop:
+0:
// Apply a column quarterround to each of the columns simultaneously.
// Alas, there doesn't seem to be a packed doubleword rotate, so we
// have to synthesize it.
// Decrement the loop counter and see if we should go round again.
// Later processors fuse this pair into a single uop.
sub NR, 2
- ja loop
+ ja 0b
// Almost there. Firstly, the feedforward addition, and then we have
// to write out the result. Here we have to undo the permutation
movd [OUT + 60], xmm0
// Tidy things up.
-
#if CPUFAM_X86
mov esp, ebp
pop ebp