#include "config.h"
#include "asm-common.h"
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
-
///--------------------------------------------------------------------------
/// Main code.
.arch pentium4
- .section .text
+ .text
FUNC(salsa20_core_x86ish_sse2)
// registers, but we want more than we can use as scratch space. Two
// places we only need to save a copy of the input for the
// feedforward at the end; but the other two we want for the final
- // permutation, so save the old values on the stack (We need an extra
- // 8 bytes to align the stack.)
+ // permutation, so save the old values on the stack. (We need an
+ // extra 8 bytes to align the stack.)
# define NR ecx
# define IN rdx
# define SAVE3 [rsp + 48]
sub rsp, 64 + 8
+ .seh_stackalloc 64 + 8
movdqa [rsp + 0], xmm6
+ .seh_savexmm xmm6, 0
movdqa [rsp + 16], xmm7
+ .seh_savexmm xmm7, 16
+ .seh_endprologue
#endif
// First job is to slurp the matrix into XMM registers. The words
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, ROTL
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, ROTR
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, ROT2
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, ROTL
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, ROTR
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, ROT2
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// latency, so arrange to start a new shuffle into a temporary as
// soon as we've written out the old value.
paddd xmm0, SAVE0
- pshufd xmm4, xmm0, 0x39
+	pshufd	xmm4, xmm0, SHUF(0, 3, 2, 1)
movd [OUT + 0], xmm0
paddd xmm1, SAVE1
- pshufd xmm5, xmm1, ROTL
+ pshufd xmm5, xmm1, SHUF(2, 1, 0, 3)
movd [OUT + 16], xmm1
paddd xmm2, SAVE2
- pshufd xmm6, xmm2, ROT2
+ pshufd xmm6, xmm2, SHUF(1, 0, 3, 2)
movd [OUT + 32], xmm2
paddd xmm3, SAVE3
- pshufd xmm7, xmm3, ROTR
+ pshufd xmm7, xmm3, SHUF(0, 3, 2, 1)
movd [OUT + 48], xmm3
movd [OUT + 4], xmm7
- pshufd xmm7, xmm3, ROT2
+ pshufd xmm7, xmm3, SHUF(1, 0, 3, 2)
movd [OUT + 24], xmm7
- pshufd xmm3, xmm3, ROTL
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
movd [OUT + 44], xmm3
movd [OUT + 8], xmm6
- pshufd xmm6, xmm2, ROTL
+ pshufd xmm6, xmm2, SHUF(2, 1, 0, 3)
movd [OUT + 28], xmm6
- pshufd xmm2, xmm2, ROTR
+ pshufd xmm2, xmm2, SHUF(0, 3, 2, 1)
movd [OUT + 52], xmm2
movd [OUT + 12], xmm5
- pshufd xmm5, xmm1, ROTR
+ pshufd xmm5, xmm1, SHUF(0, 3, 2, 1)
movd [OUT + 36], xmm5
- pshufd xmm1, xmm1, ROT2
+ pshufd xmm1, xmm1, SHUF(1, 0, 3, 2)
movd [OUT + 56], xmm1
movd [OUT + 20], xmm4
- pshufd xmm4, xmm0, ROT2
+ pshufd xmm4, xmm0, SHUF(1, 0, 3, 2)
movd [OUT + 40], xmm4
- pshufd xmm0, xmm0, ROTL
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
movd [OUT + 60], xmm0
// Tidy things up.
-
#if CPUFAM_X86
mov esp, ebp
pop ebp