From a13b573023a59dc1e2febef1142aa4b1d1af67b7 Mon Sep 17 00:00:00 2001 Message-Id: From: Mark Wooding Date: Thu, 11 Aug 2016 09:15:12 +0100 Subject: [PATCH] base/asm-common.h, *-x86ish-*.S: Centralize SSE shuffling constants. Organization: Straylight/Edgeware From: Mark Wooding Introduce a centrally defined `SHUF(D, C, B, A)' macro to make shuffling constants for `pshufd' and friends, rather than defining inscrutable `ROTL' etc. macros in each file. There are lots of other shuffling instructions, which may need their own magic macros, so this might prove to have been a bad name, but we'll worry about that later. --- base/asm-common.h | 6 ++++++ symm/chacha-x86ish-sse2.S | 20 ++++++----------- symm/rijndael-x86ish-aesni.S | 14 +++--------- symm/salsa20-x86ish-sse2.S | 42 +++++++++++++++--------------------- 4 files changed, 32 insertions(+), 50 deletions(-) diff --git a/base/asm-common.h b/base/asm-common.h index 0d32ccf9..20a1d6a8 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -170,6 +170,12 @@ name: # define INTADDR__1(addr, got) addr #endif +// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, +// suitable for use in `pshufd' or `shufps', which copies element D +// (0 <= D < 4) of the source to element 3 of the destination, element C to +// element 2, element B to element 1, and element A to element 0. +#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) + #endif ///-------------------------------------------------------------------------- diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index b58cdcea..a7ff68b5 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -30,14 +30,6 @@ #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// Local utilities. - -// Magic constants for shuffling. 
-#define ROTL 0x93 -#define ROT2 0x4e -#define ROTR 0x39 - ///-------------------------------------------------------------------------- /// Main code. @@ -164,9 +156,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, ROTL + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) pxor xmm1, xmm2 - pshufd xmm2, xmm2, ROT2 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -184,7 +176,7 @@ FUNC(chacha_core_x86ish_sse2) // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, ROTR + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -215,9 +207,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, ROTR + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) pxor xmm1, xmm2 - pshufd xmm2, xmm2, ROT2 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -226,7 +218,7 @@ FUNC(chacha_core_x86ish_sse2) // Finally, finish off undoing the transpose, and we're done for this // doubleround. Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, ROTL + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // Decrement the loop counter and see if we should go round again. sub NR, 2 diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 27f09bc4..2f6430b3 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -33,14 +33,6 @@ .globl F(abort) .globl F(rijndael_rcon) -///-------------------------------------------------------------------------- -/// Local utilities. - -// Magic constants for shuffling. -#define ROTL 0x93 -#define ROT2 0x4e -#define ROTR 0x39 - ///-------------------------------------------------------------------------- /// Main code. 
@@ -232,16 +224,16 @@ FUNC(rijndael_setup_x86ish_aesni) // Fourth word of the cycle, and seven or eight words of key. Do a // byte substitution. movd xmm0, eax - pshufd xmm0, xmm0, ROTL + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 jmp 2f // First word of the cycle. This is the complicated piece. 1: movd xmm0, eax - pshufd xmm0, xmm0, ROTR + pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, ROTL + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movd eax, xmm1 xor al, [RCON] inc RCON diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 930508a1..5fa5b151 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -30,14 +30,6 @@ #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// Local utilities. - -// Magic constants for shuffling. -#define ROTL 0x93 -#define ROT2 0x4e -#define ROTR 0x39 - ///-------------------------------------------------------------------------- /// Main code. 
@@ -182,7 +174,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, ROTL + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -191,9 +183,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, ROTR + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, ROT2 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -237,7 +229,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, ROTL + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -246,9 +238,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, ROTR + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, ROT2 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -272,39 +264,39 @@ FUNC(salsa20_core_x86ish_sse2) movd [OUT + 0], xmm0 paddd xmm1, SAVE1 - pshufd xmm5, xmm1, ROTL + pshufd xmm5, xmm1, SHUF(2, 1, 0, 3) movd [OUT + 16], xmm1 paddd xmm2, SAVE2 - pshufd xmm6, xmm2, ROT2 + pshufd xmm6, xmm2, SHUF(1, 0, 3, 2) movd [OUT + 32], xmm2 paddd xmm3, SAVE3 - pshufd xmm7, xmm3, ROTR + pshufd xmm7, xmm3, SHUF(0, 3, 2, 1) movd [OUT + 48], xmm3 movd [OUT + 4], xmm7 - pshufd xmm7, xmm3, ROT2 + pshufd xmm7, xmm3, SHUF(1, 0, 3, 2) movd [OUT + 24], xmm7 - pshufd xmm3, xmm3, ROTL + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) movd [OUT + 44], xmm3 movd [OUT + 8], xmm6 - pshufd xmm6, xmm2, ROTL + pshufd xmm6, xmm2, SHUF(2, 1, 0, 3) movd [OUT + 28], xmm6 - pshufd xmm2, xmm2, ROTR + pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) movd [OUT + 52], xmm2 movd [OUT + 12], xmm5 - pshufd xmm5, xmm1, ROTR + pshufd xmm5, xmm1, SHUF(0, 3, 2, 1) movd [OUT + 36], xmm5 - pshufd xmm1, xmm1, ROT2 + pshufd xmm1, xmm1, SHUF(1, 0, 3, 2) movd [OUT + 56], xmm1 movd [OUT + 20], xmm4 
- pshufd xmm4, xmm0, ROT2 + pshufd xmm4, xmm0, SHUF(1, 0, 3, 2) movd [OUT + 40], xmm4 - pshufd xmm0, xmm0, ROTL + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) movd [OUT + 60], xmm0 // Tidy things up. -- [mdw]