From: Mark Wooding
Date: Mon, 8 Aug 2016 09:33:29 +0000 (+0100)
Subject: symm/{chacha,salsa20}-*.S: Indent the hoisted transposition instructions.
X-Git-Tag: 2.3.0~44
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/commitdiff_plain/70bc6059902c30dcbd1cddbdb628e4bcbd9cc6f5

symm/{chacha,salsa20}-*.S: Indent the hoisted transposition instructions.

This hopefully makes it clearer how the various interleaved strands of
computation work.
---

diff --git a/symm/chacha-arm-neon.S b/symm/chacha-arm-neon.S
index d69e2e43..4c72791e 100644
--- a/symm/chacha-arm-neon.S
+++ b/symm/chacha-arm-neon.S
@@ -85,9 +85,9 @@ FUNC(chacha_core_arm_neon)
 
         // c += d; b ^= c; b <<<= 7
         vadd.u32 q10, q10, q11
-        vext.32 q11, q11, q11, #3
+         vext.32 q11, q11, q11, #3
         veor q9, q9, q10
-        vext.32 q10, q10, q10, #2
+         vext.32 q10, q10, q10, #2
         vshl.u32 q0, q9, #7
         vshr.u32 q9, q9, #25
         vorr q9, q9, q0
@@ -132,9 +132,9 @@ FUNC(chacha_core_arm_neon)
 
         // c += d; b ^= c; b <<<= 7
         vadd.u32 q10, q10, q11
-        vext.32 q11, q11, q11, #1
+         vext.32 q11, q11, q11, #1
         veor q9, q9, q10
-        vext.32 q10, q10, q10, #2
+         vext.32 q10, q10, q10, #2
         vshl.u32 q0, q9, #7
         vshr.u32 q9, q9, #25
         vorr q9, q9, q0
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index 42091536..b58cdcea 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2)
 
         // c += d; b ^= c; b <<<= 7
         paddd xmm2, xmm3
-        pshufd xmm3, xmm3, ROTL
+         pshufd xmm3, xmm3, ROTL
         pxor xmm1, xmm2
-        pshufd xmm2, xmm2, ROT2
+         pshufd xmm2, xmm2, ROT2
         movdqa xmm4, xmm1
         pslld xmm1, 7
         psrld xmm4, 25
@@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2)
 
         // c += d; b ^= c; b <<<= 7
         paddd xmm2, xmm3
-        pshufd xmm3, xmm3, ROTR
+         pshufd xmm3, xmm3, ROTR
         pxor xmm1, xmm2
-        pshufd xmm2, xmm2, ROT2
+         pshufd xmm2, xmm2, ROT2
         movdqa xmm4, xmm1
         pslld xmm1, 7
         psrld xmm4, 25
diff --git a/symm/salsa20-arm-neon.S b/symm/salsa20-arm-neon.S
index a520e301..f212f2fa 100644
--- a/symm/salsa20-arm-neon.S
+++ b/symm/salsa20-arm-neon.S
@@ -149,7 +149,7 @@ FUNC(salsa20_core_arm_neon)
 
         // d ^= (c + b) <<< 13
         vadd.u32 q0, q10, q11
-        vext.32 q11, q11, q11, #3
+         vext.32 q11, q11, q11, #3
         vshl.u32 q1, q0, #13
         vshr.u32 q0, q0, #19
         vorr q0, q0, q1
@@ -157,8 +157,8 @@ FUNC(salsa20_core_arm_neon)
 
         // a ^= (d + c) <<< 18
         vadd.u32 q0, q9, q10
-        vext.32 q10, q10, q10, #2
-        vext.32 q9, q9, q9, #1
+         vext.32 q10, q10, q10, #2
+         vext.32 q9, q9, q9, #1
         vshl.u32 q1, q0, #18
         vshr.u32 q0, q0, #14
         vorr q0, q0, q1
@@ -188,7 +188,7 @@ FUNC(salsa20_core_arm_neon)
 
         // d ^= (c + b) <<< 13
         vadd.u32 q0, q10, q9
-        vext.32 q9, q9, q9, #3
+         vext.32 q9, q9, q9, #3
         vshl.u32 q1, q0, #13
         vshr.u32 q0, q0, #19
         vorr q0, q0, q1
@@ -196,8 +196,8 @@ FUNC(salsa20_core_arm_neon)
 
         // a ^= (d + c) <<< 18
         vadd.u32 q0, q11, q10
-        vext.32 q10, q10, q10, #2
-        vext.32 q11, q11, q11, #1
+         vext.32 q10, q10, q10, #2
+         vext.32 q11, q11, q11, #1
         vshl.u32 q1, q0, #18
         vshr.u32 q0, q0, #14
         vorr q0, q0, q1
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index fbdfea72..930508a1 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -182,7 +182,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa xmm4, xmm2
         paddd xmm4, xmm1
-        pshufd xmm1, xmm1, ROTL
+         pshufd xmm1, xmm1, ROTL
         movdqa xmm5, xmm4
         pslld xmm4, 13
         psrld xmm5, 19
@@ -191,9 +191,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
         // a ^= (d + c) <<< 18
         movdqa xmm4, xmm3
-        pshufd xmm3, xmm3, ROTR
+         pshufd xmm3, xmm3, ROTR
         paddd xmm4, xmm2
-        pshufd xmm2, xmm2, ROT2
+         pshufd xmm2, xmm2, ROT2
         movdqa xmm5, xmm4
         pslld xmm4, 18
         psrld xmm5, 14
@@ -237,7 +237,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa xmm4, xmm2
         paddd xmm4, xmm3
-        pshufd xmm3, xmm3, ROTL
+         pshufd xmm3, xmm3, ROTL
         movdqa xmm5, xmm4
         pslld xmm4, 13
         psrld xmm5, 19
@@ -246,9 +246,9 @@ FUNC(salsa20_core_x86ish_sse2)
 
         // a ^= (d + c) <<< 18
         movdqa xmm4, xmm1
-        pshufd xmm1, xmm1, ROTR
+         pshufd xmm1, xmm1, ROTR
         paddd xmm4, xmm2
-        pshufd xmm2, xmm2, ROT2
+         pshufd xmm2, xmm2, ROT2
         movdqa xmm5, xmm4
         pslld xmm4, 18
         psrld xmm5, 14
@@ -268,43 +268,43 @@ FUNC(salsa20_core_x86ish_sse2)
         // latency, so arrange to start a new shuffle into a temporary as
         // soon as we've written out the old value.
         paddd xmm0, SAVE0
-        pshufd xmm4, xmm0, 0x39
+         pshufd xmm4, xmm0, 0x39
         movd [OUT + 0], xmm0
 
         paddd xmm1, SAVE1
-        pshufd xmm5, xmm1, ROTL
+         pshufd xmm5, xmm1, ROTL
         movd [OUT + 16], xmm1
 
         paddd xmm2, SAVE2
-        pshufd xmm6, xmm2, ROT2
+         pshufd xmm6, xmm2, ROT2
         movd [OUT + 32], xmm2
 
         paddd xmm3, SAVE3
-        pshufd xmm7, xmm3, ROTR
+         pshufd xmm7, xmm3, ROTR
         movd [OUT + 48], xmm3
 
         movd [OUT + 4], xmm7
-        pshufd xmm7, xmm3, ROT2
+         pshufd xmm7, xmm3, ROT2
         movd [OUT + 24], xmm7
-        pshufd xmm3, xmm3, ROTL
+         pshufd xmm3, xmm3, ROTL
         movd [OUT + 44], xmm3
 
         movd [OUT + 8], xmm6
-        pshufd xmm6, xmm2, ROTL
+         pshufd xmm6, xmm2, ROTL
         movd [OUT + 28], xmm6
-        pshufd xmm2, xmm2, ROTR
+         pshufd xmm2, xmm2, ROTR
         movd [OUT + 52], xmm2
 
         movd [OUT + 12], xmm5
-        pshufd xmm5, xmm1, ROTR
+         pshufd xmm5, xmm1, ROTR
         movd [OUT + 36], xmm5
-        pshufd xmm1, xmm1, ROT2
+         pshufd xmm1, xmm1, ROT2
         movd [OUT + 56], xmm1
 
         movd [OUT + 20], xmm4
-        pshufd xmm4, xmm0, ROT2
+         pshufd xmm4, xmm0, ROT2
         movd [OUT + 40], xmm4
-        pshufd xmm0, xmm0, ROTL
+         pshufd xmm0, xmm0, ROTL
         movd [OUT + 60], xmm0
 
         // Tidy things up.
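
[Annotation, not part of the patch.]  For readers who don't speak NEON
or SSE2, here is a rough C sketch of the pattern the extra indentation
marks: the tail of a ChaCha quarter-round acting on whole rows of the
state, with the `vext.32' lane rotations that transpose the state back
into column order hoisted up between the arithmetic steps.  The `row',
`lanerot', and `qround_tail' names are illustrative only; they don't
appear in catacomb.

    #include <stdint.h>

    typedef struct { uint32_t w[4]; } row;   /* one 128-bit register of state */

    static inline uint32_t rotl32(uint32_t x, int n)
      { return (x << n) | (x >> (32 - n)); }

    /* Lane i of the result is lane (i + n) mod 4 of the input: the
     * effect of `vext.32 qN, qN, qN, #n' when all three operands name
     * the same register. */
    static row lanerot(row x, int n)
    {
      row r;
      for (int i = 0; i < 4; i++) r.w[i] = x.w[(i + n)%4];
      return r;
    }

    /* `c += d; b ^= c; b <<<= 7', with the two transposition rotations
     * interleaved where the hunks above indent them: each vext.32 can
     * start as soon as its source row's old value is no longer needed,
     * rather than after the whole quarter-round finishes. */
    static void qround_tail(row *b, row *c, row *d)
    {
      for (int i = 0; i < 4; i++) c->w[i] += d->w[i]; /* vadd.u32 q10, q10, q11    */
      *d = lanerot(*d, 3);                            /*  vext.32 q11, q11, q11, #3 */
      for (int i = 0; i < 4; i++) b->w[i] ^= c->w[i]; /* veor q9, q9, q10          */
      *c = lanerot(*c, 2);                            /*  vext.32 q10, q10, q10, #2 */
      for (int i = 0; i < 4; i++)                     /* vshl.u32/vshr.u32/vorr    */
        b->w[i] = rotl32(b->w[i], 7);
    }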
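The x86ish hunks play the same game with `pshufd', whose immediate byte
picks each destination lane from the source two bits at a time.  A small
model in the same sketch's terms (the values of the ROTL/ROT2/ROTR
macros live in the full sources, not this excerpt):

    /* Model of `pshufd dst, src, imm': lane i of the result is lane
     * ((imm >> 2*i) & 3) of the source.  For example, 0x39 = 0b00111001
     * selects lanes (1, 2, 3, 0), the same one-place lane rotation as
     * `vext.32 ..., #1' in the NEON code. */
    static row pshufd_model(row x, unsigned imm)
    {
      row r;
      for (int i = 0; i < 4; i++) r.w[i] = x.w[(imm >> (2*i)) & 3];
      return r;
    }

Lining up the matching ChaCha hunks suggests ROTL, ROT2, and ROTR
correspond to `vext.32 ... #3', `#2', and `#1' respectively, but since
their definitions aren't shown here, treat that as inference rather than
fact.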