# define INTADDR__1(addr, got) addr
#endif
-// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufpd', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
// Map register names to their individual pieces.
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
.ifnes "\d1", "nil"
movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
.endif
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+ pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
pslldq xmm3, 2 // (t b; 0)
paddq \c, xmm3 // (c' + t b; c'')
punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
- pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+ pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+ pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
.endif
.endm
.ifnes "\d3", "nil"
movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
.endif
- pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?)
+ pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
.ifnes "\d1", "nil"
psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
.endif
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+ pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
pslldq xmm3, 2 // (t b; 0)
paddq \c, xmm3 // (c' + t b; c'')
punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
- pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+ pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+ pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
.endif
.endm
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+ pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+ pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
+ pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
+ pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
movd eax, xmm1
xor al, [RCON]
inc RCON
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+ pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+ pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// input. This can be done by juggling values in registers, with the
// following fancy footwork: some row rotations, a transpose, and
// some more rotations.
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
movdqa xmm4, xmm0
movdqa xmm5, xmm3
punpckhdq xmm1, xmm3 // 5, 6, 7, 4
punpckhdq xmm2, xmm5 // 15, 12, 13, 14
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
- pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
- pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
// Finally we have to write out the result.
movdqu [OUT + 0], xmm0