#include "config.h"
#include "asm-common.h"
-///--------------------------------------------------------------------------
-/// External definitions.
-
.globl F(abort)
.globl F(rijndael_rcon)
// as an immediate, so it's kind of annoying if you're not
// open-coding the whole thing. It's much easier to leave that as
// zero and XOR in the round constant by hand.
-9: movd xmm0, eax
+0: movd xmm0, eax
pshufd xmm0, xmm0, ROTR
aeskeygenassist xmm1, xmm0, 0
pshufd xmm1, xmm1, ROTL
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// The next three words are simple...
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// (Word 2...)
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// (Word 3...)
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// Word 4. If the key is /more/ than 6 words long, then we must
// apply a substitution here.
cmp KSZ, 5
- jb 9b
+ jb 0b
cmp KSZ, 7
- jb 0f
+ jb 1f
movd xmm0, eax
pshufd xmm0, xmm0, ROTL
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
-0: xor eax, [SI]
+1: xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// (Word 5...)
cmp KSZ, 6
- jb 9b
+ jb 0b
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// (Word 6...)
cmp KSZ, 7
- jb 9b
+ jb 0b
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// (Word 7...)
cmp KSZ, 8
- jb 9b
+ jb 0b
xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
cmp SI, LIM
- jae 8f
+ jae 9f
// Must be done by now.
- jmp 9b
+ jmp 0b
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two SSE registers.
-8: mov NR, [CTX + nr] // number of rounds
+9: mov NR, [CTX + nr] // number of rounds
#if NKW_NEEDS_REFRESH
mov BLKOFF, BLKSZ
mov LRK, NR
movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9f
+ jbe 0f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// Update the loop variables and stop if we've finished.
-9: add DI, BLKOFFo
+0: add DI, BLKOFFo
sub SI, BLKOFFo
sub NR, 1
- jbe 0f
+ jbe 9f
// Do another middle round's keys...
movdqu xmm0, [SI]
aesimc xmm0, xmm0
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9b
+ jbe 0b
movdqu xmm0, [SI + 16]
aesimc xmm0, xmm0
movdqu [DI + 16], xmm0
- jmp 9b
+ jmp 0b
// Finally do the first encryption round.
-0: movdqu xmm0, [SI]
+9: movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 0f
+ jbe 1f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy SSE toys for this.
-0: cmp BLKOFF, 16
- je 0f
+1: cmp BLKOFF, 16
+ je 9f
// Find the byte-reordering table.
ldgot ecx
#endif
// End-swap the encryption keys.
- mov ecx, NKW
lea SI, [CTX + w]
call endswap_block
// And the decryption keys.
- mov ecx, NKW
lea SI, [CTX + wi]
call endswap_block
-0: // All done.
+9: // All done.
#if CPUFAM_X86
pop edi
pop esi
.align 16
endswap_block:
- // End-swap ECX words starting at SI. The end-swapping table is
+ // End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
//
// The buffer is rewritten in place, one unaligned 16-byte chunk (four
// words) per iteration. At least one chunk is always processed, and
// the word count is effectively rounded up to a multiple of four, so
// up to 12 bytes beyond the last requested word may be read and
// written back. On return, SI has been advanced past the last chunk
// processed; ECX, XMM1 and the flags are clobbered.
- movdqu xmm1, [SI]
+ mov ecx, NKW
+0: movdqu xmm1, [SI]
// Permute the bytes of this chunk according to the table in XMM5.
pshufb xmm1, xmm5
movdqu [SI], xmm1
add SI, 16
// Four more words done. The branch below is an unsigned test on the
// result of this subtraction: it loops only while the remaining count
// is still strictly positive, and stops on zero or on a borrow.
sub ecx, 4
- ja endswap_block
+ ja 0b
ret
#undef CTX
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
- .macro encdec op, aes, koff
-FUNC(rijndael_\op\()_x86ish_aesni)
+.macro encdec op, aes, koff
+ FUNC(rijndael_\op\()_x86ish_aesni)
// Find the magic endianness-swapping table.
ldgot ecx
#undef DST
#undef NR
-ENDFUNC
- .endm
+ ENDFUNC
+.endm
encdec eblk, aesenc, w
encdec dblk, aesdec, wi