#include "config.h"
#include "asm-common.h"
-///--------------------------------------------------------------------------
-/// External definitions.
-
.globl F(abort)
.globl F(rijndael_rcon)
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
-
///--------------------------------------------------------------------------
/// Main code.
.arch .aes
- .section .text
+ .text
/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
FUNC(rijndael_setup_x86ish_aesni)
+#define SI WHOLE(si)
+#define DI WHOLE(di)
+
#if CPUFAM_X86
// Arguments are on the stack. We'll need to stack the caller's
	// register variables, but we'll manage.
# define CTX ebp // context pointer
# define BLKSZ [esp + 24] // block size
-# define SI esi // source pointer
-# define DI edi // destination pointer
-
# define KSZ ebx // key size
-# define KSZo ebx // ... as address offset
# define NKW edx // total number of key words
# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
# define RCON ecx // round constants table
# define LIM edx // limit pointer
-# define LIMn edx // ... as integer offset from base
+# define CYIX edi // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo eax // ... as address offset
# define BLKOFF edx // block size in bytes
-# define BLKOFFo edx // ... as address offset
// Stack the caller's registers.
push ebp
# define CTX r8 // context pointer
# define BLKSZ r9d // block size
-# define SI rsi // source pointer
-# define DI rdi // destination pointer
-
# define KSZ edx // key size
-# define KSZo rdx // ... as address offset
# define NKW r10d // total number of key words
# define RCON rdi // round constants table
-# define LIMn ecx // limit pointer
-# define LIM rcx // ... as integer offset from base
+# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo rax // ... as address offset
# define BLKOFF r9d // block size in bytes
-# define BLKOFFo r9 // ... as address offset
// Move arguments to more useful places.
mov CTX, rdi // context base pointer
# define CTX r8 // context pointer
# define BLKSZ edx // block size
-# define SI rsi // source pointer
-# define DI rdi // destination pointer
-
# define KSZ r9d // key size
-# define KSZo r9 // ... as address offset
# define NKW r10d // total number of key words
# define RCON rdi // round constants table
-# define LIMn ecx // limit pointer
-# define LIM rcx // ... as integer offset from base
+# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo rax // ... as address offset
# define BLKOFF edx // block size in bytes
-# define BLKOFFo rdx // ... as address offset
// We'll need the index registers, which belong to the caller in this
// ABI.
push rsi
+ .seh_pushreg rsi
push rdi
+ .seh_pushreg rdi
+ .seh_endprologue
// Move arguments to more useful places.
- mov SI, r8 // key material
+ mov rsi, r8 // key material
mov CTX, rcx // context base pointer
#endif
#if CPUFAM_AMD64 && ABI_SYSV
// We've been lucky. We already have a copy of the context pointer
// in rdi, and the key size in ecx.
- add DI, w
+ add rdi, w
#else
lea DI, [CTX + w]
mov ecx, KSZ
#if !NKW_NEEDS_REFRESH
// If we can't keep NKW for later, then we use the same register for
// it and LIM, so this move is unnecessary.
- mov LIMn, NKW
+ mov DWORD(LIM), NKW
#endif
- sub LIMn, KSZ // offset by the key size
+ sub DWORD(LIM), KSZ // offset by the key size
// Find the round constants.
- ldgot ecx
- leaext RCON, rijndael_rcon, ecx
+ ldgot WHOLE(c)
+ leaext RCON, F(rijndael_rcon), WHOLE(c)
// Prepare for the main loop.
lea SI, [CTX + w]
- mov eax, [SI + 4*KSZo - 4] // most recent key word
+ mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
+ xor CYIX, CYIX // start of new cycle
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
// as an immediate, so it's kind of annoying if you're not
// open-coding the whole thing. It's much easier to leave that as
// zero and XOR in the round constant by hand.
-9: movd xmm0, eax
- pshufd xmm0, xmm0, ROTR
+0: cmp CYIX, 0 // first word of the cycle?
+ je 1f
+ cmp CYIX, 4 // fourth word of the cycle?
+ jne 2f
+ cmp KSZ, 7 // and a large key?
+ jb 2f
+
+ // Fourth word of the cycle, and seven or eight words of key. Do a
+ // byte substitution.
+ movd xmm0, eax
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, ROTL
movd eax, xmm1
- xor eax, [SI]
- xor al, [RCON]
- inc RCON
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // The next three words are simple...
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 2...)
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
+ jmp 2f
- // (Word 3...)
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // Word 4. If the key is /more/ than 6 words long, then we must
- // apply a substitution here.
- cmp KSZ, 5
- jb 9b
- cmp KSZ, 7
- jb 0f
- movd xmm0, eax
- pshufd xmm0, xmm0, ROTL
+ // First word of the cycle. This is the complicated piece.
+1: movd xmm0, eax
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
-0: xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 5...)
- cmp KSZ, 6
- jb 9b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
-
- // (Word 6...)
- cmp KSZ, 7
- jb 9b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 8f
+ xor al, [RCON]
+ inc RCON
- // (Word 7...)
- cmp KSZ, 8
- jb 9b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
+ // Common tail. Mix in the corresponding word from the previous
+ // cycle and prepare for the next loop.
+2: xor eax, [SI]
+ mov [SI + 4*WHOLE(KSZ)], eax
add SI, 4
+ inc CYIX
cmp SI, LIM
- jae 8f
-
- // Must be done by now.
- jmp 9b
+ jae 9f
+ cmp CYIX, KSZ
+ jb 0b
+ xor CYIX, CYIX
+ jmp 0b
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two SSE registers.
-8: mov NR, [CTX + nr] // number of rounds
+9: mov NR, [CTX + nr] // number of rounds
#if NKW_NEEDS_REFRESH
mov BLKOFF, BLKSZ
mov LRK, NR
sub LRK, BLKSZ
#endif
lea DI, [CTX + wi]
- lea SI, [CTX + w + 4*LRKo] // last round's keys
+ lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
shl BLKOFF, 2 // block size (in bytes now)
// Copy the last encryption round's keys.
movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9f
+ jbe 0f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// Update the loop variables and stop if we've finished.
-9: add DI, BLKOFFo
- sub SI, BLKOFFo
+0: add DI, WHOLE(BLKOFF)
+ sub SI, WHOLE(BLKOFF)
sub NR, 1
- jbe 0f
+ jbe 9f
// Do another middle round's keys...
movdqu xmm0, [SI]
aesimc xmm0, xmm0
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 9b
+ jbe 0b
movdqu xmm0, [SI + 16]
aesimc xmm0, xmm0
movdqu [DI + 16], xmm0
- jmp 9b
+ jmp 0b
// Finally do the first encryption round.
-0: movdqu xmm0, [SI]
+9: movdqu xmm0, [SI]
movdqu [DI], xmm0
cmp BLKOFF, 16
- jbe 0f
+ jbe 1f
movdqu xmm0, [SI + 16]
movdqu [DI + 16], xmm0
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy SSE toys for this.
-0: cmp BLKOFF, 16
- je 0f
+1: cmp BLKOFF, 16
+ je 9f
// Find the byte-reordering table.
ldgot ecx
#endif
// End-swap the encryption keys.
- mov ecx, NKW
lea SI, [CTX + w]
call endswap_block
// And the decryption keys.
- mov ecx, NKW
lea SI, [CTX + wi]
call endswap_block
-0: // All done.
+9: // All done.
#if CPUFAM_X86
pop edi
pop esi
#endif
ret
- .align 16
-endswap_block:
- // End-swap ECX words starting at SI. The end-swapping table is
+ENDFUNC
+
+INTFUNC(endswap_block)
+ // End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
- movdqu xmm1, [SI]
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
+ mov ecx, NKW
+0: movdqu xmm1, [SI]
pshufb xmm1, xmm5
movdqu [SI], xmm1
add SI, 16
sub ecx, 4
- ja endswap_block
+ ja 0b
+
ret
+ENDFUNC
+
#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
-#undef KSZo
#undef RCON
-#undef LIMn
#undef LIM
#undef NR
#undef LRK
-#undef LRKo
#undef BLKOFF
-#undef BLKOFFo
-
-ENDFUNC
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
- .macro encdec op, aes, koff
-FUNC(rijndael_\op\()_x86ish_aesni)
-
- // Find the magic endianness-swapping table.
- ldgot ecx
- movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+.macro encdec op, aes, koff
+ FUNC(rijndael_\op\()_x86ish_aesni)
#if CPUFAM_X86
// Arguments come in on the stack, and need to be collected. We
// don't have a shortage of registers.
-# define K ecx
+# define K eax
# define SRC edx
# define DST edx
-# define NR eax
+# define NR ecx
mov K, [esp + 4]
mov SRC, [esp + 8]
# define SRC rdx
# define DST r8
# define NR eax
+ .seh_endprologue
#endif
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+
// Initial setup.
movdqu xmm0, [SRC]
pshufb xmm0, xmm5
movdqu xmm1, [K]
add K, 16
pxor xmm0, xmm1
+#if CPUFAM_X86
+ mov DST, [esp + 12]
+#endif
// Dispatch to the correct code.
cmp NR, 10
// Unpermute the ciphertext block and store it.
pshufb xmm0, xmm5
-#if CPUFAM_X86
- mov DST, [esp + 12]
-#endif
movdqu [DST], xmm0
// And we're done.
#undef DST
#undef NR
-ENDFUNC
- .endm
+ ENDFUNC
+.endm
encdec eblk, aesenc, w
encdec dblk, aesdec, wi
///--------------------------------------------------------------------------
/// Random utilities.
- .align 16
+INTFUNC(bogus)
// Abort the process because of a programming error. Indirecting
// through this point serves several purposes: (a) by CALLing, rather
// than branching to, `abort', we can save the return address, which
// might at least provide a hint as to what went wrong; (b) we don't
// have conditional CALLs (and they'd be big anyway); and (c) we can
// write a HLT here as a backstop against `abort' being mad.
-bogus: callext F(abort)
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
+ callext F(abort)
0: hlt
jmp 0b
- gotaux ecx
+ENDFUNC
///--------------------------------------------------------------------------
/// Data tables.
+ RODATA
+
.align 16
endswap_tab:
.byte 3, 2, 1, 0