.globl F(abort)
.globl F(rijndael_rcon)
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
-
///--------------------------------------------------------------------------
/// Main code.
FUNC(rijndael_setup_x86ish_aesni)
+#define SI WHOLE(si)
+#define DI WHOLE(di)
+
#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.
# define CTX ebp // context pointer
# define BLKSZ [esp + 24] // block size
-# define SI esi // source pointer
-# define DI edi // destination pointer
-
# define KSZ ebx // key size
-# define KSZo ebx // ... as address offset
# define NKW edx // total number of key words
# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
# define RCON ecx // round constants table
# define LIM edx // limit pointer
-# define LIMn edx // ... as integer offset from base
# define CYIX edi // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo eax // ... as address offset
# define BLKOFF edx // block size in bytes
-# define BLKOFFo edx // ... as address offset
// Stack the caller's registers.
push ebp
# define CTX r8 // context pointer
# define BLKSZ r9d // block size
-# define SI rsi // source pointer
-# define DI rdi // destination pointer
-
# define KSZ edx // key size
-# define KSZo rdx // ... as address offset
# define NKW r10d // total number of key words
# define RCON rdi // round constants table
-# define LIMn ecx // limit pointer
-# define LIM rcx // ... as integer offset from base
+# define LIM rcx // limit pointer
# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo rax // ... as address offset
# define BLKOFF r9d // block size in bytes
-# define BLKOFFo r9 // ... as address offset
// Move arguments to more useful places.
mov CTX, rdi // context base pointer
# define CTX r8 // context pointer
# define BLKSZ edx // block size
-# define SI rsi // source pointer
-# define DI rdi // destination pointer
-
# define KSZ r9d // key size
-# define KSZo r9 // ... as address offset
# define NKW r10d // total number of key words
# define RCON rdi // round constants table
-# define LIMn ecx // limit pointer
-# define LIM rcx // ... as integer offset from base
+# define LIM rcx // limit pointer
# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
-# define LRKo rax // ... as address offset
# define BLKOFF edx // block size in bytes
-# define BLKOFFo rdx // ... as address offset
// We'll need the index registers, which belong to the caller in this
// ABI.
push rsi
+ .seh_pushreg rsi
push rdi
+ .seh_pushreg rdi
+ .seh_endprologue
// Move arguments to more useful places.
- mov SI, r8 // key material
+ mov rsi, r8 // key material
mov CTX, rcx // context base pointer
#endif
#if CPUFAM_AMD64 && ABI_SYSV
// We've been lucky. We already have a copy of the context pointer
// in rdi, and the key size in ecx.
- add DI, w
+ add rdi, w
#else
lea DI, [CTX + w]
mov ecx, KSZ
#if !NKW_NEEDS_REFRESH
// If we can't keep NKW for later, then we use the same register for
// it and LIM, so this move is unnecessary.
- mov LIMn, NKW
+ mov DWORD(LIM), NKW
#endif
- sub LIMn, KSZ // offset by the key size
+ sub DWORD(LIM), KSZ // offset by the key size
// Find the round constants.
- ldgot ecx
- leaext RCON, F(rijndael_rcon), ecx
+ ldgot WHOLE(c)
+ leaext RCON, F(rijndael_rcon), WHOLE(c)
// Prepare for the main loop.
lea SI, [CTX + w]
- mov eax, [SI + 4*KSZo - 4] // most recent key word
+ mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
xor CYIX, CYIX // start of new cycle
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, ROTL
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, ROTR
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, ROTL
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
xor al, [RCON]
inc RCON
// Common tail. Mix in the corresponding word from the previous
// cycle and prepare for the next loop.
2: xor eax, [SI]
- mov [SI + 4*KSZo], eax
+ mov [SI + 4*WHOLE(KSZ)], eax
add SI, 4
inc CYIX
cmp SI, LIM
sub LRK, BLKSZ
#endif
lea DI, [CTX + wi]
- lea SI, [CTX + w + 4*LRKo] // last round's keys
+ lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
shl BLKOFF, 2 // block size (in bytes now)
// Copy the last encryption round's keys.
movdqu [DI + 16], xmm0
// Update the loop variables and stop if we've finished.
-0: add DI, BLKOFFo
- sub SI, BLKOFFo
+0: add DI, WHOLE(BLKOFF)
+ sub SI, WHOLE(BLKOFF)
sub NR, 1
jbe 9f
#endif
ret
- .align 16
-endswap_block:
+ENDFUNC
+
+INTFUNC(endswap_block)
// End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
mov ecx, NKW
0: movdqu xmm1, [SI]
pshufb xmm1, xmm5
add SI, 16
sub ecx, 4
ja 0b
+
ret
+ENDFUNC
+
#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
-#undef KSZo
#undef RCON
-#undef LIMn
#undef LIM
#undef NR
#undef LRK
-#undef LRKo
#undef BLKOFF
-#undef BLKOFFo
-
-ENDFUNC
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni)
- // Find the magic endianness-swapping table.
- ldgot ecx
- movdqa xmm5, [INTADDR(endswap_tab, ecx)]
-
#if CPUFAM_X86
// Arguments come in on the stack, and need to be collected. We
// don't have a shortage of registers.
-# define K ecx
+# define K eax
# define SRC edx
# define DST edx
-# define NR eax
+# define NR ecx
mov K, [esp + 4]
mov SRC, [esp + 8]
# define SRC rdx
# define DST r8
# define NR eax
+ .seh_endprologue
#endif
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+
// Initial setup.
movdqu xmm0, [SRC]
pshufb xmm0, xmm5
movdqu xmm1, [K]
add K, 16
pxor xmm0, xmm1
+#if CPUFAM_X86
+ mov DST, [esp + 12]
+#endif
// Dispatch to the correct code.
cmp NR, 10
// Unpermute the ciphertext block and store it.
pshufb xmm0, xmm5
-#if CPUFAM_X86
- mov DST, [esp + 12]
-#endif
movdqu [DST], xmm0
// And we're done.
///--------------------------------------------------------------------------
/// Random utilities.
- .align 16
+INTFUNC(bogus)
// Abort the process because of a programming error. Indirecting
// through this point serves several purposes: (a) by CALLing, rather
// than branching to, `abort', we can save the return address, which
// might at least provide a hint as to what went wrong; (b) we don't
// have conditional CALLs (and they'd be big anyway); and (c) we can
// write a HLT here as a backstop against `abort' being mad.
-bogus: callext F(abort)
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
+ callext F(abort)
0: hlt
jmp 0b
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Data tables.
+ RODATA
+
.align 16
endswap_tab:
.byte 3, 2, 1, 0