-FUNC(rijndael_dblk_x86_aesni)
- // Decrypt one 16-byte block using AES-NI (aesdec/aesdeclast).
- // Registers written here: eax, edx, xmm0, xmm1, xmm5. On entry, we have:
- // [esp + 4] points to the context block (its `wi' and `nr' fields are read)
- // [esp + 8] points to the input data block (16 bytes are read)
- // [esp + 12] points to the output buffer (16 bytes are written)
-
- // Find the magic endianness-swapping table; it stays in xmm5 throughout.
- ldgot ecx // presumably leaves a base address in ecx (macro defined elsewhere)
- movdqa xmm5, [INTADDR(endswap_tab, ecx)] // aligned load: needs a 16-byte-aligned table
-
- // Load the input block (unaligned load) and end-swap it. Then fetch
- // the round-key pointer and the round count from the context.
- mov eax, [esp + 8]
- movdqu xmm0, [eax]
- pshufb xmm0, xmm5
- mov eax, [esp + 4]
- lea edx, [eax + wi] // edx = address of the first round key used
- mov eax, [eax + nr] // eax = number of rounds
-
- // Initial whitening: XOR in the first key, then step edx past it.
- movdqu xmm1, [edx]
- add edx, 16
- pxor xmm0, xmm1
-
- // Dispatch on the round count: 10..14 enter the chain below.
- cmp eax, 10
- je dr10
- jb bogus // fewer than 10 rounds: rejected (`bogus' is defined elsewhere)
- cmp eax, 14
- je dr14
- ja bogus // more than 14 rounds: rejected
- cmp eax, 12
- je dr12
- jb dr11 // only 11 remains below 12
- jmp dr13 // only 13 remains above 12
-
- .align 2
-
- // 14 rounds to go: each entry below uses one key and falls through.
-dr14: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- // 13 rounds to go...
-dr13: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- // 12 rounds to go...
-dr12: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- // 11 rounds to go...
-dr11: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- // 10 rounds to go: edx stays fixed from here; keys are at [edx + 16*k].
-dr10: movdqu xmm1, [edx]
- aesdec xmm0, xmm1
-
- // 9 rounds to go...
- movdqu xmm1, [edx + 16]
- aesdec xmm0, xmm1
-
- // 8 rounds to go...
- movdqu xmm1, [edx + 32]
- aesdec xmm0, xmm1
-
- // 7 rounds to go...
- movdqu xmm1, [edx + 48]
- aesdec xmm0, xmm1
-
- // 6 rounds to go...
- movdqu xmm1, [edx + 64]
- aesdec xmm0, xmm1
-
- // 5 rounds to go...
- movdqu xmm1, [edx + 80]
- aesdec xmm0, xmm1
-
- // 4 rounds to go...
- movdqu xmm1, [edx + 96]
- aesdec xmm0, xmm1
-
- // 3 rounds to go...
- movdqu xmm1, [edx + 112]
- aesdec xmm0, xmm1
-
- // 2 rounds to go...
- movdqu xmm1, [edx + 128]
- aesdec xmm0, xmm1
-
- // Final round: aesdeclast with the last round key.
- movdqu xmm1, [edx + 144]
- aesdeclast xmm0, xmm1
-
- // End-swap the decrypted block again and store it (unaligned store).
- pshufb xmm0, xmm5
- mov eax, [esp + 12]
- movdqu [eax], xmm0
-
- // And we're done.
- ret
-
-ENDFUNC