+### -*- mode: asm; asm-comment-char: ?# -*-
+###
+### AESNI-based implementation of Rijndael
+###
+### (c) 2015 Straylight/Edgeware
+###
+
+###----- Licensing notice ---------------------------------------------------
+###
+### This file is part of Catacomb.
+###
+### Catacomb is free software; you can redistribute it and/or modify
+### it under the terms of the GNU Library General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### Catacomb is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU Library General Public License for more details.
+###
+### You should have received a copy of the GNU Library General Public
+### License along with Catacomb; if not, write to the Free
+### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+### MA 02111-1307, USA.
+
+ .intel_syntax noprefix
+ .arch .aes
+
+ .globl abort
+ .globl rijndael_rcon
+
+ .section .text
+
+### The AESNI instructions implement a little-endian version of AES, but
+### Catacomb's internal interface presents as big-endian so as to work better
+### with things like GCM. We therefore maintain the round keys in
+### little-endian form, and have to end-swap blocks in and out.
+###
+### For added amusement, the AESNI instructions don't implement the
+### larger-block versions of Rijndael, so we have to end-swap the keys if
+### we're preparing for one of those.
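+###
+### For example, the big-endian word 0x01234567 is stored as the bytes
+### 01 23 45 67, but the AESNI instructions want to find them in the
+### order 67 45 23 01.  The `endswap_tab' table at the bottom of this
+### file drives a PSHUFB which swaps each word over.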
+
+ ## Useful constants.
+ .equ maxrounds, 16 # maximum number of rounds
+ .equ maxblksz, 32 # maximum block size, in bytes
+ .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer
+
+ ## Context structure.
+ .equ nr, 0 # number of rounds
+ .equ w, nr + 4 # encryption key words
+ .equ wi, w + kbufsz # decryption key words
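+
+	## A rough C picture of this layout, for orientation (the field
+	## names here are illustrative rather than authoritative):
+	##
+	##	struct ctx {
+	##		uint32 nr;		/* number of rounds */
+	##		uint32 w[kbufsz/4];	/* encryption round keys */
+	##		uint32 wi[kbufsz/4];	/* decryption round keys */
+	##	};
+	##
+	## With maxrounds = 16 and maxblksz = 32, each key buffer is
+	## 32*17 = 544 bytes long.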
+
+###--------------------------------------------------------------------------
+### Key setup.
+
+ .globl rijndael_setup_x86_aesni
+ .type rijndael_setup_x86_aesni, STT_FUNC
+ .align 16
+rijndael_setup_x86_aesni:
+
+ ## Initial state. We have four arguments:
+ ## [esp + 20] is the context pointer
+ ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+ ## [esp + 28] points to the key material, unaligned
+ ## [esp + 32] is the size of the key, in words
+ ## The key size has already been checked for validity, and the number
+ ## of rounds has been computed. Our job is only to fill in the `w'
+ ## and `wi' vectors.
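+	##
+	## (Loosely, the C signature assumed here is something like
+	##	void rijndael_setup_x86_aesni(ctx *k, unsigned blksz,
+	##			const void *key, unsigned sz);
+	## the authoritative prototype lives in the C headers.)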
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ ## The initial round key material is taken directly from the input
+ ## key, so copy it over.
+ mov ebp, [esp + 20] # context base pointer
+ mov ebx, [esp + 32] # key size, in words
+ mov ecx, ebx
+ mov esi, [esp + 28]
+ lea edi, [ebp + w]
+ rep movsd
+
+ ## Find out other useful things.
+ mov edx, [ebp + nr] # number of rounds
+ add edx, 1
+ imul edx, [esp + 24] # total key size in words
+ sub edx, ebx # offset by the key size
+
+ ## Find the round constants.
+ call where_am_i_ecx
+ add ecx, offset _GLOBAL_OFFSET_TABLE_
+ mov ecx, [ecx + rijndael_rcon@GOT]
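+	## (This is the usual i386 PIC dance: the thunk returns the address
+	## of the ADD instruction, the `_GLOBAL_OFFSET_TABLE_' relocation
+	## turns that into the GOT base, and the @GOT load fetches the
+	## address of the table itself.)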
+
+ ## Prepare for the main loop.
+ lea esi, [ebp + w]
+ mov eax, [esi + 4*ebx - 4] # most recent key word
+ lea edx, [esi + 4*edx] # limit, offset by one key expansion
+
+ ## Main key expansion loop. The first word of each key-length chunk
+ ## needs special treatment.
+ ##
+ ## This is rather tedious because the Intel `AESKEYGENASSIST'
+ ## instruction is very strangely shaped. Firstly, it wants to
+ ## operate on vast SSE registers, even though we're data-blocked from
+	## doing more than one operation at a time unless we're doing two key
+ ## schedules simultaneously -- and even then we can't do more than
+ ## two, because the instruction ignores two of its input words
+ ## entirely, and produces two different outputs for each of the other
+ ## two. And secondly it insists on taking the magic round constant
+ ## as an immediate, so it's kind of annoying if you're not
+ ## open-coding the whole thing. It's much easier to leave that as
+ ## zero and XOR in the round constant by hand.
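+	##
+	## For reference, `aeskeygenassist dst, src, RC' computes, with
+	## words numbered from the low end:
+	##
+	##	dst[0] = SubWord(src[1])
+	##	dst[1] = RotWord(SubWord(src[1])) ^ RC
+	##	dst[2] = SubWord(src[3])
+	##	dst[3] = RotWord(SubWord(src[3])) ^ RC
+	##
+	## and what the loop below implements is the standard FIPS-197
+	## expansion -- roughly, in C-flavoured pseudocode, with nk being
+	## the key length in words:
+	##
+	##	for (i = nk; i < (nr + 1)*blksz; i++) {
+	##		t = w[i - 1];
+	##		if (i%nk == 0) t = SubWord(RotWord(t)) ^ rcon[i/nk];
+	##		else if (nk > 6 && i%nk == 4) t = SubWord(t);
+	##		w[i] = w[i - nk] ^ t;
+	##	}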
+9: movd xmm0, eax
+ pshufd xmm0, xmm0, 0x39
+ aeskeygenassist xmm1, xmm0, 0
+ pshufd xmm1, xmm1, 0x93
+ movd eax, xmm1
+ xor eax, [esi]
+ xor al, [ecx]
+ inc ecx
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## The next three words are simple...
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## (Word 2...)
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## (Word 3...)
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## Word 4. If the key is /more/ than 6 words long, then we must
+ ## apply a substitution here.
+ cmp ebx, 5
+ jb 9b
+ cmp ebx, 7
+ jb 0f
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0x93
+ aeskeygenassist xmm1, xmm0, 0
+ movd eax, xmm1
+0: xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## (Word 5...)
+ cmp ebx, 6
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## (Word 6...)
+ cmp ebx, 7
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ ## (Word 7...)
+ cmp ebx, 8
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+	## That was the last word of the chunk: go back round for the next.
+ jmp 9b
+
+ ## Next job is to construct the decryption keys. The keys for the
+ ## first and last rounds don't need to be mangled, but the remaining
+ ## ones do -- and they all need to be reordered too.
+ ##
+ ## The plan of action, then, is to copy the final encryption round's
+ ## keys into place first, then to do each of the intermediate rounds
+ ## in reverse order, and finally do the first round.
+ ##
+ ## Do all of the heavy lifting with SSE registers. The order we're
+ ## doing this in means that it's OK if we read or write too much, and
+ ## there's easily enough buffer space for the over-enthusiastic reads
+ ## and writes because the context has space for 32-byte blocks, which
+ ## is our maximum and an exact fit for two SSE registers.
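+	##
+	## In FIPS-197 terms, this constructs the `equivalent inverse
+	## cipher' key schedule; schematically, one block-sized chunk at a
+	## time:
+	##
+	##	wi[0] = w[nr];
+	##	for (i = 1; i < nr; i++) wi[i] = InvMixColumns(w[nr - i]);
+	##	wi[nr] = w[0];
+	##
+	## with AESIMC providing the InvMixColumns step.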
+8: mov ecx, [ebp + nr] # number of rounds
+ mov ebx, [esp + 24] # block size (in words)
+ mov edx, ecx
+ imul edx, ebx
+ lea edi, [ebp + wi]
+ lea esi, [ebp + 4*edx + w] # last round's keys
+ shl ebx, 2 # block size (in bytes now)
+
+ ## Copy the last encryption round's keys.
+ movdqu xmm0, [esi]
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 9f
+ movdqu xmm0, [esi + 16]
+ movdqu [edi + 16], xmm0
+
+ ## Update the loop variables and stop if we've finished.
+9: add edi, ebx
+ sub esi, ebx
+ sub ecx, 1
+ jbe 0f
+
+ ## Do another middle round's keys...
+ movdqu xmm0, [esi]
+ aesimc xmm0, xmm0
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 9b
+ movdqu xmm0, [esi + 16]
+ aesimc xmm0, xmm0
+ movdqu [edi + 16], xmm0
+ jmp 9b
+
+ ## Finally do the first encryption round.
+0: movdqu xmm0, [esi]
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 0f
+ movdqu xmm0, [esi + 16]
+ movdqu [edi + 16], xmm0
+
+ ## If the block size is not exactly four words then we must end-swap
+ ## everything. We can use fancy SSE toys for this.
+0: cmp ebx, 16
+ je 0f
+
+ ## Find the byte-reordering table.
+ call where_am_i_ecx
+ movdqa xmm7, [ecx + endswap_tab - .]
+
+ ## Calculate the number of subkey words again. (It's a good job
+ ## we've got a fast multiplier.)
+ mov ecx, [ebp + nr]
+ add ecx, 1
+	imul ecx, [esp + 24] # total key size, in words
+
+ ## End-swap the encryption keys.
+ mov eax, ecx
+ lea esi, [ebp + w]
+ call endswap_block
+
+ ## And the decryption keys.
+ mov ecx, eax
+ lea esi, [ebp + wi]
+ call endswap_block
+
+ ## All done.
+0: pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+ .align 16
+endswap_block:
+ ## End-swap ECX words starting at ESI. The end-swapping table is
+ ## already loaded into XMM7; and it's OK to work in 16-byte chunks.
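+	##
+	## (Roughly: for (i = 0; i < ecx; i++) w[i] = swap_bytes(w[i]);
+	## rounded up to a whole number of four-word chunks.)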
+ movdqu xmm1, [esi]
+ pshufb xmm1, xmm7
+ movdqu [esi], xmm1
+ add esi, 16
+ sub ecx, 4
+ ja endswap_block
+ ret
+
+ .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni
+
+###--------------------------------------------------------------------------
+### Encrypting and decrypting blocks.
+
+ .globl rijndael_eblk_x86_aesni
+ .type rijndael_eblk_x86_aesni, STT_FUNC
+ .align 16
+rijndael_eblk_x86_aesni:
+
+ ## On entry, we have:
+ ## [esp + 4] points to the context block
+ ## [esp + 8] points to the input data block
+ ## [esp + 12] points to the output buffer
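+	##
+	## (Loosely, the C signature assumed is something like
+	##	void rijndael_eblk_x86_aesni(const ctx *k,
+	##			const void *in, void *out);
+	## as before, the authoritative prototype is in the C headers.)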
+
+ ## Find the magic endianness-swapping table.
+ call where_am_i_ecx
+ movdqa xmm7, [ecx + endswap_tab - .]
+
+ ## Load the input block and end-swap it. Also, start loading the
+ ## keys.
+ mov eax, [esp + 8]
+ movdqu xmm0, [eax]
+ pshufb xmm0, xmm7
+ mov eax, [esp + 4]
+ lea edx, [eax + w]
+ mov eax, [eax + nr]
+
+ ## Initial whitening.
+ movdqu xmm1, [edx]
+ add edx, 16
+ pxor xmm0, xmm1
+
+ ## Dispatch to the correct code.
+ cmp eax, 10
+ je er10
+ jb bogus
+ cmp eax, 14
+ je er14
+ ja bogus
+ cmp eax, 12
+ je er12
+ jb er11
+ jmp er13
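+
+	## The `erNN' blocks below fall straight through into one another,
+	## so entering at `erNN' executes NN - 1 AESENCs followed by the
+	## final AESENCLAST -- NN rounds in all, on top of the whitening
+	## already done.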
+
+ .align 2
+
+ ## 14 rounds...
+er14: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ ## 13 rounds...
+er13: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ ## 12 rounds...
+er12: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ ## 11 rounds...
+er11: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ ## 10 rounds...
+er10: movdqu xmm1, [edx]
+ aesenc xmm0, xmm1
+
+ ## 9 rounds...
+ movdqu xmm1, [edx + 16]
+ aesenc xmm0, xmm1
+
+ ## 8 rounds...
+ movdqu xmm1, [edx + 32]
+ aesenc xmm0, xmm1
+
+ ## 7 rounds...
+ movdqu xmm1, [edx + 48]
+ aesenc xmm0, xmm1
+
+ ## 6 rounds...
+ movdqu xmm1, [edx + 64]
+ aesenc xmm0, xmm1
+
+ ## 5 rounds...
+ movdqu xmm1, [edx + 80]
+ aesenc xmm0, xmm1
+
+ ## 4 rounds...
+ movdqu xmm1, [edx + 96]
+ aesenc xmm0, xmm1
+
+ ## 3 rounds...
+ movdqu xmm1, [edx + 112]
+ aesenc xmm0, xmm1
+
+ ## 2 rounds...
+ movdqu xmm1, [edx + 128]
+ aesenc xmm0, xmm1
+
+ ## Final round...
+ movdqu xmm1, [edx + 144]
+ aesenclast xmm0, xmm1
+
+ ## Unpermute the ciphertext block and store it.
+ pshufb xmm0, xmm7
+ mov eax, [esp + 12]
+ movdqu [eax], xmm0
+
+ ## And we're done.
+ ret
+
+	.size rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni
+
+ .globl rijndael_dblk_x86_aesni
+ .type rijndael_dblk_x86_aesni, STT_FUNC
+ .align 16
+rijndael_dblk_x86_aesni:
+
+ ## On entry, we have:
+ ## [esp + 4] points to the context block
+ ## [esp + 8] points to the input data block
+ ## [esp + 12] points to the output buffer
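+	##
+	## (Same calling convention as `rijndael_eblk_x86_aesni' above;
+	## this entry point uses the `wi' keys and AESDEC/AESDECLAST.)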
+
+ ## Find the magic endianness-swapping table.
+ call where_am_i_ecx
+ movdqa xmm7, [ecx + endswap_tab - .]
+
+ ## Load the input block and end-swap it. Also, start loading the
+ ## keys.
+ mov eax, [esp + 8]
+ movdqu xmm0, [eax]
+ pshufb xmm0, xmm7
+ mov eax, [esp + 4]
+ lea edx, [eax + wi]
+ mov eax, [eax + nr]
+
+ ## Initial whitening.
+ movdqu xmm1, [edx]
+ add edx, 16
+ pxor xmm0, xmm1
+
+ ## Dispatch to the correct code.
+ cmp eax, 10
+ je dr10
+ jb bogus
+ cmp eax, 14
+ je dr14
+ ja bogus
+ cmp eax, 12
+ je dr12
+ jb dr11
+ jmp dr13
+
+ .align 2
+
+ ## 14 rounds...
+dr14: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ ## 13 rounds...
+dr13: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ ## 12 rounds...
+dr12: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ ## 11 rounds...
+dr11: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ ## 10 rounds...
+dr10: movdqu xmm1, [edx]
+ aesdec xmm0, xmm1
+
+ ## 9 rounds...
+ movdqu xmm1, [edx + 16]
+ aesdec xmm0, xmm1
+
+ ## 8 rounds...
+ movdqu xmm1, [edx + 32]
+ aesdec xmm0, xmm1
+
+ ## 7 rounds...
+ movdqu xmm1, [edx + 48]
+ aesdec xmm0, xmm1
+
+ ## 6 rounds...
+ movdqu xmm1, [edx + 64]
+ aesdec xmm0, xmm1
+
+ ## 5 rounds...
+ movdqu xmm1, [edx + 80]
+ aesdec xmm0, xmm1
+
+ ## 4 rounds...
+ movdqu xmm1, [edx + 96]
+ aesdec xmm0, xmm1
+
+ ## 3 rounds...
+ movdqu xmm1, [edx + 112]
+ aesdec xmm0, xmm1
+
+ ## 2 rounds...
+ movdqu xmm1, [edx + 128]
+ aesdec xmm0, xmm1
+
+ ## Final round...
+ movdqu xmm1, [edx + 144]
+ aesdeclast xmm0, xmm1
+
+	## Unpermute the plaintext block and store it.
+ pshufb xmm0, xmm7
+ mov eax, [esp + 12]
+ movdqu [eax], xmm0
+
+ ## And we're done.
+ ret
+
+ .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
+
+###--------------------------------------------------------------------------
+### Random utilities.
+
+ .align 16
+ ## Abort the process because of a programming error. Indirecting
+ ## through this point serves several purposes: (a) by CALLing, rather
+ ## than branching to, `abort', we can save the return address, which
+ ## might at least provide a hint as to what went wrong; (b) we don't
+ ## have conditional CALLs (and they'd be big anyway); and (c) we can
+ ## write a HLT here as a backstop against `abort' being mad.
+bogus: call abort@PLT
+0: hlt
+ jmp 0b
+
+ .align 16
+ ## Return the address of the instruction following the CALL here in
+ ## ECX. This is useful for doing position-independent addressing.
+where_am_i_ecx:
+ mov ecx, [esp]
+ ret
+
+###--------------------------------------------------------------------------
+### Data tables.
+
+ .align 16
+endswap_tab:
+ .byte 3, 2, 1, 0
+ .byte 7, 6, 5, 4
+ .byte 11, 10, 9, 8
+ .byte 15, 14, 13, 12
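+
+	## (Under PSHUFB, output byte i is taken from input byte tab[i], so
+	## this table reverses the bytes within each 32-bit word, mapping
+	## big-endian words to little-endian ones and back again.)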
+
+###----- That's all, folks --------------------------------------------------