1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// AESNI-based implementation of Rijndael
5 /// (c) 2015 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
31 #include "asm-common.h"
33 ///--------------------------------------------------------------------------
34 /// External definitions.
37 .globl F(rijndael_rcon)
39 ///--------------------------------------------------------------------------
42 // Magic constants for shuffling.
47 ///--------------------------------------------------------------------------
53 /// The AESNI instructions implement a little-endian version of AES, but
54 /// Catacomb's internal interface presents as big-endian so as to work better
55 /// with things like GCM. We therefore maintain the round keys in
56 /// little-endian form, and have to end-swap blocks in and out.
58 /// For added amusement, the AESNI instructions don't implement the
59 /// larger-block versions of Rijndael, so we have to end-swap the keys if
60 /// we're preparing for one of those.
// Sizing limits for the key schedule. With the values given here,
// kbufsz works out as 32*(16 + 1) = 544 bytes.
63 .equ maxrounds, 16 // maximum number of rounds
64 .equ maxblksz, 32 // maximum block size, in bytes
65 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
// Byte offsets of the fields within the context, as used in operands
// such as `[CTX + nr]' and `[CTX + w + ...]': nr = 0, w = 4, and
// wi = 4 + 544 = 548.
68 .equ nr, 0 // number of rounds
69 .equ w, nr + 4 // encryption key words
70 .equ wi, w + kbufsz // decryption key words
72 ///--------------------------------------------------------------------------
75 FUNC(rijndael_setup_x86ish_aesni)
// Key-schedule setup. Judging by the argument moves below, the
// arguments are, in order: the context pointer, the block size in
// words, the key material, and the key size in words. The expanded
// encryption key words are addressed at offset `w' from the context
// pointer; the decryption key words are constructed afterwards.
//
// Each ABI variant below binds the same set of symbolic names (CTX,
// SI, DI, KSZ, NKW, RCON, LIM, NR, LRK, BLKOFF, ...) to registers, so
// the code which follows is written in terms of those names. Note
// that several names share one physical register within a variant
// (e.g., NKW, LIM and BLKOFF are all edx in the first variant).
78 // Arguments are on the stack. We'll need to stack the caller's
79 // register variables, but we'll manage.
81 # define CTX ebp // context pointer
82 # define BLKSZ [esp + 24] // block size
84 # define SI esi // source pointer
85 # define DI edi // destination pointer
87 # define KSZ ebx // key size
88 # define KSZo ebx // ... as address offset
// NKW and LIM are both edx in this variant, so the key-word count has
// to be recalculated when it's wanted again; NKW_NEEDS_REFRESH records
// this for the conditional code further down.
89 # define NKW edx // total number of key words
90 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
91 # define RCON ecx // round constants table
92 # define LIM edx // limit pointer
93 # define LIMn edx // ... as integer offset from base
95 # define NR ecx // number of rounds
96 # define LRK eax // distance to last key
97 # define LRKo eax // ... as address offset
98 # define BLKOFF edx // block size in bytes
99 # define BLKOFFo edx // ... as address offset
101 // Stack the caller's registers.
107 // Set up our own variables.
108 mov CTX, [esp + 20] // context base pointer
109 mov SI, [esp + 28] // key material
110 mov KSZ, [esp + 32] // key size, in words
113 #if CPUFAM_AMD64 && ABI_SYSV
114 // Arguments are in registers. We have plenty, but, to be honest,
115 // the initial register allocation is a bit annoying.
117 # define CTX r8 // context pointer
118 # define BLKSZ r9d // block size
120 # define SI rsi // source pointer
121 # define DI rdi // destination pointer
123 # define KSZ edx // key size
124 # define KSZo rdx // ... as address offset
125 # define NKW r10d // total number of key words
126 # define RCON rdi // round constants table
127 # define LIMn ecx // limit, as integer offset from base
128 # define LIM rcx // ... as pointer
130 # define NR ecx // number of rounds
131 # define LRK eax // distance to last key
132 # define LRKo rax // ... as address offset
133 # define BLKOFF r9d // block size in bytes
134 # define BLKOFFo r9 // ... as address offset
136 // Move arguments to more useful places.
137 mov CTX, rdi // context base pointer
138 mov BLKSZ, esi // block size in words
139 mov SI, rdx // key material
140 mov KSZ, ecx // key size, in words
143 #if CPUFAM_AMD64 && ABI_WIN
144 // Arguments are in different registers, and they're a little tight.
146 # define CTX r8 // context pointer
147 # define BLKSZ edx // block size
149 # define SI rsi // source pointer
150 # define DI rdi // destination pointer
152 # define KSZ r9d // key size
153 # define KSZo r9 // ... as address offset
154 # define NKW r10d // total number of key words
155 # define RCON rdi // round constants table
156 # define LIMn ecx // limit, as integer offset from base
157 # define LIM rcx // ... as pointer
159 # define NR ecx // number of rounds
160 # define LRK eax // distance to last key
161 # define LRKo rax // ... as address offset
162 # define BLKOFF edx // block size in bytes
163 # define BLKOFFo rdx // ... as address offset
165 // We'll need the index registers, which belong to the caller in this
170 // Move arguments to more useful places.
171 mov SI, r8 // key material
172 mov CTX, rcx // context base pointer
175 // The initial round key material is taken directly from the input
176 // key, so copy it over.
177 #if CPUFAM_AMD64 && ABI_SYSV
178 // We've been lucky. We already have a copy of the context pointer
179 // in rdi, and the key size in ecx.
187 // Find out other useful things.
188 mov NKW, [CTX + nr] // number of rounds
190 imul NKW, BLKSZ // total key size in words
191 #if !NKW_NEEDS_REFRESH
192 // If we can't keep NKW for later, then we use the same register for
193 // it and LIM, so this move is unnecessary.
196 sub LIMn, KSZ // offset by the key size
198 // Find the round constants.
200 leaext RCON, rijndael_rcon, ecx
202 // Prepare for the main loop.
204 mov eax, [SI + 4*KSZo - 4] // most recent key word
205 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
207 // Main key expansion loop. The first word of each key-length chunk
208 // needs special treatment.
210 // This is rather tedious because the Intel `AESKEYGENASSIST'
211 // instruction is very strangely shaped. Firstly, it wants to
212 // operate on vast SSE registers, even though we're data-blocked from
213 // doing more than one operation at a time unless we're doing two key
214 // schedules simultaneously -- and even then we can't do more than
215 // two, because the instruction ignores two of its input words
216 // entirely, and produces two different outputs for each of the other
217 // two. And secondly it insists on taking the magic round constant
218 // as an immediate, so it's kind of annoying if you're not
219 // open-coding the whole thing. It's much easier to leave that as
220 // zero and XOR in the round constant by hand.
222 pshufd xmm0, xmm0, ROTR
223 aeskeygenassist xmm1, xmm0, 0
224 pshufd xmm1, xmm1, ROTL
229 mov [SI + 4*KSZo], eax
234 // The next three words are simple...
236 mov [SI + 4*KSZo], eax
243 mov [SI + 4*KSZo], eax
250 mov [SI + 4*KSZo], eax
255 // Word 4. If the key is /more/ than 6 words long, then we must
256 // apply a substitution here.
262 pshufd xmm0, xmm0, ROTL
263 aeskeygenassist xmm1, xmm0, 0
266 mov [SI + 4*KSZo], eax
275 mov [SI + 4*KSZo], eax
284 mov [SI + 4*KSZo], eax
293 mov [SI + 4*KSZo], eax
298 // Must be done by now.
301 // Next job is to construct the decryption keys. The keys for the
302 // first and last rounds don't need to be mangled, but the remaining
303 // ones do -- and they all need to be reordered too.
305 // The plan of action, then, is to copy the final encryption round's
306 // keys into place first, then to do each of the intermediate rounds
307 // in reverse order, and finally do the first round.
309 // Do all of the heavy lifting with SSE registers. The order we're
310 // doing this in means that it's OK if we read or write too much, and
311 // there's easily enough buffer space for the over-enthusiastic reads
312 // and writes because the context has space for 32-byte blocks, which
313 // is our maximum and an exact fit for two SSE registers.
314 8: mov NR, [CTX + nr] // number of rounds
315 #if NKW_NEEDS_REFRESH
320 // If we retain NKW, then BLKSZ and BLKOFF are the same register
321 // because we won't need the former again.
326 lea SI, [CTX + w + 4*LRKo] // last round's keys
327 shl BLKOFF, 2 // block size (in bytes now)
329 // Copy the last encryption round's keys.
334 movdqu xmm0, [SI + 16] // second 16-byte half of the slot
335 movdqu [DI + 16], xmm0
337 // Update the loop variables and stop if we've finished.
343 // Do another middle round's keys...
349 movdqu xmm0, [SI + 16]
351 movdqu [DI + 16], xmm0
354 // Finally do the first encryption round.
359 movdqu xmm0, [SI + 16]
360 movdqu [DI + 16], xmm0
362 // If the block size is not exactly four words then we must end-swap
363 // everything. We can use fancy SSE toys for this.
367 // Find the byte-reordering table.
369 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
371 #if NKW_NEEDS_REFRESH
372 // Calculate the number of subkey words again. (It's a good job
373 // we've got a fast multiplier.)
379 // End-swap the encryption keys.
384 // And the decryption keys.
396 #if CPUFAM_AMD64 && ABI_WIN
404 // End-swap ECX words starting at SI. The end-swapping table is
405 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
431 ///--------------------------------------------------------------------------
432 /// Encrypting and decrypting blocks.
434 .macro encdec op, aes, koff
// Define the block function `rijndael_<op>_x86ish_aesni'.
//
//   op    suffix spliced into the function name;
//   aes   stem of the AES round instruction: the final round is
//         issued as `<aes>last';
//   koff  presumably the offset of the key words within the context
//         (the instantiations pass `w' and `wi') -- TODO confirm.
435 FUNC(rijndael_\op\()_x86ish_aesni)
437 // Find the magic endianness-swapping table.
439 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
442 // Arguments come in on the stack, and need to be collected. We
443 // don't have a shortage of registers.
454 #if CPUFAM_AMD64 && ABI_SYSV
455 // Arguments come in registers. All is good.
463 #if CPUFAM_AMD64 && ABI_WIN
464 // Arguments come in different registers.
478 // Initial whitening.
483 // Dispatch to the correct code.
// Round keys are fetched into xmm1 from successive 16-byte slots
// relative to K, from K + 16 up to K + 144.
522 movdqu xmm1, [K + 16]
526 movdqu xmm1, [K + 32]
530 movdqu xmm1, [K + 48]
534 movdqu xmm1, [K + 64]
538 movdqu xmm1, [K + 80]
542 movdqu xmm1, [K + 96]
546 movdqu xmm1, [K + 112]
550 movdqu xmm1, [K + 128]
554 movdqu xmm1, [K + 144]
// The key at K + 144 is applied to xmm0 by the final-round instruction.
555 \aes\()last xmm0, xmm1
557 // Unpermute the ciphertext block and store it.
// Instantiate the macro twice: `eblk' with aesenc and offset `w', and
// `dblk' with aesdec and offset `wi'.
575 encdec eblk, aesenc, w
576 encdec dblk, aesdec, wi
578 ///--------------------------------------------------------------------------
579 /// Random utilities.
582 // Abort the process because of a programming error. Indirecting
583 // through this point serves several purposes: (a) by CALLing, rather
584 // than branching to, `abort', we can save the return address, which
585 // might at least provide a hint as to what went wrong; (b) we don't
586 // have conditional CALLs (and they'd be big anyway); and (c) we can
587 // write a HLT here as a backstop against `abort' being mad.
588 bogus: callext F(abort) // CALL, not branch: keeps a return address
594 ///--------------------------------------------------------------------------
604 ///----- That's all, folks --------------------------------------------------