Commit | Line | Data |
---|---|---|
1a0c09c4 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// AESNI-based implementation of Rijndael | |
4 | /// | |
5 | /// (c) 2015 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software; you can redistribute it and/or modify | |
13 | /// it under the terms of the GNU Library General Public License as | |
14 | /// published by the Free Software Foundation; either version 2 of the | |
15 | /// License, or (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, | |
18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | /// GNU Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb; if not, write to the Free | |
24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | /// MA 02111-1307, USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
df07f2c0 | 28 | /// Preliminaries. |
1a0c09c4 MW |
29 | |
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
df07f2c0 MW |
33 | .arch .aes |
34 | ||
81bc2bb8 MW |
35 | .extern F(abort) |
36 | .extern F(rijndael_rcon) | |
1a0c09c4 | 37 | |
df07f2c0 MW |
38 | .text |
39 | ||
1a0c09c4 MW |
40 | ///-------------------------------------------------------------------------- |
41 | /// Main code. | |
42 | ||
1a0c09c4 MW |
43 | /// The AESNI instructions implement a little-endian version of AES, but |
44 | /// Catacomb's internal interface presents as big-endian so as to work better | |
45 | /// with things like GCM. We therefore maintain the round keys in | |
46 | /// little-endian form, and have to end-swap blocks in and out. | |
47 | /// | |
48 | /// For added amusement, the AESNI instructions don't implement the | |
49 | /// larger-block versions of Rijndael, so we have to end-swap the keys if | |
50 | /// we're preparing for one of those. | |
51 | ||
52 | // Useful constants. | |
53 | .equ maxrounds, 16 // maximum number of rounds | |
54 | .equ maxblksz, 32 // maximum block size, in bytes | |
55 | .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes (maxrounds + 1 maximum-size blocks) | |
56 | ||
57 | // Context structure: byte offsets of the fields. | |
58 | .equ nr, 0 // number of rounds (read as one 32-bit word) | |
59 | .equ w, nr + 4 // encryption key words | |
60 | .equ wi, w + kbufsz // decryption key words | |
61 | ||
62 | ///-------------------------------------------------------------------------- | |
63 | /// Key setup. | |
64 | ||
b9b279b4 MW |
65 | FUNC(rijndael_setup_x86ish_aesni_avx) |
66 | vzeroupper // avoid penalty on `legacy' XMM access | |
67 | endprologue | |
68 | // and drop through into rijndael_setup_x86ish_aesni, which follows... | |
69 | ENDFUNC | |
70 | ||
0f23f75f | 71 | FUNC(rijndael_setup_x86ish_aesni) |
1a0c09c4 | 72 | |
0f23f75f MW |
73 | #if CPUFAM_X86 |
74 | // Arguments are on the stack. We'll need to stack the caller's | |
75 | // register variables, but we'll manage. | |
1a0c09c4 | 76 | |
a90d420c MW |
77 | # define CTX BP // context pointer |
78 | # define BLKSZ [SP + 24] // block size, in words | |
0f23f75f | 79 | |
0f23f75f | 80 | # define KSZ ebx // key size |
0f23f75f MW |
81 | # define NKW edx // total number of key words |
82 | # define NKW_NEEDS_REFRESH 1 // ... needs recalculating | |
83 | # define RCON ecx // round constants table | |
84 | # define LIM edx // limit pointer | |
16021451 | 85 | # define CYIX edi // index in shift-register cycle |
0f23f75f MW |
86 | |
87 | # define NR ecx // number of rounds | |
88 | # define LRK eax // distance to last key | |
0f23f75f | 89 | # define BLKOFF edx // block size in bytes |
0f23f75f MW |
90 | |
91 | // Stack the caller's registers. | |
a90d420c | 92 | pushreg BP |
0923a413 MW |
93 | pushreg ebx |
94 | pushreg esi | |
95 | pushreg edi | |
1a0c09c4 | 96 | |
0f23f75f | 97 | // Set up our own variables. |
a90d420c MW |
98 | mov CTX, [SP + 20] // context base pointer |
99 | mov SI, [SP + 28] // key material | |
100 | mov KSZ, [SP + 32] // key size, in words | |
0f23f75f MW |
101 | #endif |
102 | ||
103 | #if CPUFAM_AMD64 && ABI_SYSV | |
104 | // Arguments are in registers. We have plenty, but, to be honest, | |
105 | // the initial register allocation is a bit annoying. | |
106 | ||
107 | # define CTX r8 // context pointer | |
108 | # define BLKSZ r9d // block size, in words | |
109 | ||
0f23f75f | 110 | # define KSZ edx // key size |
0f23f75f MW |
111 | # define NKW r10d // total number of key words |
112 | # define RCON rdi // round constants table | |
43ea7558 | 113 | # define LIM rcx // limit pointer |
16021451 | 114 | # define CYIX r11d // index in shift-register cycle |
0f23f75f MW |
115 | |
116 | # define NR ecx // number of rounds | |
117 | # define LRK eax // distance to last key | |
0f23f75f | 118 | # define BLKOFF r9d // block size in bytes |
0f23f75f MW |
119 | |
120 | // Move arguments to more useful places. | |
121 | mov CTX, rdi // context base pointer | |
122 | mov BLKSZ, esi // block size in words | |
123 | mov SI, rdx // key material | |
124 | mov KSZ, ecx // key size, in words | |
125 | #endif | |
126 | ||
127 | #if CPUFAM_AMD64 && ABI_WIN | |
128 | // Arguments are in different registers, and they're a little tight. | |
129 | ||
130 | # define CTX r8 // context pointer | |
131 | # define BLKSZ edx // block size, in words | |
132 | ||
0f23f75f | 133 | # define KSZ r9d // key size |
0f23f75f MW |
134 | # define NKW r10d // total number of key words |
135 | # define RCON rdi // round constants table | |
43ea7558 | 136 | # define LIM rcx // limit pointer |
16021451 | 137 | # define CYIX r11d // index in shift-register cycle |
0f23f75f MW |
138 | |
139 | # define NR ecx // number of rounds | |
140 | # define LRK eax // distance to last key | |
0f23f75f | 141 | # define BLKOFF edx // block size in bytes |
0f23f75f MW |
142 | |
143 | // We'll need the index registers, which belong to the caller in this | |
144 | // ABI. | |
0923a413 MW |
145 | pushreg rsi |
146 | pushreg rdi | |
0f23f75f MW |
147 | |
148 | // Move arguments to more useful places. | |
43ea7558 | 149 | mov rsi, r8 // key material |
0f23f75f MW |
150 | mov CTX, rcx // context base pointer |
151 | #endif | |
152 | ||
0923a413 MW |
153 | endprologue |
154 | ||
1a0c09c4 MW |
155 | // The initial round key material is taken directly from the input |
156 | // key, so copy it over. | |
0f23f75f MW |
157 | #if CPUFAM_AMD64 && ABI_SYSV |
158 | // We've been lucky. We already have a copy of the context pointer | |
159 | // in rdi, and the key size in ecx. | |
43ea7558 | 160 | add rdi, w |
0f23f75f MW |
161 | #else |
162 | lea DI, [CTX + w] | |
163 | mov ecx, KSZ | |
164 | #endif | |
1a0c09c4 MW |
165 | rep movsd |
166 | ||
167 | // Find out other useful things. | |
0f23f75f MW |
168 | mov NKW, [CTX + nr] // number of rounds |
169 | add NKW, 1 | |
170 | imul NKW, BLKSZ // total key size in words | |
171 | #if !NKW_NEEDS_REFRESH | |
172 | // If we can't keep NKW for later, then we use the same register for | |
173 | // it and LIM, so this move is unnecessary. | |
43ea7558 | 174 | mov DWORD(LIM), NKW |
0f23f75f | 175 | #endif |
43ea7558 | 176 | sub DWORD(LIM), KSZ // offset by the key size |
1a0c09c4 MW |
177 | |
178 | // Find the round constants. | |
43ea7558 MW |
179 | ldgot WHOLE(c) |
180 | leaext RCON, F(rijndael_rcon), WHOLE(c) | |
1a0c09c4 MW |
181 | |
182 | // Prepare for the main loop. | |
0f23f75f | 183 | lea SI, [CTX + w] |
43ea7558 | 184 | mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word |
0f23f75f | 185 | lea LIM, [SI + 4*LIM] // limit, offset by one key expansion |
16021451 | 186 | xor CYIX, CYIX // start of new cycle |
1a0c09c4 MW |
187 | |
188 | // Main key expansion loop. The first word of each key-length chunk | |
189 | // needs special treatment. | |
190 | // | |
191 | // This is rather tedious because the Intel `AESKEYGENASSIST' | |
192 | // instruction is very strangely shaped. Firstly, it wants to | |
193 | // operate on vast SSE registers, even though we're data-blocked from | |
194 | // doing more than one operation at a time unless we're doing two key | |
195 | // schedules simultaneously -- and even then we can't do more than | |
196 | // two, because the instruction ignores two of its input words | |
197 | // entirely, and produces two different outputs for each of the other | |
198 | // two. And secondly it insists on taking the magic round constant | |
199 | // as an immediate, so it's kind of annoying if you're not | |
200 | // open-coding the whole thing. It's much easier to leave that as | |
201 | // zero and XOR in the round constant by hand. | |
16021451 MW |
202 | 0: cmp CYIX, 0 // first word of the cycle? |
203 | je 1f | |
204 | cmp CYIX, 4 // fourth word of the cycle? | |
205 | jne 2f | |
206 | cmp KSZ, 7 // and a large key? | |
207 | jb 2f | |
208 | ||
209 | // Fourth word of the cycle, and seven or eight words of key. Do a | |
210 | // byte substitution. | |
211 | movd xmm0, eax | |
981a9e5d | 212 | pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) |
16021451 MW |
213 | aeskeygenassist xmm1, xmm0, 0 |
214 | movd eax, xmm1 | |
215 | jmp 2f | |
216 | ||
217 | // First word of the cycle. This is the complicated piece. | |
218 | 1: movd xmm0, eax | |
981a9e5d | 219 | pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) |
1a0c09c4 | 220 | aeskeygenassist xmm1, xmm0, 0 |
981a9e5d | 221 | pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) |
1a0c09c4 | 222 | movd eax, xmm1 |
0f23f75f MW |
223 | xor al, [RCON] |
224 | inc RCON | |
1a0c09c4 | 225 | |
16021451 MW |
226 | // Common tail. Mix in the corresponding word from the previous |
227 | // cycle and prepare for the next loop. | |
228 | 2: xor eax, [SI] | |
43ea7558 | 229 | mov [SI + 4*WHOLE(KSZ)], eax |
0f23f75f | 230 | add SI, 4 |
16021451 | 231 | inc CYIX |
0f23f75f | 232 | cmp SI, LIM |
89b34050 | 233 | jae 9f |
16021451 | 234 | cmp CYIX, KSZ |
89b34050 | 235 | jb 0b |
16021451 | 236 | xor CYIX, CYIX |
89b34050 | 237 | jmp 0b |
1a0c09c4 MW |
238 | |
239 | // Next job is to construct the decryption keys. The keys for the | |
240 | // first and last rounds don't need to be mangled, but the remaining | |
241 | // ones do -- and they all need to be reordered too. | |
242 | // | |
243 | // The plan of action, then, is to copy the final encryption round's | |
244 | // keys into place first, then to do each of the intermediate rounds | |
245 | // in reverse order, and finally do the first round. | |
246 | // | |
247 | // Do all of the heavy lifting with SSE registers. The order we're | |
248 | // doing this in means that it's OK if we read or write too much, and | |
249 | // there's easily enough buffer space for the over-enthusiastic reads | |
250 | // and writes because the context has space for 32-byte blocks, which | |
251 | // is our maximum and an exact fit for two SSE registers. | |
89b34050 | 252 | 9: mov NR, [CTX + nr] // number of rounds |
0f23f75f MW |
253 | #if NKW_NEEDS_REFRESH |
254 | mov BLKOFF, BLKSZ | |
255 | mov LRK, NR | |
256 | imul LRK, BLKOFF | |
257 | #else | |
258 | // If we retain NKW, then BLKSZ and BLKOFF are the same register | |
259 | // because we won't need the former again. | |
260 | mov LRK, NKW | |
261 | sub LRK, BLKSZ | |
262 | #endif | |
263 | lea DI, [CTX + wi] | |
43ea7558 | 264 | lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys |
0f23f75f | 265 | shl BLKOFF, 2 // block size (in bytes now) |
1a0c09c4 MW |
266 | |
267 | // Copy the last encryption round's keys. | |
0f23f75f MW |
268 | movdqu xmm0, [SI] |
269 | movdqu [DI], xmm0 | |
270 | cmp BLKOFF, 16 | |
89b34050 | 271 | jbe 0f |
0f23f75f MW |
272 | movdqu xmm0, [SI + 16] |
273 | movdqu [DI + 16], xmm0 | |
1a0c09c4 MW |
274 | |
275 | // Update the loop variables and stop if we've finished. | |
43ea7558 MW |
276 | 0: add DI, WHOLE(BLKOFF) |
277 | sub SI, WHOLE(BLKOFF) | |
0f23f75f | 278 | sub NR, 1 |
89b34050 | 279 | jbe 9f |
1a0c09c4 MW |
280 | |
281 | // Do another middle round's keys... | |
0f23f75f | 282 | movdqu xmm0, [SI] |
1a0c09c4 | 283 | aesimc xmm0, xmm0 |
0f23f75f MW |
284 | movdqu [DI], xmm0 |
285 | cmp BLKOFF, 16 | |
89b34050 | 286 | jbe 0b |
0f23f75f | 287 | movdqu xmm0, [SI + 16] |
1a0c09c4 | 288 | aesimc xmm0, xmm0 |
0f23f75f | 289 | movdqu [DI + 16], xmm0 |
89b34050 | 290 | jmp 0b |
1a0c09c4 MW |
291 | |
292 | // Finally do the first encryption round. | |
89b34050 | 293 | 9: movdqu xmm0, [SI] |
0f23f75f MW |
294 | movdqu [DI], xmm0 |
295 | cmp BLKOFF, 16 | |
89b34050 | 296 | jbe 1f |
0f23f75f MW |
297 | movdqu xmm0, [SI + 16] |
298 | movdqu [DI + 16], xmm0 | |
1a0c09c4 MW |
299 | |
300 | // If the block size is not exactly four words then we must end-swap | |
301 | // everything. We can use fancy SSE toys for this. | |
89b34050 MW |
302 | 1: cmp BLKOFF, 16 |
303 | je 9f | |
1a0c09c4 MW |
304 | |
305 | // Find the byte-reordering table. | |
306 | ldgot ecx | |
8d6ca554 | 307 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 | 308 | |
0f23f75f | 309 | #if NKW_NEEDS_REFRESH |
1a0c09c4 MW |
310 | // Calculate the number of subkey words again. (It's a good job |
311 | // we've got a fast multiplier.) | |
0f23f75f MW |
312 | mov NKW, [CTX + nr] |
313 | add NKW, 1 | |
314 | imul NKW, BLKSZ | |
315 | #endif | |
1a0c09c4 MW |
316 | |
317 | // End-swap the encryption keys. | |
0f23f75f | 318 | lea SI, [CTX + w] |
1a0c09c4 MW |
319 | call endswap_block |
320 | ||
321 | // And the decryption keys. | |
0f23f75f | 322 | lea SI, [CTX + wi] |
1a0c09c4 MW |
323 | call endswap_block |
324 | ||
89b34050 | 325 | 9: // All done. Unstack whatever this ABI's prologue stacked, and return. |
0f23f75f | 326 | #if CPUFAM_X86 |
0923a413 MW |
327 | popreg edi |
328 | popreg esi | |
329 | popreg ebx | |
a90d420c | 330 | popreg BP |
0f23f75f MW |
331 | #endif |
332 | #if CPUFAM_AMD64 && ABI_WIN | |
0923a413 MW |
333 | popreg rdi |
334 | popreg rsi | |
0f23f75f | 335 | #endif |
1a0c09c4 MW |
336 | ret |
337 | ||
1a517bb3 MW |
338 | ENDFUNC |
339 | ||
340 | INTFUNC(endswap_block) | |
1a384903 | 341 | // End-swap NKW words starting at SI. The end-swapping table is |
8d6ca554 | 342 | // already loaded into XMM5; and it's OK to work in 16-byte chunks. Clobbers ECX and XMM1, and leaves SI advanced past the swapped data. |
0923a413 | 343 | endprologue |
1a517bb3 | 344 | |
1a384903 MW |
345 | mov ecx, NKW |
346 | 0: movdqu xmm1, [SI] | |
8d6ca554 | 347 | pshufb xmm1, xmm5 |
0f23f75f MW |
348 | movdqu [SI], xmm1 |
349 | add SI, 16 | |
1a0c09c4 | 350 | sub ecx, 4 // four words done per 16-byte chunk |
1a384903 | 351 | ja 0b // stop once the count reaches zero or borrows |
1a517bb3 | 352 | |
1a0c09c4 MW |
353 | ret |
354 | ||
1a517bb3 MW |
355 | ENDFUNC |
356 | ||
0f23f75f MW |
357 | #undef CTX |
358 | #undef BLKSZ | |
359 | #undef SI | |
360 | #undef DI | |
361 | #undef KSZ | |
0f23f75f | 362 | #undef RCON |
0f23f75f MW |
363 | #undef LIM |
364 | #undef NR | |
365 | #undef LRK | |
0f23f75f | 366 | #undef BLKOFF |
0f23f75f | 367 | |
1a0c09c4 MW |
368 | ///-------------------------------------------------------------------------- |
369 | /// Encrypting and decrypting blocks. | |
370 | ||
8a1aa284 | 371 | .macro encdec op, aes, koff |
b9b279b4 MW |
372 | FUNC(rijndael_\op\()_x86ish_aesni_avx) |
373 | vzeroupper // avoid XMM penalties | |
374 | endprologue | |
375 | // and drop through into the plain AESNI entry point below... | |
376 | ENDFUNC | |
377 | ||
8a1aa284 | 378 | FUNC(rijndael_\op\()_x86ish_aesni) |
1a0c09c4 | 379 | |
0f23f75f MW |
380 | #if CPUFAM_X86 |
381 | // Arguments come in on the stack, and need to be collected. We | |
382 | // don't have a shortage of registers. | |
383 | ||
c410f911 | 384 | # define K eax // context pointer, then round-key pointer |
0f23f75f MW |
385 | # define SRC edx // input block pointer |
386 | # define DST edx // output block pointer (shares SRC's register) | |
c410f911 | 387 | # define NR ecx // number of rounds |
0f23f75f | 388 | |
a90d420c MW |
389 | mov K, [SP + 4] |
390 | mov SRC, [SP + 8] | |
0f23f75f MW |
391 | #endif |
392 | ||
393 | #if CPUFAM_AMD64 && ABI_SYSV | |
394 | // Arguments come in registers. All is good. | |
395 | ||
396 | # define K rdi // context pointer, then round-key pointer | |
397 | # define SRC rsi // input block pointer | |
398 | # define DST rdx // output block pointer | |
399 | # define NR eax // number of rounds | |
400 | #endif | |
401 | ||
402 | #if CPUFAM_AMD64 && ABI_WIN | |
403 | // Arguments come in different registers. | |
404 | ||
405 | # define K rcx // context pointer, then round-key pointer | |
406 | # define SRC rdx // input block pointer | |
407 | # define DST r8 // output block pointer | |
408 | # define NR eax // number of rounds | |
409 | #endif | |
410 | ||
0923a413 MW |
411 | endprologue |
412 | ||
28321c96 MW |
413 | // Find the magic endianness-swapping table. |
414 | ldgot ecx | |
415 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] | |
416 | ||
0f23f75f MW |
417 | // Initial setup. |
418 | movdqu xmm0, [SRC] | |
8d6ca554 | 419 | pshufb xmm0, xmm5 |
0f23f75f MW |
420 | mov NR, [K + nr] |
421 | add K, \koff | |
1a0c09c4 MW |
422 | |
423 | // Initial whitening. | |
0f23f75f MW |
424 | movdqu xmm1, [K] |
425 | add K, 16 | |
1a0c09c4 | 426 | pxor xmm0, xmm1 |
1d63fee4 | 427 | #if CPUFAM_X86 |
a90d420c | 428 | mov DST, [SP + 12] // SRC is finished with, so its register can be reused |
1d63fee4 | 429 | #endif |
1a0c09c4 MW |
430 | |
431 | // Dispatch to the correct code. | |
0f23f75f | 432 | cmp NR, 10 |
e297526c | 433 | je 10f |
1a0c09c4 | 434 | jb bogus // fewer than 10 rounds is not supported here |
0f23f75f | 435 | cmp NR, 14 |
e297526c | 436 | je 14f |
1a0c09c4 | 437 | ja bogus // nor is more than 14 |
0f23f75f | 438 | cmp NR, 12 |
e297526c MW |
439 | je 12f |
440 | jb 11f | |
441 | jmp 13f | |
1a0c09c4 MW |
442 | |
443 | .align 2 | |
444 | ||
445 | // 14 rounds... | |
0f23f75f MW |
446 | 14: movdqu xmm1, [K] |
447 | add K, 16 | |
e297526c | 448 | \aes xmm0, xmm1 |
1a0c09c4 MW |
449 | |
450 | // 13 rounds... | |
0f23f75f MW |
451 | 13: movdqu xmm1, [K] |
452 | add K, 16 | |
e297526c | 453 | \aes xmm0, xmm1 |
1a0c09c4 MW |
454 | |
455 | // 12 rounds... | |
0f23f75f MW |
456 | 12: movdqu xmm1, [K] |
457 | add K, 16 | |
e297526c | 458 | \aes xmm0, xmm1 |
1a0c09c4 MW |
459 | |
460 | // 11 rounds... | |
0f23f75f MW |
461 | 11: movdqu xmm1, [K] |
462 | add K, 16 | |
e297526c | 463 | \aes xmm0, xmm1 |
1a0c09c4 MW |
464 | |
465 | // 10 rounds... | |
0f23f75f | 466 | 10: movdqu xmm1, [K] |
e297526c | 467 | \aes xmm0, xmm1 |
1a0c09c4 MW |
468 | |
469 | // 9 rounds... | |
0f23f75f | 470 | movdqu xmm1, [K + 16] |
e297526c | 471 | \aes xmm0, xmm1 |
1a0c09c4 MW |
472 | |
473 | // 8 rounds... | |
0f23f75f | 474 | movdqu xmm1, [K + 32] |
e297526c | 475 | \aes xmm0, xmm1 |
1a0c09c4 MW |
476 | |
477 | // 7 rounds... | |
0f23f75f | 478 | movdqu xmm1, [K + 48] |
e297526c | 479 | \aes xmm0, xmm1 |
1a0c09c4 MW |
480 | |
481 | // 6 rounds... | |
0f23f75f | 482 | movdqu xmm1, [K + 64] |
e297526c | 483 | \aes xmm0, xmm1 |
1a0c09c4 MW |
484 | |
485 | // 5 rounds... | |
0f23f75f | 486 | movdqu xmm1, [K + 80] |
e297526c | 487 | \aes xmm0, xmm1 |
1a0c09c4 MW |
488 | |
489 | // 4 rounds... | |
0f23f75f | 490 | movdqu xmm1, [K + 96] |
e297526c | 491 | \aes xmm0, xmm1 |
1a0c09c4 MW |
492 | |
493 | // 3 rounds... | |
0f23f75f | 494 | movdqu xmm1, [K + 112] |
e297526c | 495 | \aes xmm0, xmm1 |
1a0c09c4 MW |
496 | |
497 | // 2 rounds... | |
0f23f75f | 498 | movdqu xmm1, [K + 128] |
e297526c | 499 | \aes xmm0, xmm1 |
1a0c09c4 MW |
500 | |
501 | // Final round... | |
0f23f75f | 502 | movdqu xmm1, [K + 144] |
e297526c | 503 | \aes\()last xmm0, xmm1 |
1a0c09c4 MW |
504 | |
505 | // Unpermute the output block and store it. | |
8d6ca554 | 506 | pshufb xmm0, xmm5 |
0f23f75f | 507 | movdqu [DST], xmm0 |
1a0c09c4 MW |
508 | |
509 | // And we're done. | |
510 | ret | |
511 | ||
0f23f75f MW |
512 | #undef K |
513 | #undef SRC | |
514 | #undef DST | |
515 | #undef NR | |
516 | ||
8a1aa284 MW |
517 | ENDFUNC |
518 | .endm | |
1a0c09c4 | 519 | |
e297526c MW |
520 | encdec eblk, aesenc, w |
521 | encdec dblk, aesdec, wi | |
1a0c09c4 MW |
522 | |
523 | ///-------------------------------------------------------------------------- | |
524 | /// Random utilities. | |
525 | ||
1a517bb3 | 526 | INTFUNC(bogus) |
1a0c09c4 MW |
527 | // Abort the process because of a programming error. Indirecting |
528 | // through this point serves several purposes: (a) by CALLing, rather | |
529 | // than branching to, `abort', we can save the return address, which | |
530 | // might at least provide a hint as to what went wrong; (b) we don't | |
531 | // have conditional CALLs (and they'd be big anyway); and (c) we can | |
532 | // write a HLT here as a backstop against `abort' being mad. | |
0923a413 | 533 | endprologue |
1a517bb3 MW |
534 | |
535 | callext F(abort) | |
1a0c09c4 MW |
536 | 0: hlt // backstop, reached only if `abort' returns |
537 | jmp 0b // and loop back to it if execution continues past the HLT | |
538 | ||
1a517bb3 MW |
539 | ENDFUNC |
540 | ||
1a0c09c4 MW |
541 | ///-------------------------------------------------------------------------- |
542 | /// Data tables. | |
543 | ||
645fcce0 MW |
544 | RODATA |
545 | ||
1a0c09c4 MW |
546 | .align 16 // the table is loaded with MOVDQA, which needs 16-byte alignment |
547 | endswap_tab: // PSHUFB control mask: reverses the bytes within each 32-bit word | |
548 | .byte 3, 2, 1, 0 | |
549 | .byte 7, 6, 5, 4 | |
550 | .byte 11, 10, 9, 8 | |
551 | .byte 15, 14, 13, 12 | |
552 | ||
553 | ///----- That's all, folks -------------------------------------------------- |