chiark - git - mdw - catacomb/blob - symm/rijndael-x86-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// External definitions.
  35
  36         .globl  F(abort)                // called from `bogus' below
  37         .globl  F(rijndael_rcon)        // round-constant bytes, read by key setup
  38
  39 ///--------------------------------------------------------------------------
  40 /// Local utilities.
  41
  42 // Magic constants for shuffling.
     //
     // These are used below as the immediate operand of `pshufd', in which
     // each two-bit field names the source word for one destination word.
     //   ROTL (0x93): every word moves up one position, and the top word
     //                wraps round to position 0.
     //   ROT2 (0x4e): the two 64-bit halves change places.  (Not used in
     //                the code visible in this file.)
     //   ROTR (0x39): every word moves down one position, and word 0 wraps
     //                round to the top.
  43 #define ROTL 0x93
  44 #define ROT2 0x4e
  45 #define ROTR 0x39
  46
  47 ///--------------------------------------------------------------------------
  48 /// Main code.
  49
  50         .arch   .aes                    // allow the AES-NI mnemonics used below
  51         .section .text                  // what follows is code
  52
  53 /// The AESNI instructions implement a little-endian version of AES, but
  54 /// Catacomb's internal interface presents as big-endian so as to work better
  55 /// with things like GCM.  We therefore maintain the round keys in
  56 /// little-endian form, and have to end-swap blocks in and out.
  57 ///
  58 /// For added amusement, the AESNI instructions don't implement the
  59 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  60 /// we're preparing for one of those.
  61
  62         // Useful constants.  A key-schedule buffer holds one maximum-size
             // block of key material for the initial whitening and one more
             // for each of the rounds: 32*(16 + 1) = 544 bytes.
  63         .equ    maxrounds, 16           // maximum number of rounds
  64         .equ    maxblksz, 32            // maximum block size, in bytes
  65         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
  66
  67         // Context structure.  These are byte offsets from the context
             // pointer handed to the functions below; `nr' is read as a
             // 32-bit word, and the two key buffers follow it directly.
             // NOTE(review): presumably this has to agree with the C
             // declaration of the context structure -- confirm.
  68         .equ    nr, 0                   // number of rounds
  69         .equ    w, nr + 4               // encryption key words
  70         .equ    wi, w + kbufsz          // decryption key words
  71
  72 ///--------------------------------------------------------------------------
  73 /// Key setup.
  74
  75 FUNC(rijndael_setup_x86_aesni)
  76
             // Build the key schedule for a Rijndael context: expand the key
             // into the encryption round keys at `w', derive the decryption
             // round keys at `wi' from them, and finally end-swap both sets if
             // the block size is not four words.
             //
             // Register use during key expansion:
             //   EBP  context pointer
             //   EBX  key size, in words
             //   ESI  address of the key word one key-length behind the one
             //        being written
             //   EDX  value of ESI at which the expansion is complete
             //   EAX  most recently written key word
             //   ECX  address of the next round-constant byte
             //
             // EBP, EBX, ESI and EDI are saved and restored; the code visible
             // here also uses EAX, ECX, EDX, XMM0, XMM1 and (when end-swapping)
             // XMM5 without preserving them.
             //
  77         // Initial state.  We have four arguments; the offsets given here
             // are as seen after the four register pushes below:
  78         // [esp + 20] is the context pointer
  79         // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
  80         // [esp + 28] points to the key material, unaligned
  81         // [esp + 32] is the size of the key, in words
  82         // The key size has already been checked for validity, and the number
  83         // of rounds has been computed.  Our job is only to fill in the `w'
  84         // and `wi' vectors.
  85
  86         push    ebp
  87         push    ebx
  88         push    esi
  89         push    edi
  90
  91         // The initial round key material is taken directly from the input
  92         // key, so copy it over.
  93         mov     ebp, [esp + 20]         // context base pointer
  94         mov     ebx, [esp + 32]         // key size, in words
  95         mov     ecx, ebx                // word count for the copy
  96         mov     esi, [esp + 28]         // source: the key material
  97         lea     edi, [ebp + w]          // destination: start of `w'
  98         rep     movsd
  99
 100         // Find out other useful things.
 101         mov     edx, [ebp + nr]         // number of rounds
 102         add     edx, 1
 103         imul    edx, [esp + 24]         // total key size in words
 104         sub     edx, ebx                // offset by the key size
 105
 106         // Find the round constants.
 107         ldgot   ecx
 108         leaext  ecx, rijndael_rcon, ecx
 109
 110         // Prepare for the main loop.
 111         lea     esi, [ebp + w]
 112         mov     eax, [esi + 4*ebx - 4]  // most recent key word
 113         lea     edx, [esi + 4*edx]      // limit, offset by one key expansion
 114
 115         // Main key expansion loop.  The first word of each key-length chunk
 116         // needs special treatment.
 117         //
 118         // This is rather tedious because the Intel `AESKEYGENASSIST'
 119         // instruction is very strangely shaped.  Firstly, it wants to
 120         // operate on vast SSE registers, even though we're data-blocked from
 121         // doing more than one operation at a time unless we're doing two key
 122         // schedules simultaneously -- and even then we can't do more than
 123         // two, because the instruction ignores two of its input words
 124         // entirely, and produces two different outputs for each of the other
 125         // two.  And secondly it insists on taking the magic round constant
 126         // as an immediate, so it's kind of annoying if you're not
 127         // open-coding the whole thing.  It's much easier to leave that as
 128         // zero and XOR in the round constant by hand.
             //
             // Every step below stores the new word at [esi + 4*ebx], advances
             // ESI by one word, and leaves for the next phase once ESI reaches
             // the limit in EDX.
 129 9:      movd    xmm0, eax
 130         pshufd  xmm0, xmm0, ROTR        // put the word in the top position
 131         aeskeygenassist xmm1, xmm0, 0   // top word out: substituted and rotated
 132         pshufd  xmm1, xmm1, ROTL        // bring that result down to word 0
 133         movd    eax, xmm1
 134         xor     eax, [esi]
 135         xor     al, [ecx]               // round constant goes in the low byte
 136         inc     ecx                     // step on to the next round constant
 137         mov     [esi + 4*ebx], eax
 138         add     esi, 4
 139         cmp     esi, edx
 140         jae     8f
 141
 142         // The next three words are simple...
 143         xor     eax, [esi]
 144         mov     [esi + 4*ebx], eax
 145         add     esi, 4
 146         cmp     esi, edx
 147         jae     8f
 148
 149         // (Word 2...)
 150         xor     eax, [esi]
 151         mov     [esi + 4*ebx], eax
 152         add     esi, 4
 153         cmp     esi, edx
 154         jae     8f
 155
 156         // (Word 3...)
 157         xor     eax, [esi]
 158         mov     [esi + 4*ebx], eax
 159         add     esi, 4
 160         cmp     esi, edx
 161         jae     8f
 162
 163         // Word 4.  If the key is /more/ than 6 words long, then we must
 164         // apply a substitution here.
 165         cmp     ebx, 5
 166         jb      9b                      // four-word key: start the next chunk
 167         cmp     ebx, 7
 168         jb      0f                      // five or six words: no substitution
 169         movd    xmm0, eax
 170         pshufd  xmm0, xmm0, ROTL        // put the word in position 1
 171         aeskeygenassist xmm1, xmm0, 0   // word 0 out: substituted, not rotated
 172         movd    eax, xmm1
 173 0:      xor     eax, [esi]
 174         mov     [esi + 4*ebx], eax
 175         add     esi, 4
 176         cmp     esi, edx
 177         jae     8f
 178
 179         // (Word 5...)
 180         cmp     ebx, 6
 181         jb      9b                      // key shorter than six words: next chunk
 182         xor     eax, [esi]
 183         mov     [esi + 4*ebx], eax
 184         add     esi, 4
 185         cmp     esi, edx
 186         jae     8f
 187
 188         // (Word 6...)
 189         cmp     ebx, 7
 190         jb      9b                      // key shorter than seven words: next chunk
 191         xor     eax, [esi]
 192         mov     [esi + 4*ebx], eax
 193         add     esi, 4
 194         cmp     esi, edx
 195         jae     8f
 196
 197         // (Word 7...)
 198         cmp     ebx, 8
 199         jb      9b                      // key shorter than eight words: next chunk
 200         xor     eax, [esi]
 201         mov     [esi + 4*ebx], eax
 202         add     esi, 4
 203         cmp     esi, edx
 204         jae     8f
 205
 206         // Must be done by now.
 207         jmp     9b
 208
 209         // Next job is to construct the decryption keys.  The keys for the
 210         // first and last rounds don't need to be mangled, but the remaining
 211         // ones do -- and they all need to be reordered too.
 212         //
 213         // The plan of action, then, is to copy the final encryption round's
 214         // keys into place first, then to do each of the intermediate rounds
 215         // in reverse order, and finally do the first round.
 216         //
 217         // Do all of the heavy lifting with SSE registers.  The order we're
 218         // doing this in means that it's OK if we read or write too much, and
 219         // there's easily enough buffer space for the over-enthusiastic reads
 220         // and writes because the context has space for 32-byte blocks, which
 221         // is our maximum and an exact fit for two SSE registers.
             //
             // In this phase ECX counts the rounds still to be dealt with, ESI
             // walks backwards through `w', EDI walks forwards through `wi',
             // and EBX is the distance between consecutive rounds' keys.
 222 8:      mov     ecx, [ebp + nr]         // number of rounds
 223         mov     ebx, [esp + 24]         // block size (in words)
 224         mov     edx, ecx
 225         imul    edx, ebx                // word offset of the last round's keys
 226         lea     edi, [ebp + wi]
 227         lea     esi, [ebp + 4*edx + w]  // last round's keys
 228         shl     ebx, 2                  // block size (in bytes now)
 229
 230         // Copy the last encryption round's keys.
 231         movdqu  xmm0, [esi]
 232         movdqu  [edi], xmm0
 233         cmp     ebx, 16
 234         jbe     9f                      // one register's worth was enough
 235         movdqu  xmm0, [esi + 16]
 236         movdqu  [edi + 16], xmm0
 237
 238         // Update the loop variables and stop if we've finished.
 239 9:      add     edi, ebx
 240         sub     esi, ebx
 241         sub     ecx, 1
 242         jbe     0f                      // only the first round's keys remain
 243
 244         // Do another middle round's keys...
 245         movdqu  xmm0, [esi]
 246         aesimc  xmm0, xmm0
 247         movdqu  [edi], xmm0
 248         cmp     ebx, 16
 249         jbe     9b
 250         movdqu  xmm0, [esi + 16]
 251         aesimc  xmm0, xmm0
 252         movdqu  [edi + 16], xmm0
 253         jmp     9b
 254
 255         // Finally do the first encryption round.
 256 0:      movdqu  xmm0, [esi]
 257         movdqu  [edi], xmm0
 258         cmp     ebx, 16
 259         jbe     0f
 260         movdqu  xmm0, [esi + 16]
 261         movdqu  [edi + 16], xmm0
 262
 263         // If the block size is not exactly four words then we must end-swap
 264         // everything.  We can use fancy SSE toys for this.
 265 0:      cmp     ebx, 16
 266         je      0f                      // four-word blocks: nothing more to do
 267
 268         // Find the byte-reordering table.
 269         ldgot   ecx
 270         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 271
 272         // Calculate the number of subkey words again.  (It's a good job
 273         // we've got a fast multiplier.)
 274         mov     ecx, [ebp + nr]
 275         add     ecx, 1
 276         imul    ecx, [esp + 24]         // total keys in words
 277
 278         // End-swap the encryption keys.
 279         mov     eax, ecx                // keep a copy: the call changes ECX
 280         lea     esi, [ebp + w]
 281         call    endswap_block
 282
 283         // And the decryption keys.
 284         mov     ecx, eax
 285         lea     esi, [ebp + wi]
 286         call    endswap_block
 287
 288         // All done.
 289 0:      pop     edi
 290         pop     esi
 291         pop     ebx
 292         pop     ebp
 293         ret
 294
 295         .align  16
 296 endswap_block:
 297         // End-swap ECX words starting at ESI.  The end-swapping table is
 298         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
             //
             // At least one chunk is always processed, and a word count which
             // isn't a multiple of four is in effect rounded up.  Changes ECX,
             // ESI and XMM1.
 299         movdqu  xmm1, [esi]
 300         pshufb  xmm1, xmm5
 301         movdqu  [esi], xmm1
 302         add     esi, 16
 303         sub     ecx, 4
 304         ja      endswap_block           // go round again while words remain
 305         ret
 306
 307 ENDFUNC
 308
 309 ///--------------------------------------------------------------------------
 310 /// Encrypting and decrypting blocks.
 311
 312         .macro  encdec op, aes, koff
             // Define a single-block function named after the `op' argument.
             // The `aes' argument is the mnemonic used for the ordinary
             // rounds, and with `last' appended for the final round; the
             // `koff' argument is the offset within the context of the round
             // keys to use.
             //
             // As written, this handles 16-byte blocks only: one 16-byte block
             // is loaded, and the round keys are read at 16-byte intervals.
             // The function uses EAX, ECX, EDX, XMM0, XMM1 and XMM5.
 313 FUNC(rijndael_\op\()_x86_aesni)
 314
 315         // On entry, we have:
 316         // [esp +  4] points to the context block
 317         // [esp +  8] points to the input data block
 318         // [esp + 12] points to the output buffer
 319
 320         // Find the magic endianness-swapping table.
 321         ldgot   ecx
 322         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 323
 324         // Load the input block and end-swap it.  Also, start loading the
 325         // keys.
 326         mov     eax, [esp + 8]
 327         movdqu  xmm0, [eax]
 328         pshufb  xmm0, xmm5
 329         mov     eax, [esp + 4]
 330         lea     edx, [eax + \koff]
 331         mov     eax, [eax + nr]
 332
 333         // Initial whitening.
 334         movdqu  xmm1, [edx]
 335         add     edx, 16
 336         pxor    xmm0, xmm1
 337
 338         // Dispatch to the correct code.
             //
             // Round counts from 10 to 14 inclusive are accepted; anything
             // else is a programming error and goes to `bogus'.  Each of the
             // entry points 14, 13, 12 and 11 does one round and falls through
             // to the next, so that we arrive at 10 with ten rounds left and
             // EDX pointing at the key for the first of them.
 339         cmp     eax, 10
 340         je      10f
 341         jb      bogus                   // fewer than 10 rounds
 342         cmp     eax, 14
 343         je      14f
 344         ja      bogus                   // more than 14 rounds
 345         cmp     eax, 12
 346         je      12f
 347         jb      11f
 348         jmp     13f
 349
 350         .align  2
 351
 352         // 14 rounds...
 353 14:     movdqu  xmm1, [edx]
 354         add     edx, 16
 355         \aes    xmm0, xmm1
 356
 357         // 13 rounds...
 358 13:     movdqu  xmm1, [edx]
 359         add     edx, 16
 360         \aes    xmm0, xmm1
 361
 362         // 12 rounds...
 363 12:     movdqu  xmm1, [edx]
 364         add     edx, 16
 365         \aes    xmm0, xmm1
 366
 367         // 11 rounds...
 368 11:     movdqu  xmm1, [edx]
 369         add     edx, 16
 370         \aes    xmm0, xmm1
 371
 372         // 10 rounds...
 373 10:     movdqu  xmm1, [edx]
 374         \aes    xmm0, xmm1
 375
 376         // 9 rounds...
 377         movdqu  xmm1, [edx + 16]
 378         \aes    xmm0, xmm1
 379
 380         // 8 rounds...
 381         movdqu  xmm1, [edx + 32]
 382         \aes    xmm0, xmm1
 383
 384         // 7 rounds...
 385         movdqu  xmm1, [edx + 48]
 386         \aes    xmm0, xmm1
 387
 388         // 6 rounds...
 389         movdqu  xmm1, [edx + 64]
 390         \aes    xmm0, xmm1
 391
 392         // 5 rounds...
 393         movdqu  xmm1, [edx + 80]
 394         \aes    xmm0, xmm1
 395
 396         // 4 rounds...
 397         movdqu  xmm1, [edx + 96]
 398         \aes    xmm0, xmm1
 399
 400         // 3 rounds...
 401         movdqu  xmm1, [edx + 112]
 402         \aes    xmm0, xmm1
 403
 404         // 2 rounds...
 405         movdqu  xmm1, [edx + 128]
 406         \aes    xmm0, xmm1
 407
 408         // Final round...
 409         movdqu  xmm1, [edx + 144]
 410         \aes\()last xmm0, xmm1
 411
 412         // Unpermute the output block and store it.
 413         pshufb  xmm0, xmm5
 414         mov     eax, [esp + 12]
 415         movdqu  [eax], xmm0
 416
 417         // And we're done.
 418         ret
 419
 420 ENDFUNC
 421         .endm
 422
             // Instantiate the two block functions: `rijndael_eblk_x86_aesni'
             // works with the keys at `w', and `rijndael_dblk_x86_aesni' with
             // the keys at `wi'.
 423         encdec  eblk, aesenc, w
 424         encdec  dblk, aesdec, wi
 425
 426 ///--------------------------------------------------------------------------
 427 /// Random utilities.
 428
 429         .align  16
 430         // Abort the process because of a programming error.  Indirecting
 431         // through this point serves several purposes: (a) by CALLing, rather
 432         // than branching to, `abort', we can save the return address, which
 433         // might at least provide a hint as to what went wrong; (b) we don't
 434         // have conditional CALLs (and they'd be big anyway); and (c) we can
 435         // write a HLT here as a backstop against `abort' being mad.
             //
             // The block functions above branch here when the context's round
             // count is outside the range they can handle.
 436 bogus:  callext F(abort)
 437 0:      hlt
 438         jmp     0b                      // and again, if we somehow carry on
 439
             // NOTE(review): `gotaux' isn't defined in this file; presumably it
             // emits whatever support code the `ldgot ecx' uses above rely on
             // -- confirm against `asm-common.h'.
 440         gotaux  ecx
 441
 442 ///--------------------------------------------------------------------------
 443 /// Data tables.
 444
 445         .align  16
 446 endswap_tab:
             // Byte-selection vector for `pshufb': output byte i is taken from
             // the input byte whose index is listed at position i, so this
             // reverses the order of the bytes within each of the four 32-bit
             // words.  The table is fetched with `movdqa', which is why it's
             // aligned to a 16-byte boundary.
 447         .byte    3,  2,  1,  0          // word 0
 448         .byte    7,  6,  5,  4          // word 1
 449         .byte   11, 10,  9,  8          // word 2
 450         .byte   15, 14, 13, 12          // word 3
 451
 452 ///----- That's all, folks --------------------------------------------------