chiark - git - mdw - catacomb/blob - symm/rijndael-x86ish-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .globl  F(abort)
  34         .globl  F(rijndael_rcon)
  35
  36 ///--------------------------------------------------------------------------
  37 /// Main code.
  38
  39         .arch   .aes
  40         .text
  41
  42 /// The AESNI instructions implement a little-endian version of AES, but
  43 /// Catacomb's internal interface presents as big-endian so as to work better
  44 /// with things like GCM.  We therefore maintain the round keys in
  45 /// little-endian form, and have to end-swap blocks in and out.
  46 ///
  47 /// For added amusement, the AESNI instructions don't implement the
  48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  49 /// we're preparing for one of those.
  50
  51         // Useful constants.
             //
             // A key-schedule buffer has room for one maximum-size round key
             // per round plus one more: 32*(16 + 1) = 544 bytes.
  52         .equ    maxrounds, 16           // maximum number of rounds
  53         .equ    maxblksz, 32            // maximum block size, in bytes
  54         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
  55
  56         // Context structure.
             //
             // These are byte offsets from the context pointer: a four-byte
             // round count, followed by two key-schedule buffers of kbufsz
             // bytes each.
             // NOTE(review): presumably this must match the context structure
             // declared on the C side -- confirm when changing either.
  57         .equ    nr, 0                   // number of rounds
  58         .equ    w, nr + 4               // encryption key words
  59         .equ    wi, w + kbufsz          // decryption key words
  60
  61 ///--------------------------------------------------------------------------
  62 /// Key setup.
  63
  64 FUNC(rijndael_setup_x86ish_aesni)
             // Build the encryption and decryption key schedules for a
             // context from raw key material.
             //
             // Arguments, in order:
             //   1. context pointer;
             //   2. block size, in 32-bit words;
             //   3. pointer to the key material;
             //   4. key size, in 32-bit words.
             //
             // The number of rounds is read from the context at offset `nr',
             // so it must already be stored there.  The encryption round keys
             // are written at offset `w' and the decryption round keys at
             // offset `wi'.
  65
  66 #define SI WHOLE(si)
  67 #define DI WHOLE(di)
  68
  69 #if CPUFAM_X86
  70         // Arguments are on the stack.  We'll need to stack the caller's
  71         // register variables, but we'll manage.
             //
             // Registers are tight, so several names share one register: NKW,
             // LIM and BLKOFF are all edx; RCON and NR are both ecx; and CYIX
             // is edi, which is also DI.  This is why NKW must be recomputed
             // later (NKW_NEEDS_REFRESH).  BLKSZ stays in its stack slot: the
             // offset allows for the return address and the four registers
             // pushed below.
  72
  73 #  define CTX ebp                       // context pointer
  74 #  define BLKSZ [esp + 24]              // block size
  75
  76 #  define KSZ ebx                       // key size
  77 #  define NKW edx                       // total number of key words
  78 #  define NKW_NEEDS_REFRESH 1           // ... needs recalculating
  79 #  define RCON ecx                      // round constants table
  80 #  define LIM edx                       // limit pointer
  81 #  define CYIX edi                      // index in shift-register cycle
  82
  83 #  define NR ecx                        // number of rounds
  84 #  define LRK eax                       // distance to last key
  85 #  define BLKOFF edx                    // block size in bytes
  86
  87         // Stack the caller's registers.
  88         pushreg ebp
  89         pushreg ebx
  90         pushreg esi
  91         pushreg edi
  92
  93         // Set up our own variables.
  94         mov     CTX, [esp + 20]         // context base pointer
  95         mov     SI, [esp + 28]          // key material
  96         mov     KSZ, [esp + 32]         // key size, in words
  97 #endif
  98
  99 #if CPUFAM_AMD64 && ABI_SYSV
 100         // Arguments are in registers.  We have plenty, but, to be honest,
 101         // the initial register allocation is a bit annoying.
 102
 103 #  define CTX r8                        // context pointer
 104 #  define BLKSZ r9d                     // block size
 105
 106 #  define KSZ edx                       // key size
 107 #  define NKW r10d                      // total number of key words
 108 #  define RCON rdi                      // round constants table
 109 #  define LIM rcx                       // limit pointer
 110 #  define CYIX r11d                     // index in shift-register cycle
 111
 112 #  define NR ecx                        // number of rounds
 113 #  define LRK eax                       // distance to last key
 114 #  define BLKOFF r9d                    // block size in bytes
 115
 116         // Move arguments to more useful places.
             //
             // The order matters: the key pointer has to be moved out of rdx
             // before KSZ, which is edx, is loaded.
 117         mov     CTX, rdi                // context base pointer
 118         mov     BLKSZ, esi              // block size in words
 119         mov     SI, rdx                 // key material
 120         mov     KSZ, ecx                // key size, in words
 121 #endif
 122
 123 #if CPUFAM_AMD64 && ABI_WIN
 124         // Arguments are in different registers, and they're a little tight.
 125
 126 #  define CTX r8                        // context pointer
 127 #  define BLKSZ edx                     // block size
 128
 129 #  define KSZ r9d                       // key size
 130 #  define NKW r10d                      // total number of key words
 131 #  define RCON rdi                      // round constants table
 132 #  define LIM rcx                       // limit pointer
 133 #  define CYIX r11d                     // index in shift-register cycle
 134
 135 #  define NR ecx                        // number of rounds
 136 #  define LRK eax                       // distance to last key
 137 #  define BLKOFF edx                    // block size in bytes
 138
 139         // We'll need the index registers, which belong to the caller in this
 140         // ABI.
 141         pushreg rsi
 142         pushreg rdi
 143
 144         // Move arguments to more useful places.
             //
             // The order matters: the key pointer has to be copied out of r8
             // before CTX, which is r8, is overwritten.  The block size and
             // key size arguments are used where they arrive.
 145         mov     rsi, r8                 // key material
 146         mov     CTX, rcx                // context base pointer
 147 #endif
 148
 149   endprologue
 150
 151         // The initial round key material is taken directly from the input
 152         // key, so copy it over.
 153 #if CPUFAM_AMD64 && ABI_SYSV
 154         // We've been lucky.  We already have a copy of the context pointer
 155         // in rdi, and the key size in ecx.
 156         add     rdi, w
 157 #else
 158         lea     DI, [CTX + w]
 159         mov     ecx, KSZ
 160 #endif
 161         rep     movsd                   // copy KSZ words from the key to w
 162
 163         // Find out other useful things.
 164         mov     NKW, [CTX + nr]         // number of rounds
 165         add     NKW, 1                  // one key more than there are rounds
 166         imul    NKW, BLKSZ              // total key size in words
 167 #if !NKW_NEEDS_REFRESH
 168         // NKW and LIM are separate registers here, so copy.  (If NKW needs
 169         // refreshing later, they share a register and this move is omitted.)
 170         mov     DWORD(LIM), NKW
 171 #endif
 172         sub     DWORD(LIM), KSZ         // offset by the key size
 173
 174         // Find the round constants.
             //
             // NOTE(review): `ldgot' and `leaext' come from asm-common.h;
             // presumably they take care of position-independent addressing of
             // the external table -- see there.
 175         ldgot   WHOLE(c)
 176         leaext  RCON, F(rijndael_rcon), WHOLE(c)
 177
 178         // Prepare for the main loop.
             //
             // Within the loop, SI points at the word one key-length behind the
             // word being generated, and eax holds the most recently generated
             // word.  Each new word is stored KSZ words beyond SI, so the loop
             // must stop once SI reaches w + 4*(NKW - KSZ), which is LIM.
 179         lea     SI, [CTX + w]
 180         mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
 181         lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
 182         xor     CYIX, CYIX              // start of new cycle
 183
 184         // Main key expansion loop.  The first word of each key-length chunk
 185         // needs special treatment.
 186         //
 187         // This is rather tedious because the Intel `AESKEYGENASSIST'
 188         // instruction is very strangely shaped.  Firstly, it wants to
 189         // operate on vast SSE registers, even though we're data-blocked from
 190         // doing more than one operation at a time unless we're doing two key
 191         // schedules simultaneously -- and even then we can't do more than
 192         // two, because the instruction ignores two of its input words
 193         // entirely, and produces two different outputs for each of the other
 194         // two.  And secondly it insists on taking the magic round constant
 195         // as an immediate, so it's kind of annoying if you're not
 196         // open-coding the whole thing.  It's much easier to leave that as
 197         // zero and XOR in the round constant by hand.
             //
             // NOTE(review): the shuffles below place the word in the input
             // lane whose result is wanted, and (in the first-word case) bring
             // that result back down to the low lane.  This reading assumes
             // that SHUF lists source lanes from high to low -- confirm
             // against asm-common.h.
 198 0:      cmp     CYIX, 0                 // first word of the cycle?
 199         je      1f
 200         cmp     CYIX, 4                 // fourth word of the cycle?
 201         jne     2f
 202         cmp     KSZ, 7                  // and a large key?
 203         jb      2f
 204
 205         // Fourth word of the cycle, and seven or eight words of key.  Do a
 206         // byte substitution.
 207         movd    xmm0, eax
 208         pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
 209         aeskeygenassist xmm1, xmm0, 0
 210         movd    eax, xmm1
 211         jmp     2f
 212
 213         // First word of the cycle.  This is the complicated piece.
 214 1:      movd    xmm0, eax
 215         pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
 216         aeskeygenassist xmm1, xmm0, 0
 217         pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
 218         movd    eax, xmm1
 219         xor     al, [RCON]              // mix in this cycle's round constant
 220         inc     RCON                    // next cycle uses the next constant
 221
 222         // Common tail.  Mix in the corresponding word from the previous
 223         // cycle and prepare for the next loop.
 224 2:      xor     eax, [SI]
 225         mov     [SI + 4*WHOLE(KSZ)], eax
 226         add     SI, 4
 227         inc     CYIX
 228         cmp     SI, LIM
 229         jae     9f                      // schedule is full: stop expanding
 230         cmp     CYIX, KSZ
 231         jb      0b
 232         xor     CYIX, CYIX              // wrapped: start a new cycle
 233         jmp     0b
 234
 235         // Next job is to construct the decryption keys.  The keys for the
 236         // first and last rounds don't need to be mangled, but the remaining
 237         // ones do -- and they all need to be reordered too.
 238         //
 239         // The plan of action, then, is to copy the final encryption round's
 240         // keys into place first, then to do each of the intermediate rounds
 241         // in reverse order, and finally do the first round.
 242         //
 243         // Do all of the heavy lifting with SSE registers.  The order we're
 244         // doing this in means that it's OK if we read or write too much, and
 245         // there's easily enough buffer space for the over-enthusiastic reads
 246         // and writes because the context has space for 32-byte blocks, which
 247         // is our maximum and an exact fit for two SSE registers.
 248 9:      mov     NR, [CTX + nr]          // number of rounds
 249 #if NKW_NEEDS_REFRESH
 250         mov     BLKOFF, BLKSZ
 251         mov     LRK, NR
 252         imul    LRK, BLKOFF
 253 #else
 254         // If we retain NKW, then BLKSZ and BLKOFF are the same register
 255         // because we won't need the former again.
 256         mov     LRK, NKW
 257         sub     LRK, BLKSZ
 258 #endif
             // Either way, LRK is now the number of rounds times the block
             // size in words: the word offset of the last round's keys.
 259         lea     DI, [CTX + wi]
 260         lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
 261         shl     BLKOFF, 2               // block size (in bytes now)
 262
 263         // Copy the last encryption round's keys.
 264         movdqu  xmm0, [SI]
 265         movdqu  [DI], xmm0
 266         cmp     BLKOFF, 16
 267         jbe     0f
 268         movdqu  xmm0, [SI + 16]
 269         movdqu  [DI + 16], xmm0
 270
 271         // Update the loop variables and stop if we've finished.
             //
             // DI walks forwards through the decryption keys while SI walks
             // backwards through the encryption keys.
 272 0:      add     DI, WHOLE(BLKOFF)
 273         sub     SI, WHOLE(BLKOFF)
 274         sub     NR, 1
 275         jbe     9f                      // only the first round's keys remain
 276
 277         // Do another middle round's keys...
 278         movdqu  xmm0, [SI]
 279         aesimc  xmm0, xmm0
 280         movdqu  [DI], xmm0
 281         cmp     BLKOFF, 16
 282         jbe     0b
 283         movdqu  xmm0, [SI + 16]
 284         aesimc  xmm0, xmm0
 285         movdqu  [DI + 16], xmm0
 286         jmp     0b
 287
 288         // Finally do the first encryption round.
 289 9:      movdqu  xmm0, [SI]
 290         movdqu  [DI], xmm0
 291         cmp     BLKOFF, 16
 292         jbe     1f
 293         movdqu  xmm0, [SI + 16]
 294         movdqu  [DI + 16], xmm0
 295
 296         // If the block size is not exactly four words then we must end-swap
 297         // everything.  We can use fancy SSE toys for this.
 298 1:      cmp     BLKOFF, 16
 299         je      9f
 300
 301         // Find the byte-reordering table.
 302         ldgot   ecx
 303         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 304
 305 #if NKW_NEEDS_REFRESH
 306         // Calculate the number of subkey words again.  (It's a good job
 307         // we've got a fast multiplier.)
 308         mov     NKW, [CTX + nr]
 309         add     NKW, 1
 310         imul    NKW, BLKSZ
 311 #endif
 312
 313         // End-swap the encryption keys.
 314         lea     SI, [CTX + w]
 315         call    endswap_block
 316
 317         // And the decryption keys.
 318         lea     SI, [CTX + wi]
 319         call    endswap_block
 320
 321 9:      // All done.
 322 #if CPUFAM_X86
 323         popreg  edi
 324         popreg  esi
 325         popreg  ebx
 326         popreg  ebp
 327 #endif
 328 #if CPUFAM_AMD64 && ABI_WIN
 329         popreg  rdi
 330         popreg  rsi
 331 #endif
 332         ret
 333
 334 ENDFUNC
 335
 336 INTFUNC(endswap_block)
 337         // End-swap NKW words starting at SI.  The end-swapping table is
 338         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
             //
             // NKW and SI are the register names defined for the key-setup
             // function, so this must be assembled before they are undefined.
             //
             // Because the work is done four words at a time, up to three
             // words beyond the NKW requested are swapped too if NKW is not a
             // multiple of four.
             //
             // On exit, SI points just past the last chunk processed; ecx,
             // xmm1 and the flags are clobbered.
 339   endprologue
 340
 341         mov     ecx, NKW                // ecx = words still to swap
 342 0:      movdqu  xmm1, [SI]
 343         pshufb  xmm1, xmm5              // reverse the bytes of each word
 344         movdqu  [SI], xmm1
 345         add     SI, 16
 346         sub     ecx, 4
 347         ja      0b                      // loop while words remain (unsigned)
 348
 349         ret
 350
 351 ENDFUNC
 352
 353 #undef CTX
 354 #undef BLKSZ
 355 #undef SI
 356 #undef DI
 357 #undef KSZ
 358 #undef RCON
 359 #undef LIM
 360 #undef NR
 361 #undef LRK
 362 #undef BLKOFF
     /// The key-setup code also defines these three names; retire them along
     /// with the rest, so that no register alias or configuration flag from
     /// key setup is still defined for the code below.  (NKW_NEEDS_REFRESH
     /// is only defined for some targets; undefining an undefined name is
     /// harmless.)
     #undef NKW
     #undef NKW_NEEDS_REFRESH
     #undef CYIX
 363
 364 ///--------------------------------------------------------------------------
 365 /// Encrypting and decrypting blocks.
 366
 367 .macro  encdec  op, aes, koff
             // Define the function rijndael_<op>_x86ish_aesni, which
             // processes a single 16-byte block.
             //
             // Macro arguments:
             //   op    infix for the function name (`eblk' or `dblk' below);
             //   aes   round instruction; the final round uses <aes>last;
             //   koff  offset of the key schedule to use within the context.
             //
             // Function arguments, in order: context pointer, source block
             // pointer, destination block pointer.  The number of rounds is
             // read from the context and must be between 10 and 14 inclusive;
             // any other value branches to `bogus'.
 368   FUNC(rijndael_\op\()_x86ish_aesni)
 369
 370 #if CPUFAM_X86
 371         // Arguments come in on the stack, and need to be collected.  We
 372         // don't have a shortage of registers.
             //
             // SRC and DST deliberately share edx: the destination pointer is
             // only fetched once the source block has been loaded.
 373
 374 #  define K eax
 375 #  define SRC edx
 376 #  define DST edx
 377 #  define NR ecx
 378
 379         mov     K, [esp + 4]
 380         mov     SRC, [esp + 8]
 381 #endif
 382
 383 #if CPUFAM_AMD64 && ABI_SYSV
 384         // Arguments come in registers.  All is good.
 385
 386 #  define K rdi
 387 #  define SRC rsi
 388 #  define DST rdx
 389 #  define NR eax
 390 #endif
 391
 392 #if CPUFAM_AMD64 && ABI_WIN
 393         // Arguments come in different registers.
 394
 395 #  define K rcx
 396 #  define SRC rdx
 397 #  define DST r8
 398 #  define NR eax
 399 #endif
 400
 401   endprologue
 402
 403         // Find the magic endianness-swapping table.
             //
             // On x86, ecx is also NR; that's fine because NR isn't loaded
             // until after the table has been fetched.
             // NOTE(review): on AMD64 K is rcx under the Windows ABI, so
             // `ldgot ecx' had better not touch ecx there -- presumably it
             // expands to nothing; confirm against asm-common.h.
 404         ldgot   ecx
 405         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 406
 407         // Initial setup.
 408         movdqu  xmm0, [SRC]
 409         pshufb  xmm0, xmm5
 410         mov     NR, [K + nr]
 411         add     K, \koff
 412
 413         // Initial whitening.
 414         movdqu  xmm1, [K]
 415         add     K, 16
 416         pxor    xmm0, xmm1
 417 #if CPUFAM_X86
 418         mov     DST, [esp + 12]         // SRC is finished with: reuse edx
 419 #endif
 420
 421         // Dispatch to the correct code.
             //
             // The labels below are named for round counts.  Entering at
             // label N does N - 10 rounds, advancing K past each key used,
             // and then falls into the common ten-round tail, which addresses
             // its keys at fixed offsets from K.
 422         cmp     NR, 10
 423         je      10f
 424         jb      bogus
 425         cmp     NR, 14
 426         je      14f
 427         ja      bogus
 428         cmp     NR, 12
 429         je      12f
 430         jb      11f
 431         jmp     13f
 432
 433         .align  2
 434
 435         // 14 rounds...
 436 14:     movdqu  xmm1, [K]
 437         add     K, 16
 438         \aes    xmm0, xmm1
 439
 440         // 13 rounds...
 441 13:     movdqu  xmm1, [K]
 442         add     K, 16
 443         \aes    xmm0, xmm1
 444
 445         // 12 rounds...
 446 12:     movdqu  xmm1, [K]
 447         add     K, 16
 448         \aes    xmm0, xmm1
 449
 450         // 11 rounds...
 451 11:     movdqu  xmm1, [K]
 452         add     K, 16
 453         \aes    xmm0, xmm1
 454
 455         // 10 rounds...
 456 10:     movdqu  xmm1, [K]
 457         \aes    xmm0, xmm1
 458
 459         // 9 rounds...
 460         movdqu  xmm1, [K + 16]
 461         \aes    xmm0, xmm1
 462
 463         // 8 rounds...
 464         movdqu  xmm1, [K + 32]
 465         \aes    xmm0, xmm1
 466
 467         // 7 rounds...
 468         movdqu  xmm1, [K + 48]
 469         \aes    xmm0, xmm1
 470
 471         // 6 rounds...
 472         movdqu  xmm1, [K + 64]
 473         \aes    xmm0, xmm1
 474
 475         // 5 rounds...
 476         movdqu  xmm1, [K + 80]
 477         \aes    xmm0, xmm1
 478
 479         // 4 rounds...
 480         movdqu  xmm1, [K + 96]
 481         \aes    xmm0, xmm1
 482
 483         // 3 rounds...
 484         movdqu  xmm1, [K + 112]
 485         \aes    xmm0, xmm1
 486
 487         // 2 rounds...
 488         movdqu  xmm1, [K + 128]
 489         \aes    xmm0, xmm1
 490
 491         // Final round...
 492         movdqu  xmm1, [K + 144]
 493         \aes\()last xmm0, xmm1
 494
 495         // Unpermute the output block and store it.
 496         pshufb  xmm0, xmm5
 497         movdqu  [DST], xmm0
 498
 499         // And we're done.
 500         ret
 501
 502 #undef K
 503 #undef SRC
 504 #undef DST
 505 #undef NR
 506
 507   ENDFUNC
 508 .endm
 509
             // Instantiate the encryption function, using the keys at `w',
             // and the decryption function, using the keys at `wi'.
 510         encdec  eblk, aesenc, w
 511         encdec  dblk, aesdec, wi
 512
 513 ///--------------------------------------------------------------------------
 514 /// Random utilities.
 515
 516 INTFUNC(bogus)
 517         // Abort the process because of a programming error.  Indirecting
 518         // through this point serves several purposes: (a) by CALLing, rather
 519         // than branching to, `abort', we can save the return address, which
 520         // might at least provide a hint as to what went wrong; (b) we don't
 521         // have conditional CALLs (and they'd be big anyway); and (c) we can
 522         // write a HLT here as a backstop against `abort' being mad.
             //
             // The block encryption and decryption functions branch here if
             // the context's round count is outside the range they handle.
             // This never returns.
 523   endprologue
 524
 525         callext F(abort)
 526 0:      hlt                             // backstop in case `abort' returns
 527         jmp     0b
 528
 529 ENDFUNC
 530
 531 ///--------------------------------------------------------------------------
 532 /// Data tables.
 533
 534         RODATA
 535
             // Byte-shuffle control for PSHUFB which reverses the order of the
             // bytes within each of the four 32-bit words of a register.  It
             // is fetched with MOVDQA, hence the 16-byte alignment.
 536         .align  16
 537 endswap_tab:
 538         .byte    3,  2,  1,  0
 539         .byte    7,  6,  5,  4
 540         .byte   11, 10,  9,  8
 541         .byte   15, 14, 13, 12
 542
 543 ///----- That's all, folks --------------------------------------------------