chiark - git - mdw - catacomb/blob - symm/rijndael-x86ish-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .globl  F(abort)
  34         .globl  F(rijndael_rcon)
  35
  36 ///--------------------------------------------------------------------------
  37 /// Local utilities.
  38
   39 // Magic constants for shuffling: `pshufd' immediates, two bits per lane.
   40 #define ROTL 0x93                       // lanes 0, 1, 2, 3 <- 3, 0, 1, 2
   41 #define ROT2 0x4e                       // lanes 0, 1, 2, 3 <- 2, 3, 0, 1
   42 #define ROTR 0x39                       // lanes 0, 1, 2, 3 <- 1, 2, 3, 0
  43
  44 ///--------------------------------------------------------------------------
  45 /// Main code.
  46
  47         .arch   .aes
  48         .text
  49
  50 /// The AESNI instructions implement a little-endian version of AES, but
  51 /// Catacomb's internal interface presents as big-endian so as to work better
  52 /// with things like GCM.  We therefore maintain the round keys in
  53 /// little-endian form, and have to end-swap blocks in and out.
  54 ///
  55 /// For added amusement, the AESNI instructions don't implement the
  56 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  57 /// we're preparing for one of those.
  58
   59         // Useful constants.
   60         .equ    maxrounds, 16           // maximum number of rounds
   61         .equ    maxblksz, 32            // maximum block size, in bytes
   62         .equ    kbufsz, maxblksz*(maxrounds + 1) // bytes in a key-schedule buffer
   63
   64         // Context structure: byte offsets from the context base pointer.
   65         .equ    nr, 0                   // number of rounds (4 bytes)
   66         .equ    w, nr + 4               // encryption key words
   67         .equ    wi, w + kbufsz          // decryption key words
  68
  69 ///--------------------------------------------------------------------------
  70 /// Key setup.
  71
   72 FUNC(rijndael_setup_x86ish_aesni)
        // Key setup: copy the caller's key into the context, expand it into
        // the encryption round keys at `w', then derive the decryption
        // round keys at `wi'.
        //
        // Arguments, in order (the per-ABI locations are set out below):
        // context pointer; block size, in words; key material pointer; key
        // size, in words.  The round count is read from the context's `nr'
        // field, so presumably the caller stores it before calling -- TODO
        // confirm against the C caller.
   73
   74 #if CPUFAM_X86
   75         // Arguments are on the stack.  We'll need to stack the caller's
   76         // register variables, but we'll manage.
   77
   78 #  define CTX ebp                       // context pointer
   79 #  define BLKSZ [esp + 24]              // block size, in words (on stack)
   80
   81 #  define SI esi                        // source pointer
   82 #  define DI edi                        // destination pointer
   83
   84 #  define KSZ ebx                       // key size
   85 #  define KSZo ebx                      // ... as address offset
   86 #  define NKW edx                       // total number of key words
   87 #  define NKW_NEEDS_REFRESH 1           // ... needs recalculating
   88 #  define RCON ecx                      // round constants table
   89 #  define LIM edx                       // limit pointer
   90 #  define LIMn edx                      // ... as integer offset from base
   91
   92 #  define NR ecx                        // number of rounds
   93 #  define LRK eax                       // distance to last key
   94 #  define LRKo eax                      // ... as address offset
   95 #  define BLKOFF edx                    // block size in bytes
   96 #  define BLKOFFo edx                   // ... as address offset
   97
   98         // Stack the caller's registers.
   99         push    ebp
  100         push    ebx
  101         push    esi
  102         push    edi
  103
  104         // Set up our own variables.  The four pushes above and the return
        // address put the first argument at [esp + 20].
  105         mov     CTX, [esp + 20]         // context base pointer
  106         mov     SI, [esp + 28]          // key material
  107         mov     KSZ, [esp + 32]         // key size, in words
  108 #endif
  109
  110 #if CPUFAM_AMD64 && ABI_SYSV
  111         // Arguments are in registers.  We have plenty, but, to be honest,
  112         // the initial register allocation is a bit annoying.
  113
  114 #  define CTX r8                        // context pointer
  115 #  define BLKSZ r9d                     // block size
  116
  117 #  define SI rsi                        // source pointer
  118 #  define DI rdi                        // destination pointer
  119
  120 #  define KSZ edx                       // key size
  121 #  define KSZo rdx                      // ... as address offset
  122 #  define NKW r10d                      // total number of key words
  123 #  define RCON rdi                      // round constants table
  124 #  define LIMn ecx                      // limit, as integer offset from base
  125 #  define LIM rcx                       // ... as pointer
  126
  127 #  define NR ecx                        // number of rounds
  128 #  define LRK eax                       // distance to last key
  129 #  define LRKo rax                      // ... as address offset
  130 #  define BLKOFF r9d                    // block size in bytes
  131 #  define BLKOFFo r9                    // ... as address offset
  132
  133         // Move arguments to more useful places.
  134         mov     CTX, rdi                // context base pointer
  135         mov     BLKSZ, esi              // block size in words
  136         mov     SI, rdx                 // key material
  137         mov     KSZ, ecx                // key size, in words
  138 #endif
  139
  140 #if CPUFAM_AMD64 && ABI_WIN
  141         // Arguments are in different registers, and they're a little tight.
  142
  143 #  define CTX r8                        // context pointer
  144 #  define BLKSZ edx                     // block size
  145
  146 #  define SI rsi                        // source pointer
  147 #  define DI rdi                        // destination pointer
  148
  149 #  define KSZ r9d                       // key size
  150 #  define KSZo r9                       // ... as address offset
        // NOTE(review): KSZo is the incoming r9 used as a 64-bit index, and
        // it is never re-extended from r9d -- confirm that callers always
        // pass the key size with the upper half of r9 clear.
  151 #  define NKW r10d                      // total number of key words
  152 #  define RCON rdi                      // round constants table
  153 #  define LIMn ecx                      // limit, as integer offset from base
  154 #  define LIM rcx                       // ... as pointer
  155
  156 #  define NR ecx                        // number of rounds
  157 #  define LRK eax                       // distance to last key
  158 #  define LRKo rax                      // ... as address offset
  159 #  define BLKOFF edx                    // block size in bytes
  160 #  define BLKOFFo rdx                   // ... as address offset
  161
  162         // We'll need the index registers, which belong to the caller in this
  163         // ABI.
  164         push    rsi
  165         push    rdi
  166
  167         // Move arguments to more useful places.  The key material must be
        // taken out of r8 first, because CTX lives in r8 too.
  168         mov     SI, r8                  // key material
  169         mov     CTX, rcx                // context base pointer
  170 #endif
  171
  172         // The initial round key material is taken directly from the input
  173         // key, so copy it over.
  174 #if CPUFAM_AMD64 && ABI_SYSV
  175         // We've been lucky.  We already have a copy of the context pointer
  176         // in rdi, and the key size in ecx.
        // NOTE(review): `rep movsd' counts in the full rcx in 64-bit code,
        // and nothing here clears its upper half -- confirm that callers
        // always pass the key size zero-extended.
  177         add     DI, w
  178 #else
  179         lea     DI, [CTX + w]
  180         mov     ecx, KSZ
  181 #endif
  182         rep     movsd
  183
  184         // Find out other useful things.
  185         mov     NKW, [CTX + nr]         // number of rounds
  186         add     NKW, 1                  // ... so this many round keys
  187         imul    NKW, BLKSZ              // total key size in words
  188 #if !NKW_NEEDS_REFRESH
  189         // If we can't keep NKW for later, then we use the same register for
  190         // it and LIM, so this move is unnecessary.
  191         mov     LIMn, NKW
  192 #endif
  193         sub     LIMn, KSZ               // offset by the key size
  194
  195         // Find the round constants.
  196         ldgot   ecx
  197         leaext  RCON, F(rijndael_rcon), ecx
  198
  199         // Prepare for the main loop.
  200         lea     SI, [CTX + w]
  201         mov     eax, [SI + 4*KSZo - 4]  // most recent key word
  202         lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
  203
  204         // Main key expansion loop.  The first word of each key-length chunk
  205         // needs special treatment.
  206         //
  207         // This is rather tedious because the Intel `AESKEYGENASSIST'
  208         // instruction is very strangely shaped.  Firstly, it wants to
  209         // operate on vast SSE registers, even though we're data-blocked from
  210         // doing more than one operation at a time unless we're doing two key
  211         // schedules simultaneously -- and even then we can't do more than
  212         // two, because the instruction ignores two of its input words
  213         // entirely, and produces two different outputs for each of the other
  214         // two.  And secondly it insists on taking the magic round constant
  215         // as an immediate, so it's kind of annoying if you're not
  216         // open-coding the whole thing.  It's much easier to leave that as
  217         // zero and XOR in the round constant by hand.
        //
        // Throughout the loop: eax holds the most recently generated key
        // word; [SI] is the word one key-length behind the one being made;
        // and the new word is stored at [SI + 4*KSZ].  We leave for 8f as
        // soon as SI reaches LIM.
  218 9:      movd    xmm0, eax               // lane 0 = previous word
  219         pshufd  xmm0, xmm0, ROTR        // ... moved up to lane 3
  220         aeskeygenassist xmm1, xmm0, 0   // lane 3 = rotated, substituted word
  221         pshufd  xmm1, xmm1, ROTL        // bring that back down to lane 0
  222         movd    eax, xmm1
  223         xor     eax, [SI]               // mix with word one key-length back
  224         xor     al, [RCON]              // round constant goes in low byte
  225         inc     RCON
  226         mov     [SI + 4*KSZo], eax      // store the new key word
  227         add     SI, 4
  228         cmp     SI, LIM
  229         jae     8f
  230
  231         // The next three words are simple...
  232         xor     eax, [SI]
  233         mov     [SI + 4*KSZo], eax
  234         add     SI, 4
  235         cmp     SI, LIM
  236         jae     8f
  237
  238         // (Word 2...)
  239         xor     eax, [SI]
  240         mov     [SI + 4*KSZo], eax
  241         add     SI, 4
  242         cmp     SI, LIM
  243         jae     8f
  244
  245         // (Word 3...)
  246         xor     eax, [SI]
  247         mov     [SI + 4*KSZo], eax
  248         add     SI, 4
  249         cmp     SI, LIM
  250         jae     8f
  251
  252         // Word 4.  If the key is /more/ than 6 words long, then we must
  253         // apply a substitution here.
  254         cmp     KSZ, 5
  255         jb      9b                      // four-word key: chunk finished
  256         cmp     KSZ, 7
  257         jb      0f                      // five or six words: no substitution
  258         movd    xmm0, eax               // lane 0 = previous word
  259         pshufd  xmm0, xmm0, ROTL        // ... moved up to lane 1
  260         aeskeygenassist xmm1, xmm0, 0   // lane 0 = substituted, unrotated
  261         movd    eax, xmm1
  262 0:      xor     eax, [SI]
  263         mov     [SI + 4*KSZo], eax
  264         add     SI, 4
  265         cmp     SI, LIM
  266         jae     8f
  267
  268         // (Word 5...)
  269         cmp     KSZ, 6
  270         jb      9b                      // five-word key: chunk finished
  271         xor     eax, [SI]
  272         mov     [SI + 4*KSZo], eax
  273         add     SI, 4
  274         cmp     SI, LIM
  275         jae     8f
  276
  277         // (Word 6...)
  278         cmp     KSZ, 7
  279         jb      9b                      // six-word key: chunk finished
  280         xor     eax, [SI]
  281         mov     [SI + 4*KSZo], eax
  282         add     SI, 4
  283         cmp     SI, LIM
  284         jae     8f
  285
  286         // (Word 7...)
  287         cmp     KSZ, 8
  288         jb      9b                      // seven-word key: chunk finished
  289         xor     eax, [SI]
  290         mov     [SI + 4*KSZo], eax
  291         add     SI, 4
  292         cmp     SI, LIM
  293         jae     8f
  294
  295         // Must be done by now.
  296         jmp     9b
  297
  298         // Next job is to construct the decryption keys.  The keys for the
  299         // first and last rounds don't need to be mangled, but the remaining
  300         // ones do -- and they all need to be reordered too.
  301         //
  302         // The plan of action, then, is to copy the final encryption round's
  303         // keys into place first, then to do each of the intermediate rounds
  304         // in reverse order, and finally do the first round.
  305         //
  306         // Do all of the heavy lifting with SSE registers.  The order we're
  307         // doing this in means that it's OK if we read or write too much, and
  308         // there's easily enough buffer space for the over-enthusiastic reads
  309         // and writes because the context has space for 32-byte blocks, which
  310         // is our maximum and an exact fit for two SSE registers.
  311 8:      mov     NR, [CTX + nr]          // number of rounds
  312 #if NKW_NEEDS_REFRESH
  313         mov     BLKOFF, BLKSZ
  314         mov     LRK, NR
  315         imul    LRK, BLKOFF             // words before the last round key
  316 #else
  317         // If we retain NKW, then BLKSZ and BLKOFF are the same register
  318         // because we won't need the former again.
  319         mov     LRK, NKW
  320         sub     LRK, BLKSZ              // words before the last round key
  321 #endif
  322         lea     DI, [CTX + wi]
  323         lea     SI, [CTX + w + 4*LRKo]  // last round's keys
  324         shl     BLKOFF, 2               // block size (in bytes now)
  325
  326         // Copy the last encryption round's keys.
  327         movdqu  xmm0, [SI]
  328         movdqu  [DI], xmm0
  329         cmp     BLKOFF, 16
  330         jbe     9f                      // one register's worth is enough
  331         movdqu  xmm0, [SI + 16]
  332         movdqu  [DI + 16], xmm0
  333
  334         // Update the loop variables and stop if we've finished.
  335 9:      add     DI, BLKOFFo             // next decryption key slot
  336         sub     SI, BLKOFFo             // previous encryption round key
  337         sub     NR, 1
  338         jbe     0f                      // only the first round's key left
  339
  340         // Do another middle round's keys...
  341         movdqu  xmm0, [SI]
  342         aesimc  xmm0, xmm0              // the mangling: InvMixColumns
  343         movdqu  [DI], xmm0
  344         cmp     BLKOFF, 16
  345         jbe     9b
  346         movdqu  xmm0, [SI + 16]
  347         aesimc  xmm0, xmm0
  348         movdqu  [DI + 16], xmm0
  349         jmp     9b
  350
  351         // Finally do the first encryption round.
  352 0:      movdqu  xmm0, [SI]
  353         movdqu  [DI], xmm0
  354         cmp     BLKOFF, 16
  355         jbe     0f
  356         movdqu  xmm0, [SI + 16]
  357         movdqu  [DI + 16], xmm0
  358
  359         // If the block size is not exactly four words then we must end-swap
  360         // everything.  We can use fancy SSE toys for this.
  361 0:      cmp     BLKOFF, 16
  362         je      0f
  363
  364         // Find the byte-reordering table.
  365         ldgot   ecx
  366         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
  367
  368 #if NKW_NEEDS_REFRESH
  369         // Calculate the number of subkey words again.  (It's a good job
  370         // we've got a fast multiplier.)
  371         mov     NKW, [CTX + nr]
  372         add     NKW, 1
  373         imul    NKW, BLKSZ
  374 #endif
  375
  376         // End-swap the encryption keys.
  377         mov     ecx, NKW
  378         lea     SI, [CTX + w]
  379         call    endswap_block
  380
  381         // And the decryption keys.
  382         mov     ecx, NKW
  383         lea     SI, [CTX + wi]
  384         call    endswap_block
  385
  386 0:      // All done.
  387 #if CPUFAM_X86
  388         pop     edi
  389         pop     esi
  390         pop     ebx
  391         pop     ebp
  392 #endif
  393 #if CPUFAM_AMD64 && ABI_WIN
  394         pop     rdi
  395         pop     rsi
  396 #endif
  397         ret
  398
  399         .align  16
  400 endswap_block:
  401         // End-swap ECX words starting at SI.  The end-swapping table is
  402         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
        // At least one chunk is always processed, and a count which isn't a
        // multiple of four is effectively rounded up.  Clobbers SI, ECX,
        // XMM1 and the flags.
  403         movdqu  xmm1, [SI]
  404         pshufb  xmm1, xmm5
  405         movdqu  [SI], xmm1
  406         add     SI, 16
  407         sub     ecx, 4
  408         ja      endswap_block           // loop while words remain
  409         ret
  410
        // Retire the register names.  (NKW and NKW_NEEDS_REFRESH are not in
        // this list, so they stay defined.)
  411 #undef CTX
  412 #undef BLKSZ
  413 #undef SI
  414 #undef DI
  415 #undef KSZ
  416 #undef KSZo
  417 #undef RCON
  418 #undef LIMn
  419 #undef LIM
  420 #undef NR
  421 #undef LRK
  422 #undef LRKo
  423 #undef BLKOFF
  424 #undef BLKOFFo
  425
  426 ENDFUNC
 427
 428 ///--------------------------------------------------------------------------
 429 /// Encrypting and decrypting blocks.
 430
        // Define a single-block function `rijndael_OP_x86ish_aesni'.  Macro
        // parameters: `op' is the name infix; `aes' is the round instruction
        // (its `...last' variant does the final round); `koff' is the offset
        // of the round keys to use within the context.
        //
        // The generated function takes, in order: context pointer; pointer
        // to the 16-byte input block; pointer to the 16-byte output block.
        // The context's round count must be between 10 and 14 inclusive;
        // anything else branches to `bogus'.
  431 .macro  encdec  op, aes, koff
  432   FUNC(rijndael_\op\()_x86ish_aesni)
  433
  434         // Find the magic endianness-swapping table.
  435         ldgot   ecx
  436         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
  437
  438 #if CPUFAM_X86
  439         // Arguments come in on the stack, and need to be collected.  We
  440         // don't have a shortage of registers.
  441
  442 #  define K ecx                         // context, then round-key pointer
  443 #  define SRC edx                       // input block
  444 #  define DST edx                       // output block (shares edx with SRC)
  445 #  define NR eax                        // number of rounds
  446
  447         mov     K, [esp + 4]
  448         mov     SRC, [esp + 8]
  449 #endif
  450
  451 #if CPUFAM_AMD64 && ABI_SYSV
  452         // Arguments come in registers.  All is good.
  453
  454 #  define K rdi                         // context, then round-key pointer
  455 #  define SRC rsi                       // input block
  456 #  define DST rdx                       // output block
  457 #  define NR eax                        // number of rounds
  458 #endif
  459
  460 #if CPUFAM_AMD64 && ABI_WIN
  461         // Arguments come in different registers.
  462
  463 #  define K rcx                         // context, then round-key pointer
  464 #  define SRC rdx                       // input block
  465 #  define DST r8                        // output block
  466 #  define NR eax                        // number of rounds
  467 #endif
  468
  469         // Initial setup.
  470         movdqu  xmm0, [SRC]
  471         pshufb  xmm0, xmm5              // end-swap the input block
  472         mov     NR, [K + nr]
  473         add     K, \koff                // K now points at the round keys
  474
  475         // Initial whitening.
  476         movdqu  xmm1, [K]
  477         add     K, 16
  478         pxor    xmm0, xmm1
  479
  480         // Dispatch to the correct code.  Each label below is named for the
        // number of rounds still to do when we arrive there.
  481         cmp     NR, 10
  482         je      10f
  483         jb      bogus                   // fewer than 10 rounds: give up
  484         cmp     NR, 14
  485         je      14f
  486         ja      bogus                   // more than 14 rounds: give up
  487         cmp     NR, 12
  488         je      12f
  489         jb      11f
  490         jmp     13f
  491
  492         .align  2
  493
  494         // 14 rounds...
  495 14:     movdqu  xmm1, [K]
  496         add     K, 16
  497         \aes    xmm0, xmm1
  498
  499         // 13 rounds...
  500 13:     movdqu  xmm1, [K]
  501         add     K, 16
  502         \aes    xmm0, xmm1
  503
  504         // 12 rounds...
  505 12:     movdqu  xmm1, [K]
  506         add     K, 16
  507         \aes    xmm0, xmm1
  508
  509         // 11 rounds...
  510 11:     movdqu  xmm1, [K]
  511         add     K, 16
  512         \aes    xmm0, xmm1
  513
  514         // 10 rounds...  From here on K stays put and the remaining round
        // keys are fetched at fixed offsets from it.
  515 10:     movdqu  xmm1, [K]
  516         \aes    xmm0, xmm1
  517
  518         // 9 rounds...
  519         movdqu  xmm1, [K + 16]
  520         \aes    xmm0, xmm1
  521
  522         // 8 rounds...
  523         movdqu  xmm1, [K + 32]
  524         \aes    xmm0, xmm1
  525
  526         // 7 rounds...
  527         movdqu  xmm1, [K + 48]
  528         \aes    xmm0, xmm1
  529
  530         // 6 rounds...
  531         movdqu  xmm1, [K + 64]
  532         \aes    xmm0, xmm1
  533
  534         // 5 rounds...
  535         movdqu  xmm1, [K + 80]
  536         \aes    xmm0, xmm1
  537
  538         // 4 rounds...
  539         movdqu  xmm1, [K + 96]
  540         \aes    xmm0, xmm1
  541
  542         // 3 rounds...
  543         movdqu  xmm1, [K + 112]
  544         \aes    xmm0, xmm1
  545
  546         // 2 rounds...
  547         movdqu  xmm1, [K + 128]
  548         \aes    xmm0, xmm1
  549
  550         // Final round...
  551         movdqu  xmm1, [K + 144]
  552         \aes\()last xmm0, xmm1
  553
  554         // Unpermute the output block and store it.  On x86, DST shares a
        // register with SRC, so it is only fetched now that SRC is dead.
  555         pshufb  xmm0, xmm5
  556 #if CPUFAM_X86
  557         mov     DST, [esp + 12]
  558 #endif
  559         movdqu  [DST], xmm0
  560
  561         // And we're done.
  562         ret
  563
  564 #undef K
  565 #undef SRC
  566 #undef DST
  567 #undef NR
  568
  569   ENDFUNC
  570 .endm
  571
        // Instantiate the block functions: encryption uses the round keys at
        // `w', and decryption uses the ones at `wi'.
  572         encdec  eblk, aesenc, w
  573         encdec  dblk, aesdec, wi
 574
 575 ///--------------------------------------------------------------------------
 576 /// Random utilities.
 577
  578         .align  16
  579         // Abort the process because of a programming error.  Indirecting
  580         // through this point serves several purposes: (a) by CALLing, rather
  581         // than branching to, `abort', we can save the return address, which
  582         // might at least provide a hint as to what went wrong; (b) we don't
  583         // have conditional CALLs (and they'd be big anyway); and (c) we can
  584         // write a HLT here as a backstop against `abort' being mad.
        //
        // The block functions branch here when the context's round count is
        // outside the range they handle.
  585 bogus:  callext F(abort)
  586 0:      hlt                             // only reached if `abort' returns
  587         jmp     0b                      // ... and keep halting if resumed
  588
        // NOTE(review): presumably this emits the support code which the
        // `ldgot ecx' invocations above rely on -- confirm in asm-common.h.
  589         gotaux  ecx
 590
 591 ///--------------------------------------------------------------------------
 592 /// Data tables.
 593
        // Byte-shuffle mask for `pshufb' which reverses the order of the
        // bytes within each of the four 32-bit words.  It is fetched with
        // `movdqa', hence the 16-byte alignment.
  594         .align  16
  595 endswap_tab:
  596         .byte    3,  2,  1,  0
  597         .byte    7,  6,  5,  4
  598         .byte   11, 10,  9,  8
  599         .byte   15, 14, 13, 12
 600
 601 ///----- That's all, folks --------------------------------------------------