chiark - git - mdw - catacomb/blob - symm/rijndael-x86-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// External definitions.
  35
  36         .globl  F(abort)
  37         .globl  F(rijndael_rcon)
  38
  39 ///--------------------------------------------------------------------------
  40 /// Local utilities.
  41
  42 // Magic constants for shuffling: immediates for PSHUFD (32-bit words).
  43 #define ROTL 0x93 // dest word i = source word i - 1 (mod 4)
  44 #define ROT2 0x4e // dest word i = source word i + 2 (mod 4)
  45 #define ROTR 0x39 // dest word i = source word i + 1 (mod 4)
  46
  47 ///--------------------------------------------------------------------------
  48 /// Main code.
  49
  50         .arch   .aes
  51         .section .text
  52
  53 /// The AESNI instructions implement a little-endian version of AES, but
  54 /// Catacomb's internal interface presents as big-endian so as to work better
  55 /// with things like GCM.  We therefore maintain the round keys in
  56 /// little-endian form, and have to end-swap blocks in and out.
  57 ///
  58 /// For added amusement, the AESNI instructions don't implement the
  59 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  60 /// we're preparing for one of those.
  61
  62         // Useful constants.
  63         .equ    maxrounds, 16           // maximum number of rounds
  64         .equ    maxblksz, 32            // maximum block size, in bytes
  65         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes
  66
  67         // Context structure: byte offsets of its members.
  68         .equ    nr, 0                   // number of rounds (read as a 32-bit word)
  69         .equ    w, nr + 4               // encryption key words
  70         .equ    wi, w + kbufsz          // decryption key words
  71
  72 ///--------------------------------------------------------------------------
  73 /// Key setup.
  74
  75 FUNC(rijndael_setup_x86_aesni)
  76         // Expand the key into the context's `w' and `wi' round-key tables.
  77         // Initial state.  We have four arguments:
  78         // [esp + 20] is the context pointer
  79         // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
  80         // [esp + 28] points to the key material, unaligned
  81         // [esp + 32] is the size of the key, in words
  82         // The key size has already been checked for validity, and the number
  83         // of rounds has been computed.  Our job is only to fill in the `w'
  84         // and `wi' vectors.
  85
  86         push    ebp                     // the offsets listed above are as seen
  87         push    ebx                     // after these four pushes: 16 bytes of
  88         push    esi                     // saved registers plus the 4-byte
  89         push    edi                     // return address
  90
  91         // The initial round key material is taken directly from the input
  92         // key, so copy it over.
  93         mov     ebp, [esp + 20]         // context base pointer
  94         mov     ebx, [esp + 32]         // key size, in words
  95         mov     ecx, ebx                // word count for the copy
  96         mov     esi, [esp + 28]         // source: the key material
  97         lea     edi, [ebp + w]          // destination: start of `w'
  98         rep     movsd                   // copy ECX 32-bit words
  99
 100         // Find out other useful things.
 101         mov     edx, [ebp + nr]         // number of rounds
 102         add     edx, 1                  // one more round key than rounds
 103         imul    edx, [esp + 24]         // total key size in words
 104         sub     edx, ebx                // offset by the key size
 105
 106         // Find the round constants.
 107         ldgot   ecx                     // macro from asm-common.h; presumably sets up the GOT base -- confirm
 108         leaext  ecx, rijndael_rcon, ecx // ECX -> round constants, consumed a byte at a time below
 109
 110         // Prepare for the main loop.
 111         lea     esi, [ebp + w]          // ESI -> word one key-length behind the next output word
 112         mov     eax, [esi + 4*ebx - 4]  // most recent key word
 113         lea     edx, [esi + 4*edx]      // limit, offset by one key expansion
 114
 115         // Main key expansion loop.  The first word of each key-length chunk
 116         // needs special treatment.
 117         //
 118         // This is rather tedious because the Intel `AESKEYGENASSIST'
 119         // instruction is very strangely shaped.  Firstly, it wants to
 120         // operate on vast SSE registers, even though we're data-blocked from
 121         // doing more than one operation at a time unless we're doing two key
 122         // schedules simultaneously -- and even then we can't do more than
 123         // two, because the instruction ignores two of its input words
 124         // entirely, and produces two different outputs for each of the other
 125         // two.  And secondly it insists on taking the magic round constant
 126         // as an immediate, so it's kind of annoying if you're not
 127         // open-coding the whole thing.  It's much easier to leave that as
 128         // zero and XOR in the round constant by hand.
 129 9:      movd    xmm0, eax               // word 0 = most recent key word, rest zero
 130         pshufd  xmm0, xmm0, ROTR        // move it up into word 3
 131         aeskeygenassist xmm1, xmm0, 0   // word 3 = RotWord(SubWord(it)), no round constant
 132         pshufd  xmm1, xmm1, ROTL        // bring that result down to word 0
 133         movd    eax, xmm1
 134         xor     eax, [esi]              // mix with the word one key-length back
 135         xor     al, [ecx]               // XOR in the round constant by hand
 136         inc     ecx                     // next round constant
 137         mov     [esi + 4*ebx], eax      // store the new key word
 138         add     esi, 4
 139         cmp     esi, edx
 140         jae     8f                      // stop once the limit is reached
 141
 142         // The next three words are simple...
 143         xor     eax, [esi]              // previous word ^ word one key-length back
 144         mov     [esi + 4*ebx], eax
 145         add     esi, 4
 146         cmp     esi, edx
 147         jae     8f
 148
 149         // (Word 2...)
 150         xor     eax, [esi]
 151         mov     [esi + 4*ebx], eax
 152         add     esi, 4
 153         cmp     esi, edx
 154         jae     8f
 155
 156         // (Word 3...)
 157         xor     eax, [esi]
 158         mov     [esi + 4*ebx], eax
 159         add     esi, 4
 160         cmp     esi, edx
 161         jae     8f
 162
 163         // Word 4.  If the key is /more/ than 6 words long, then we must
 164         // apply a substitution here.
 165         cmp     ebx, 5
 166         jb      9b                      // fewer than 5 key words: chunk done
 167         cmp     ebx, 7
 168         jb      0f                      // 5 or 6 key words: no substitution
 169         movd    xmm0, eax
 170         pshufd  xmm0, xmm0, ROTL        // move the word up into word 1
 171         aeskeygenassist xmm1, xmm0, 0   // word 0 = SubWord(it), with no rotation
 172         movd    eax, xmm1
 173 0:      xor     eax, [esi]
 174         mov     [esi + 4*ebx], eax
 175         add     esi, 4
 176         cmp     esi, edx
 177         jae     8f
 178
 179         // (Word 5...)
 180         cmp     ebx, 6
 181         jb      9b                      // fewer than 6 key words: chunk done
 182         xor     eax, [esi]
 183         mov     [esi + 4*ebx], eax
 184         add     esi, 4
 185         cmp     esi, edx
 186         jae     8f
 187
 188         // (Word 6...)
 189         cmp     ebx, 7
 190         jb      9b                      // fewer than 7 key words: chunk done
 191         xor     eax, [esi]
 192         mov     [esi + 4*ebx], eax
 193         add     esi, 4
 194         cmp     esi, edx
 195         jae     8f
 196
 197         // (Word 7...)
 198         cmp     ebx, 8
 199         jb      9b                      // fewer than 8 key words: chunk done
 200         xor     eax, [esi]
 201         mov     [esi + 4*ebx], eax
 202         add     esi, 4
 203         cmp     esi, edx
 204         jae     8f
 205
 206         // Must be done by now.
 207         jmp     9b
 208
 209         // Next job is to construct the decryption keys.  The keys for the
 210         // first and last rounds don't need to be mangled, but the remaining
 211         // ones do -- and they all need to be reordered too.
 212         //
 213         // The plan of action, then, is to copy the final encryption round's
 214         // keys into place first, then to do each of the intermediate rounds
 215         // in reverse order, and finally do the first round.
 216         //
 217         // Do all of the heavy lifting with SSE registers.  The order we're
 218         // doing this in means that it's OK if we read or write too much, and
 219         // there's easily enough buffer space for the over-enthusiastic reads
 220         // and writes because the context has space for 32-byte blocks, which
 221         // is our maximum and an exact fit for two SSE registers.
 222 8:      mov     ecx, [ebp + nr]         // number of rounds
 223         mov     ebx, [esp + 24]         // block size (in words)
 224         mov     edx, ecx
 225         imul    edx, ebx                // word offset of the last round's keys
 226         lea     edi, [ebp + wi]         // EDI -> next output slot in `wi'
 227         lea     esi, [ebp + 4*edx + w]  // last round's keys
 228         shl     ebx, 2                  // block size (in bytes now)
 229
 230         // Copy the last encryption round's keys.
 231         movdqu  xmm0, [esi]
 232         movdqu  [edi], xmm0
 233         cmp     ebx, 16
 234         jbe     9f                      // block fits in one SSE register
 235         movdqu  xmm0, [esi + 16]
 236         movdqu  [edi + 16], xmm0
 237
 238         // Update the loop variables and stop if we've finished.
 239 9:      add     edi, ebx                // next output slot
 240         sub     esi, ebx                // previous encryption round's keys
 241         sub     ecx, 1
 242         jbe     0f                      // count exhausted: ESI is back at the first round
 243
 244         // Do another middle round's keys...
 245         movdqu  xmm0, [esi]
 246         aesimc  xmm0, xmm0              // mangle the key for use in decryption
 247         movdqu  [edi], xmm0
 248         cmp     ebx, 16
 249         jbe     9b
 250         movdqu  xmm0, [esi + 16]
 251         aesimc  xmm0, xmm0
 252         movdqu  [edi + 16], xmm0
 253         jmp     9b
 254
 255         // Finally do the first encryption round.
 256 0:      movdqu  xmm0, [esi]
 257         movdqu  [edi], xmm0
 258         cmp     ebx, 16
 259         jbe     0f
 260         movdqu  xmm0, [esi + 16]
 261         movdqu  [edi + 16], xmm0
 262
 263         // If the block size is not exactly four words then we must end-swap
 264         // everything.  We can use fancy SSE toys for this.
 265 0:      cmp     ebx, 16
 266         je      0f                      // four-word blocks: nothing to swap
 267
 268         // Find the byte-reordering table.
 269         ldgot   ecx
 270         movdqa  xmm5, [INTADDR(endswap_tab, ecx)] // XMM5 = PSHUFB mask used by endswap_block
 271
 272         // Calculate the number of subkey words again.  (It's a good job
 273         // we've got a fast multiplier.)
 274         mov     ecx, [ebp + nr]
 275         add     ecx, 1
 276         imul    ecx, [esp + 24]         // total keys in words
 277
 278         // End-swap the encryption keys.
 279         mov     eax, ecx                // keep a copy: endswap_block consumes ECX
 280         lea     esi, [ebp + w]
 281         call    endswap_block
 282
 283         // And the decryption keys.
 284         mov     ecx, eax                // the saved word count
 285         lea     esi, [ebp + wi]
 286         call    endswap_block
 287
 288         // All done.
 289 0:      pop     edi
 290         pop     esi
 291         pop     ebx
 292         pop     ebp
 293         ret
 294
 295         .align  16
 296 endswap_block:
 297         // End-swap ECX words starting at ESI.  The end-swapping table is
 298         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
 299         movdqu  xmm1, [esi]
 300         pshufb  xmm1, xmm5              // reorder the bytes as XMM5 directs
 301         movdqu  [esi], xmm1
 302         add     esi, 16
 303         sub     ecx, 4                  // four words done
 304         ja      endswap_block           // loop while words remain (unsigned test)
 305         ret                             // ECX, ESI, XMM1 and the flags are clobbered
 306
 307 ENDFUNC
 308
 309 ///--------------------------------------------------------------------------
 310 /// Encrypting and decrypting blocks.
 311
 312 FUNC(rijndael_eblk_x86_aesni)
 313         // Encrypt one 16-byte block using the context's `w' round keys.
 314         // On entry, we have:
 315         // [esp +  4] points to the context block
 316         // [esp +  8] points to the input data block
 317         // [esp + 12] points to the output buffer
 318
 319         // Find the magic endianness-swapping table.
 320         ldgot   ecx
 321         movdqa  xmm5, [INTADDR(endswap_tab, ecx)] // XMM5 = end-swapping mask, kept until the end
 322
 323         // Load the input block and end-swap it.  Also, start loading the
 324         // keys.
 325         mov     eax, [esp + 8]          // input block pointer
 326         movdqu  xmm0, [eax]             // unaligned load of the block
 327         pshufb  xmm0, xmm5              // end-swap the block
 328         mov     eax, [esp + 4]          // context pointer
 329         lea     edx, [eax + w]          // EDX -> encryption round keys
 330         mov     eax, [eax + nr]         // EAX = number of rounds
 331
 332         // Initial whitening.
 333         movdqu  xmm1, [edx]
 334         add     edx, 16                 // EDX -> key for the first real round
 335         pxor    xmm0, xmm1
 336
 337         // Dispatch to the correct code.
 338         cmp     eax, 10
 339         je      er10
 340         jb      bogus                   // fewer than 10 rounds: programming error
 341         cmp     eax, 14
 342         je      er14
 343         ja      bogus                   // more than 14 rounds: programming error
 344         cmp     eax, 12
 345         je      er12
 346         jb      er11                    // between 10 and 12: must be 11
 347         jmp     er13                    // only 13 remains
 348
 349         .align  2
 350
 351         // 14 rounds...
 352 er14:   movdqu  xmm1, [edx]
 353         add     edx, 16
 354         aesenc  xmm0, xmm1
 355
 356         // 13 rounds...
 357 er13:   movdqu  xmm1, [edx]
 358         add     edx, 16
 359         aesenc  xmm0, xmm1
 360
 361         // 12 rounds...
 362 er12:   movdqu  xmm1, [edx]
 363         add     edx, 16
 364         aesenc  xmm0, xmm1
 365
 366         // 11 rounds...
 367 er11:   movdqu  xmm1, [edx]
 368         add     edx, 16
 369         aesenc  xmm0, xmm1
 370
 371         // 10 rounds...
 372 er10:   movdqu  xmm1, [edx]             // from here EDX stays put; keys are at fixed offsets
 373         aesenc  xmm0, xmm1
 374
 375         // 9 rounds...
 376         movdqu  xmm1, [edx + 16]
 377         aesenc  xmm0, xmm1
 378
 379         // 8 rounds...
 380         movdqu  xmm1, [edx + 32]
 381         aesenc  xmm0, xmm1
 382
 383         // 7 rounds...
 384         movdqu  xmm1, [edx + 48]
 385         aesenc  xmm0, xmm1
 386
 387         // 6 rounds...
 388         movdqu  xmm1, [edx + 64]
 389         aesenc  xmm0, xmm1
 390
 391         // 5 rounds...
 392         movdqu  xmm1, [edx + 80]
 393         aesenc  xmm0, xmm1
 394
 395         // 4 rounds...
 396         movdqu  xmm1, [edx + 96]
 397         aesenc  xmm0, xmm1
 398
 399         // 3 rounds...
 400         movdqu  xmm1, [edx + 112]
 401         aesenc  xmm0, xmm1
 402
 403         // 2 rounds...
 404         movdqu  xmm1, [edx + 128]
 405         aesenc  xmm0, xmm1
 406
 407         // Final round...
 408         movdqu  xmm1, [edx + 144]
 409         aesenclast xmm0, xmm1
 410
 411         // Unpermute the ciphertext block and store it.
 412         pshufb  xmm0, xmm5
 413         mov     eax, [esp + 12]         // output buffer pointer
 414         movdqu  [eax], xmm0             // unaligned store
 415
 416         // And we're done.
 417         ret                             // EAX, EDX, XMM0, XMM1, XMM5 were used as scratch
 418
 419 ENDFUNC
 420
 421 FUNC(rijndael_dblk_x86_aesni)
 422         // Decrypt one 16-byte block using the context's `wi' round keys.
 423         // On entry, we have:
 424         // [esp +  4] points to the context block
 425         // [esp +  8] points to the input data block
 426         // [esp + 12] points to the output buffer
 427
 428         // Find the magic endianness-swapping table.
 429         ldgot   ecx
 430         movdqa  xmm5, [INTADDR(endswap_tab, ecx)] // XMM5 = end-swapping mask, kept until the end
 431
 432         // Load the input block and end-swap it.  Also, start loading the
 433         // keys.
 434         mov     eax, [esp + 8]          // input block pointer
 435         movdqu  xmm0, [eax]             // unaligned load of the block
 436         pshufb  xmm0, xmm5              // end-swap the block
 437         mov     eax, [esp + 4]          // context pointer
 438         lea     edx, [eax + wi]         // EDX -> decryption round keys
 439         mov     eax, [eax + nr]         // EAX = number of rounds
 440
 441         // Initial whitening.
 442         movdqu  xmm1, [edx]
 443         add     edx, 16                 // EDX -> key for the first real round
 444         pxor    xmm0, xmm1
 445
 446         // Dispatch to the correct code.
 447         cmp     eax, 10
 448         je      dr10
 449         jb      bogus                   // fewer than 10 rounds: programming error
 450         cmp     eax, 14
 451         je      dr14
 452         ja      bogus                   // more than 14 rounds: programming error
 453         cmp     eax, 12
 454         je      dr12
 455         jb      dr11                    // between 10 and 12: must be 11
 456         jmp     dr13                    // only 13 remains
 457
 458         .align  2
 459
 460         // 14 rounds...
 461 dr14:   movdqu  xmm1, [edx]
 462         add     edx, 16
 463         aesdec  xmm0, xmm1
 464
 465         // 13 rounds...
 466 dr13:   movdqu  xmm1, [edx]
 467         add     edx, 16
 468         aesdec  xmm0, xmm1
 469
 470         // 12 rounds...
 471 dr12:   movdqu  xmm1, [edx]
 472         add     edx, 16
 473         aesdec  xmm0, xmm1
 474
 475         // 11 rounds...
 476 dr11:   movdqu  xmm1, [edx]
 477         add     edx, 16
 478         aesdec  xmm0, xmm1
 479
 480         // 10 rounds...
 481 dr10:   movdqu  xmm1, [edx]             // from here EDX stays put; keys are at fixed offsets
 482         aesdec  xmm0, xmm1
 483
 484         // 9 rounds...
 485         movdqu  xmm1, [edx + 16]
 486         aesdec  xmm0, xmm1
 487
 488         // 8 rounds...
 489         movdqu  xmm1, [edx + 32]
 490         aesdec  xmm0, xmm1
 491
 492         // 7 rounds...
 493         movdqu  xmm1, [edx + 48]
 494         aesdec  xmm0, xmm1
 495
 496         // 6 rounds...
 497         movdqu  xmm1, [edx + 64]
 498         aesdec  xmm0, xmm1
 499
 500         // 5 rounds...
 501         movdqu  xmm1, [edx + 80]
 502         aesdec  xmm0, xmm1
 503
 504         // 4 rounds...
 505         movdqu  xmm1, [edx + 96]
 506         aesdec  xmm0, xmm1
 507
 508         // 3 rounds...
 509         movdqu  xmm1, [edx + 112]
 510         aesdec  xmm0, xmm1
 511
 512         // 2 rounds...
 513         movdqu  xmm1, [edx + 128]
 514         aesdec  xmm0, xmm1
 515
 516         // Final round...
 517         movdqu  xmm1, [edx + 144]
 518         aesdeclast xmm0, xmm1
 519
 520         // Unpermute the plaintext block and store it.
 521         pshufb  xmm0, xmm5
 522         mov     eax, [esp + 12]         // output buffer pointer
 523         movdqu  [eax], xmm0             // unaligned store
 524
 525         // And we're done.
 526         ret                             // EAX, EDX, XMM0, XMM1, XMM5 were used as scratch
 527
 528 ENDFUNC
 529
 530 ///--------------------------------------------------------------------------
 531 /// Random utilities.
 532
 533         .align  16
 534         // Abort the process because of a programming error.  Indirecting
 535         // through this point serves several purposes: (a) by CALLing, rather
 536         // than branching to, `abort', we can save the return address, which
 537         // might at least provide a hint as to what went wrong; (b) we don't
 538         // have conditional CALLs (and they'd be big anyway); and (c) we can
 539         // write a HLT here as a backstop against `abort' being mad.
 540 bogus:  callext F(abort)                // reached from the round-count dispatch in eblk and dblk
 541 0:      hlt                             // backstop in case `abort' returns
 542         jmp     0b                      // ...and keep halting if execution resumes
 543
 544         gotaux  ecx                     // macro from asm-common.h; presumably the helper behind `ldgot ecx' -- confirm
 545
 546 ///--------------------------------------------------------------------------
 547 /// Data tables.
 548
 549         .align  16                      // the table is loaded with MOVDQA, an aligned load
 550 endswap_tab:                            // PSHUFB mask: reverse the bytes within each 32-bit word
 551         .byte    3,  2,  1,  0
 552         .byte    7,  6,  5,  4
 553         .byte   11, 10,  9,  8
 554         .byte   15, 14, 13, 12
 555
 556 ///----- That's all, folks --------------------------------------------------