chiark - git - mdw - catacomb/blob - symm/rijndael-arm64-crypto.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AArch64 crypto-extension-based implementation of Rijndael
   4 ///
   5 /// (c) 2018 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   armv8-a+crypto          // allow aese/aesd/aesmc/aesimc below
  34
  35         .extern F(abort)                // called on an unsupported round count
  36         .extern F(rijndael_rcon)        // round constants, read during key setup
  37
  38         .text
  39
  40 ///--------------------------------------------------------------------------
  41 /// Main code.
  42
  43 /// The ARM crypto extension implements a little-endian version of AES
  44 /// (though the manual doesn't actually spell this out and you have to
  45 /// experiment), but Catacomb's internal interface presents as big-endian so
  46 /// as to work better with things like GCM.  We therefore maintain the round
  47 /// keys in little-endian form, and have to end-swap blocks in and out.
  48 ///
  49 /// For added amusement, the crypto extension doesn't implement the larger-
  50 /// block versions of Rijndael, so we have to end-swap the keys if we're
  51 /// preparing for one of those.
  52
  53         // Useful constants.
  54         .equ    maxrounds, 16           // maximum number of rounds
  55         .equ    maxblksz, 32            // maximum block size, in bytes
  56         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
             // That is, room for one maximum-size round key per round plus one
             // more: 32*(16 + 1) = 544 bytes for each of the two schedules.
  57
  58         // Context structure.
             // These are byte offsets from the context pointer.  The code in
             // this file loads `nr' as a 32-bit word and treats `w' and `wi' as
             // the starts of two arrays of 32-bit key words, each kbufsz bytes
             // long.  NOTE(review): the layout presumably mirrors a C structure
             // declared elsewhere -- confirm that the two stay in step.
  59         .equ    nr, 0                   // number of rounds
  60         .equ    w, nr + 4               // encryption key words
  61         .equ    wi, w + kbufsz          // decryption key words
  62
  63 ///--------------------------------------------------------------------------
  64 /// Key setup.
  65
  66 FUNC(rijndael_setup_arm64_crypto)
  67
             // Fill in both key schedules of a Rijndael context.  The caller's
             // key is copied to the start of the encryption schedule (offset w)
             // and expanded to fill the rest of it; the decryption schedule
             // (offset wi) is then derived from the encryption one; and, for
             // any block size other than four words, every word of both
             // schedules is finally byte-reversed.
             //
             // The number of rounds is loaded from the context's nr field, so
             // that field must already have been filled in before the call.
             //
  68         // Arguments:
  69         //      x0 = pointer to context
  70         //      w1 = block size in 32-bit words
  71         //      x2 = pointer to key material
  72         //      x3 = key size in words
             //
             // Register roles in the body:
             //      w2 = total schedule length in words (replaces the key
             //              pointer once the key has been copied)
             //      x4 = write pointer into the encryption schedule; later the
             //              read pointer for deriving decryption keys
             //      x5 = round-constant pointer; later the write pointer into
             //              the decryption schedule
             //      x6 = number of words still to be generated
             //      x7 = read pointer, one key length behind x4
             //      x8 = position within the current key-length cycle
             //      w9 = number of rounds (counted down in the second phase)
             //      w13 = current round constant
             //      w14 = most recently produced key word
             //      x15 = copy-loop counter; then w15 = word from previous cycle
             //      v0, v1 = scratch (v1 is the all-zero `key' for aese)
             //
             // The instructions visible here write x1--x9, x13--x15, v0, v1 and
             // the flags (x1 and x3 only on the end-swap path); x0 is never
             // written.  NOTE(review): pushreg, popreg and leaext are macros
             // defined outside this file, so anything they clobber is extra.
  73
             // Keep a frame record: the `bl' instructions further down
             // overwrite x30, so the return address has to be saved.
  74         pushreg x29, x30
  75         mov     x29, sp
  76
  77         // The initial round key material is taken directly from the input
  78         // key, so copy it over.  Unfortunately, the key material is not
  79         // guaranteed to be aligned in any especially useful way.  Assume
  80         // that alignment traps are not enabled.  (Why would they be?  On
  81         // A32, alignment traps were part of a transition plan which changed
  82         // the way unaligned loads and stores behaved, but there's never been
  83         // any other behaviour on A64.)
             //
             // The counter is decremented before it is tested, so the loop
             // always copies at least one word.  NOTE(review): a key size of
             // zero would wrap the counter; presumably callers never pass one.
  84         mov     x15, x3
  85         add     x4, x0, #w
  86 0:      sub     x15, x15, #1
  87         ldr     w14, [x2], #4
  88         str     w14, [x4], #4
  89         cbnz    x15, 0b
             // Now w14 holds the last key word, which seeds the expansion
             // below, and x4 points just past the copied key.
  90
  91         // Find out other useful things and prepare for the main loop.
             // (No branch visible in this function targets the `9' label on the
             // next line; it is simply reached by falling through.)
             //
             // The madd computes w2 = w1*w9 + w1, i.e., one block-sized round
             // key for each round plus one more.  It overwrites the key
             // pointer, which is finished with by now.
  92 9:      ldr     w9, [x0, #nr]           // number of rounds
  93         madd    w2, w1, w9, w1          // total key size in words
  94         leaext  x5, rijndael_rcon       // round constants
  95         sub     x6, x2, x3              // minus what we've copied already
  96         add     x7, x0, #w              // position in previous cycle
  97         movi    v1.4s, #0               // all-zero register for the key
  98         mov     x8, #0                  // position in current cycle
  99
 100         // Main key expansion loop.  Dispatch according to the position in
 101         // the cycle.
             //
             // On entry to each iteration, w14 is the word produced by the
             // previous iteration (or the last word of the input key), and x7
             // trails x4 by exactly one key length.
 102 0:      ldr     w15, [x7], #4           // word from previous cycle
 103         cbz     x8, 1f                  // first word of the cycle?
 104         cmp     x8, #4                  // fourth word of the cycle?
 105         b.ne    2f
 106         cmp     x3, #7                  // seven or eight words of key?
 107         b.cc    2f
 108
 109         // Fourth word of the cycle, seven or eight words of key.  We must do
 110         // the byte substitution.
             //
             // AESE XORs in its key operand and then applies ShiftRows and
             // SubBytes.  With v1 all zero the XOR does nothing, and because
             // the word is broadcast to all four lanes every column of the
             // state is the same, so ShiftRows changes nothing either: lane 0
             // comes back as the byte-substituted word.
 111         dup     v0.4s, w14
 112         aese    v0.16b, v1.16b          // effectively, just SubBytes
 113         mov     w14, v0.s[0]
 114         b       2f
 115
 116         // First word of the cycle.  Byte substitution, rotation, and round
 117         // constant.
             // The round-constant pointer advances by one byte per cycle.
 118 1:      ldrb    w13, [x5], #1           // next round constant
 119         dup     v0.4s, w14
 120         aese    v0.16b, v1.16b          // effectively, just SubBytes
 121         mov     w14, v0.s[0]
 122         eor     w14, w13, w14, ror #8   // rotate by a byte, add constant
 123
 124         // Common ending: mix in the word from the previous cycle and store.
 125 2:      eor     w14, w14, w15
 126         str     w14, [x4], #4
 127
 128         // Prepare for the next iteration.  If we're done, then stop; if
 129         // we've finished a cycle then reset the counter.
             //
             // The flags set by the cmp are consumed by the csel two
             // instructions later; the cbz in between leaves them alone.  The
             // csel keeps x8 while it is still (unsigned) below the key size
             // and otherwise resets it to zero.
 130         add     x8, x8, #1
 131         sub     x6, x6, #1
 132         cmp     x8, x3
 133         cbz     x6, 9f
 134         csel    x8, x8, xzr, cc
 135         b       0b
 136
 137         // Next job is to construct the decryption keys.  The keys for the
 138         // first and last rounds don't need to be mangled, but the remaining
 139         // ones do -- and they all need to be reordered too.
 140         //
 141         // The plan of action, then, is to copy the final encryption round's
 142         // keys into place first, then to do each of the intermediate rounds
 143         // in reverse order, and finally do the first round.
 144         //
 145         // Do all the heavy lifting with the vector registers.  The order
 146         // we're doing this in means that it's OK if we read or write too
 147         // much, and there's easily enough buffer space for the
 148         // over-enthusiastic reads and writes because the context has space
 149         // for 32-byte blocks, which is our maximum and an exact fit for two
 150         // full-width registers.
             //
             // Set x5 to the start of the decryption schedule and x4 to
             // w + 4*(w2 - w1), the start of the last block-sized round key.
 151 9:      add     x5, x0, #wi
 152         add     x4, x0, #w
 153         add     x4, x4, w2, uxtw #2
 154         sub     x4, x4, w1, uxtw #2             // last round's keys
 155
 156         // Copy the last encryption round's keys.
 157         ld1     {v0.4s, v1.4s}, [x4]
 158         st1     {v0.4s, v1.4s}, [x5]
 159
 160         // Update the loop variables and stop if we've finished.
             // x5 steps forwards and x4 backwards by one block-sized key each
             // time.  w9 starts out as the round count, so the aesimc body
             // below runs once for each round between the last and the first.
 161 0:      sub     w9, w9, #1
 162         add     x5, x5, w1, uxtw #2
 163         sub     x4, x4, w1, uxtw #2
 164         cbz     w9, 9f
 165
 166         // Do another middle round's keys...
 167         ld1     {v0.4s, v1.4s}, [x4]
 168         aesimc  v0.16b, v0.16b
 169         aesimc  v1.16b, v1.16b
 170         st1     {v0.4s, v1.4s}, [x5]
 171         b       0b
 172
 173         // Finally do the first encryption round.
 174 9:      ld1     {v0.4s, v1.4s}, [x4]
 175         st1     {v0.4s, v1.4s}, [x5]
 176
 177         // If the block size is not exactly four words then we must end-swap
 178         // everything.  We can use fancy vector toys for this.
             //
             // w1 still holds the block size here; it is only overwritten, as
             // the pointer argument to endswap_block, after the comparison.
             // w2 still holds the total schedule length in words, which is the
             // count that endswap_block expects and which it does not change.
 179         cmp     w1, #4
 180         b.eq    9f
 181
 182         // End-swap the encryption keys.
 183         add     x1, x0, #w
 184         bl      endswap_block
 185
 186         // And the decryption keys.
 187         add     x1, x0, #wi
 188         bl      endswap_block
 189
 190         // All done.
 191 9:      popreg  x29, x30
 192         ret
 193
 194 ENDFUNC
 195
 196 INTFUNC(endswap_block)
 197         // End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
 198         // It's OK to work in 16-byte chunks.
             //
             // Each word in the buffer has its four bytes reversed in place.
             // Also clobbered: w3 (working count), v0, and the flags.
             //
             // The chunk is processed before the count is tested, so at least
             // one 16-byte chunk is always read and written, and a count which
             // isn't a multiple of four is in effect rounded up: as many as
             // three words beyond the stated length may be swapped too.
             //
             // The loop test uses the flags left by the subs at the top; the
             // vector load, rev32 and store in between don't alter them.  The
             // `hi' condition holds exactly when the count before subtraction
             // was (unsigned) greater than four, i.e., when words remain.
 199
 200         mov     w3, w2
 201 0:      subs    w3, w3, #4
 202         ld1     {v0.4s}, [x1]
 203         rev32   v0.16b, v0.16b          // reverse bytes within each word
 204         st1     {v0.4s}, [x1], #16
 205         b.hi    0b
 206         ret
 207
 208 ENDFUNC
 209
 210 ///--------------------------------------------------------------------------
 211 /// Encrypting and decrypting blocks.
 212
 213 .macro  encdec  op, aes, mc, koff
 214   FUNC(rijndael_\op\()_arm64_crypto)
 215
             // Macro generating a single-block cipher function.  Parameters:
             //      op = name fragment, giving rijndael_<op>_arm64_crypto
             //      aes = per-round instruction (aese or aesd)
             //      mc = matching column-mixing instruction (aesmc or aesimc)
             //      koff = offset in the context of the key schedule to use
             //
             // The generated function transforms exactly one 16-byte block: it
             // loads 16 bytes from x1 and stores 16 bytes to x2, byte-reversing
             // each 32-bit word on the way in and on the way out.  The block
             // size is not checked.  NOTE(review): presumably the caller only
             // comes here for contexts with four-word blocks -- confirm.
             //
             // Round keys are read in order from the schedule, 16 bytes each,
             // and one more key than the number of rounds is consumed in all.
             //
             // Clobbers x0 (advanced through the schedule), w3, v0, v16--v23
             // and the flags.  Only v0 and v16--v23 are used as vector
             // registers, and AAPCS64 doesn't require a callee to preserve
             // those, so nothing is saved.
             //
 216         // Arguments:
 217         //      x0 = pointer to context
 218         //      x1 = pointer to input block
 219         //      x2 = pointer to output block
 220
 221         // Set things up ready.
 222         ldr     w3, [x0, #nr]
 223         add     x0, x0, #\koff
 224         ld1     {v0.4s}, [x1]
 225         rev32   v0.16b, v0.16b
 226
 227         // Check the number of rounds and dispatch.
             //
             // Each of the longer cases does its extra rounds and then joins
             // the ten-round code, which every case finishes with.
             //
             // NOTE(review): nothing follows the call to abort other than the
             // eleven-round code, and x30 isn't saved, so this relies on abort
             // never returning.
 228         cmp     w3, #14
 229         b.eq    14f
 230         cmp     w3, #10
 231         b.eq    10f
 232         cmp     w3, #12
 233         b.eq    12f
 234         cmp     w3, #13
 235         b.eq    13f
 236         cmp     w3, #11
 237         b.eq    11f
 238         callext F(abort)
 239
 240         // Eleven rounds.
 241 11:     ld1     {v16.4s}, [x0], #16
 242         \aes    v0.16b, v16.16b
 243         \mc     v0.16b, v0.16b
 244         b       10f
 245
 246         // Twelve rounds.
 247 12:     ld1     {v16.4s, v17.4s}, [x0], #32
 248         \aes    v0.16b, v16.16b
 249         \mc     v0.16b, v0.16b
 250         \aes    v0.16b, v17.16b
 251         \mc     v0.16b, v0.16b
 252         b       10f
 253
 254         // Thirteen rounds.
 255 13:     ld1     {v16.4s-v18.4s}, [x0], #48
 256         \aes    v0.16b, v16.16b
 257         \mc     v0.16b, v0.16b
 258         \aes    v0.16b, v17.16b
 259         \mc     v0.16b, v0.16b
 260         \aes    v0.16b, v18.16b
 261         \mc     v0.16b, v0.16b
 262         b       10f
 263
 264         // Fourteen rounds.  (Drops through to the ten round case because
 265         // this is the next most common.)
 266 14:     ld1     {v16.4s-v19.4s}, [x0], #64
 267         \aes    v0.16b, v16.16b
 268         \mc     v0.16b, v0.16b
 269         \aes    v0.16b, v17.16b
 270         \mc     v0.16b, v0.16b
 271         \aes    v0.16b, v18.16b
 272         \mc     v0.16b, v0.16b
 273         \aes    v0.16b, v19.16b
 274         \mc     v0.16b, v0.16b
 275         // Drop through...
 276
 277         // Ten rounds.
             // Eleven round keys are consumed from here on: eight are loaded
             // up front into v16--v23, and the last three are loaded into
             // v16--v18 once the first four rounds have finished with those
             // registers.
 278 10:     ld1     {v16.4s-v19.4s}, [x0], #64
 279         ld1     {v20.4s-v23.4s}, [x0], #64
 280         \aes    v0.16b, v16.16b
 281         \mc     v0.16b, v0.16b
 282         \aes    v0.16b, v17.16b
 283         \mc     v0.16b, v0.16b
 284         \aes    v0.16b, v18.16b
 285         \mc     v0.16b, v0.16b
 286         \aes    v0.16b, v19.16b
 287         \mc     v0.16b, v0.16b
 288
 289         ld1     {v16.4s-v18.4s}, [x0], #48
 290         \aes    v0.16b, v20.16b
 291         \mc     v0.16b, v0.16b
 292         \aes    v0.16b, v21.16b
 293         \mc     v0.16b, v0.16b
 294         \aes    v0.16b, v22.16b
 295         \mc     v0.16b, v0.16b
 296         \aes    v0.16b, v23.16b
 297         \mc     v0.16b, v0.16b
 298
 299         // Final round has no MixColumns, but is followed by final whitening.
             // The round instruction mixes in its key before it substitutes,
             // so the very last key can't be applied by another round
             // instruction: a plain eor does it instead.
 300         \aes    v0.16b, v16.16b
 301         \mc     v0.16b, v0.16b
 302         \aes    v0.16b, v17.16b
 303         eor     v0.16b, v0.16b, v18.16b
 304
 305         // All done.
 306         rev32   v0.16b, v0.16b
 307         st1     {v0.4s}, [x2]
 308         ret
 309
 310   ENDFUNC
 311 .endm
 312
 313         encdec  eblk, aese, aesmc, w    // rijndael_eblk_arm64_crypto: encrypt
 314         encdec  dblk, aesd, aesimc, wi  // rijndael_dblk_arm64_crypto: decrypt
 315
 316 ///----- That's all, folks --------------------------------------------------