chiark - git - mdw - catacomb/blob - symm/rijndael-arm-crypto.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// ARM crypto-extension-based implementation of Rijndael
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .globl  F(abort)                // called by the block functions if nr > 14
  34         .globl  F(rijndael_rcon)        // round constants, read a byte at a time by key setup
  35
  36 ///--------------------------------------------------------------------------
  37 /// Main code.
  38
             // The code below uses the AES instructions (aese, aesd, aesmc,
             // aesimc) and NEON, so ask the assembler to accept them.
  39         .arch   armv8-a
  40         .fpu    crypto-neon-fp-armv8
  41
  42 /// The ARM crypto extension implements a little-endian version of AES
  43 /// (though the manual doesn't actually spell this out and you have to
  44 /// experiment), but Catacomb's internal interface presents as big-endian so
  45 /// as to work better with things like GCM.  We therefore maintain the round
  46 /// keys in little-endian form, and have to end-swap blocks in and out.
  47 ///
  48 /// For added amusement, the crypto extension doesn't implement the larger-
  49 /// block versions of Rijndael, so we have to end-swap the keys if we're
  50 /// preparing for one of those.
  51
  52         // Useful constants.
  53         .equ    maxrounds, 16           // maximum number of rounds
  54         .equ    maxblksz, 32            // maximum block size, in bytes
  55         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
             // kbufsz is in bytes: 32*(16 + 1) = 544, i.e. room for one
             // maximum-size round key per round plus one more.
  56
  57         // Context structure.
             //
             // Byte offsets from the context pointer: nr = 0, w = 4 and
             // wi = 4 + kbufsz = 548.
             // NOTE(review): these must agree with the layout of the context
             // structure declared on the C side -- confirm against the header.
  58         .equ    nr, 0                   // number of rounds
  59         .equ    w, nr + 4               // encryption key words
  60         .equ    wi, w + kbufsz          // decryption key words
  61
  62 ///--------------------------------------------------------------------------
  63 /// Key setup.
  64
  65 FUNC(rijndael_setup_arm_crypto)
  66
  67         // Arguments:
  68         //      r0 = pointer to context
  69         //      r1 = block size in words
  70         //      r2 = pointer to key material
  71         //      r3 = key size in words
             //
             // Expands the key into the encryption schedule at offset w of the
             // context, and derives the decryption schedule at offset wi from
             // it.  The number of rounds is read from offset nr of the context
             // and is never written here, so it must already be set on entry.
             //
             // r4-r9 and r14 are saved and restored; r1, r2, r12, q0, q1 and
             // the flags are used as scratch and not restored.
  72
  73         stmfd   sp!, {r4-r9, r14}
  74
  75         // The initial round key material is taken directly from the input
  76         // key, so copy it over.  Unfortunately, the key material is not
  77         // guaranteed to be aligned in any especially useful way, so we must
  78         // sort this out.
             //
             // r9 is the write pointer into the schedule and r14 counts the
             // words still to copy.  If the key pointer is not word-aligned,
             // r6 becomes the bit offset of the key within its first aligned
             // word (8, 16 or 24), r7 = 32 - r6, and r2 is rounded down so that
             // only aligned loads are issued.
  79         add     r9, r0, #w
  80         mov     r14, r3
  81         ands    r6, r2, #3
  82         beq     1f
  83         mov     r6, r6, lsl #3
  84         rsb     r7, r6, #32
  85         bic     r2, r2, #3
  86         ldr     r4, [r2], #4
  87
             // Unaligned copy.  r4 is the current aligned source word and r5
             // the next one; each output word is (r4 >> r6) | (r5 << r7), i.e.
             // the high part of the lower-addressed word supplies the low bits.
             // The movhi only fires when the loop continues, so on exit r4
             // still holds the last word stored, which the expansion loop
             // below takes as its starting `previous word'.
  88 0:      ldr     r5, [r2], #4
  89         mov     r4, r4, lsr r6
  90         orr     r4, r5, lsl r7
  91         str     r4, [r9], #4
  92         subs    r14, r14, #1
  93         movhi   r4, r5
  94         bhi     0b
  95         b       9f
  96
             // Aligned copy: plain word-by-word transfer.  As above, r4 ends
             // up holding the last word stored.
  97 1:      ldr     r4, [r2], #4
  98         str     r4, [r9], #4
  99         subs    r14, r14, #1
 100         bhi     1b
 101
 102         // Find out other useful things and prepare for the main loop.
             //
             // From here on: r2 = r1*(r7 + 1), the length of the whole schedule
             // in words; r5 walks the round-constant table; r8 is the address
             // just past the last schedule word to generate; r9 continues as
             // the write pointer; r12 is the position within the current
             // cycle of r3 words; and r7 keeps the round count for the
             // decryption-key pass further down.
 103 9:      ldr     r7, [r0, #nr]           // number of rounds
 104         mla     r2, r1, r7, r1          // total key size in words
 105         leaextq r5, rijndael_rcon       // round constants
 106         sub     r8, r2, r3              // minus what we've copied already
 107         veor    q1, q1                  // all-zero register for the key
 108         add     r8, r9, r8, lsl #2      // limit of the key buffer
 109         mov     r12, #0                 // position in current cycle
 110
 111         // Main key expansion loop.  Dispatch according to the position in
 112         // the cycle.
             //
             // On entry to each iteration r4 is the word most recently written
             // and r6 is loaded with the word written r3 positions earlier.
 113 0:      ldr     r6, [r9, -r3, lsl #2]   // word from previous cycle
 114         cmp     r12, #0                 // first word of the cycle?
 115         beq     1f
 116         cmp     r12, #4                 // position 4 (fifth word) of the cycle?
 117         bne     2f
 118         cmp     r3, #7                  // at least seven words of key?
 119         bcc     2f
 120
 121         // Position 4 (the fifth word) of the cycle, with a key of at least
 122         // seven words.  We must do the byte substitution.
             //
             // The word is replicated into all four lanes of q0, so the row
             // shifting that aese also performs leaves each lane unchanged, and
             // q1 is zero so no key is mixed in.
 123         vdup.32 q0, r4
 124         aese.8  q0, q1                  // effectively, just SubBytes
 125         vmov.32 r4, d0[0]
 126         b       2f
 127
 128         // First word of the cycle.  Byte substitution, rotation, and round
 129         // constant.
             //
             // NOTE(review): the reload of r6 below repeats the load at the top
             // of the loop; nothing in between changes r6, r9, r3 or the word
             // in memory, so it looks redundant (though harmless).
 130 1:      ldrb    r14, [r5], #1           // next round constant
 131         ldr     r6, [r9, -r3, lsl #2]
 132         vdup.32 q0, r4
 133         aese.8  q0, q1                  // effectively, just SubBytes
 134         vmov.32 r4, d0[0]
 135         eor     r4, r14, r4, ror #8     // rotate by a byte, constant into the low byte
 136
 137         // Common ending: mix in the word from the previous cycle and store.
 138 2:      eor     r4, r4, r6
 139         str     r4, [r9], #4
 140
 141         // Prepare for the next iteration.  If we're done, then stop; if
 142         // we've finished a cycle then reset the counter.
 143         add     r12, r12, #1
 144         cmp     r9, r8
 145         bcs     9f
 146         cmp     r12, r3
 147         movcs   r12, #0
 148         b       0b
 149
 150         // Next job is to construct the decryption keys.  The keys for the
 151         // first and last rounds don't need to be mangled, but the remaining
 152         // ones do -- and they all need to be reordered too.
 153         //
 154         // The plan of action, then, is to copy the final encryption round's
 155         // keys into place first, then to do each of the intermediate rounds
 156         // in reverse order, and finally do the first round.
 157         //
 158         // Do all the heavy lifting with NEON registers.  The order we're
 159         // doing this in means that it's OK if we read or write too much, and
 160         // there's easily enough buffer space for the over-enthusiastic reads
 161         // and writes because the context has space for 32-byte blocks, which
 162         // is our maximum and an exact fit for two Q-class registers.
             //
             // r4 is the read pointer, starting at the last round key of the
             // encryption schedule and stepping backwards; r5 is the write
             // pointer, starting at wi and stepping forwards.  r7 still holds
             // the round count and r2 the total schedule length in words.
 163 9:      add     r5, r0, #wi
 164         add     r4, r0, #w
 165         add     r4, r4, r2, lsl #2
 166         sub     r4, r4, r1, lsl #2              // last round's keys
 167
 168         // Copy the last encryption round's keys.
             //
             // A block size of exactly four words moves 16 bytes; any other
             // block size moves a full 32 bytes.
 169         teq     r1, #4
 170         vldmiaeq r4, {d0, d1}
 171         vldmiane r4, {d0-d3}
 172         vstmiaeq r5, {d0, d1}
 173         vstmiane r5, {d0-d3}
 174
 175         // Update the loop variables and stop if we've finished.
             //
             // r7 is decremented before each middle round, so the loop body
             // below runs one time fewer than the number of rounds.
 176 0:      sub     r4, r4, r1, lsl #2
 177         add     r5, r5, r1, lsl #2
 178         subs    r7, r7, #1
 179         beq     9f
 180
 181         // Do another middle round's keys...
             //
             // The beq below depends on the flags set by the teq: none of the
             // loads, stores or aesimc in between writes the flags.
 182         teq     r1, #4
 183         vldmiaeq r4, {d0, d1}
 184         vldmiane r4, {d0-d3}
 185         aesimc.8 q0, q0
 186         vstmiaeq r5, {d0, d1}
 187         beq     0b
 188         aesimc.8 q1, q1
 189         vstmia  r5, {d0-d3}
 190         b       0b
 191
 192         // Finally do the first encryption round.
 193 9:      teq     r1, #4
 194         vldmiaeq r4, {d0, d1}
 195         vldmiane r4, {d0-d3}
 196         vstmiaeq r5, {d0, d1}
 197         vstmiane r5, {d0-d3}
 198
 199         // If the block size is not exactly four words then we must end-swap
 200         // everything.  We can use fancy NEON toys for this.
             // (The flags tested here are still those from the teq above.)
 201         beq     9f
 202
 203         // End-swap the encryption keys.
             // r1 is free to reuse as the pointer argument now; r2 is still
             // the total schedule length in words.
 204         add     r1, r0, #w
 205         bl      endswap_block
 206
 207         // And the decryption keys
 208         add     r1, r0, #wi
 209         bl      endswap_block
 210
 211         // All done.
 212 9:      ldmfd   sp!, {r4-r9, pc}
 213
 214 endswap_block:
 215         // End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
 216         // It's OK to work in 16-byte chunks.
             //
             // Also clobbers r4 (the countdown), q0 and the flags.  Because
             // the countdown goes four words at a time, a count which is not a
             // multiple of four is in effect rounded up, so up to three words
             // beyond the stated length are swapped as well.  Returns through
             // r14, which the caller's bl overwrote; the original return
             // address is safe on the stack.
 217         mov     r4, r2
 218 0:      vldmia  r1, {d0, d1}
 219         vrev32.8 q0, q0
 220         vstmia  r1!, {d0, d1}
 221         subs    r4, r4, #4
 222         bhi     0b
 223         bx      r14
 224
 225 ENDFUNC
 226
 227 ///--------------------------------------------------------------------------
 228 /// Encrypting and decrypting blocks.
 229
 230 FUNC(rijndael_eblk_arm_crypto)
 231
 232         // Arguments:
 233         //      r0 = pointer to context
 234         //      r1 = pointer to input block
 235         //      r2 = pointer to output block
             //
             // Encrypts a single 16-byte block using the round count at offset
             // nr and the encryption schedule at offset w of the context.
             // Clobbers r0, r3, q0, q1 and the flags.
 236
 237         // Set things up ready.
             // Load the block and byte-reverse each 32-bit word of it.
 238         ldr     r3, [r0, #nr]
 239         add     r0, r0, #w
 240         vldmia  r1, {d0, d1}
 241         vrev32.8 q0, q0
 242
 243         // Dispatch according to the number of rounds.
             //
             // r3 becomes 3*nr and then 42 - 3*nr, with carry set exactly when
             // nr <= 14.  In that case the add skips 4*(42 - 3*nr) bytes, i.e.
             // 14 - nr of the unrolled rounds below at three 4-byte
             // instructions each, leaving nr - 1 full rounds before the final
             // one.  This relies on pc reading as the address of the add plus
             // eight, and so on callext occupying exactly one instruction.
             // NOTE(review): callext comes from asm-common.h -- confirm that it
             // expands to a single instruction in every configuration.
             //
             // If nr > 14 the add is skipped and we fall into the call to
             // abort.  No lower bound on nr is checked here.
 244         add     r3, r3, r3, lsl #1
 245         rsbs    r3, r3, #3*14
 246         addcs   pc, pc, r3, lsl #2
 247         callext F(abort)
 248
 249         // Full rounds.  (The last round doesn't have MixColumns, so it is
             // done separately below.)  Each repetition must stay exactly three
             // instructions long, because the dispatch above counts on it.
 250   .rept 13
 251         vldmia  r0!, {d2, d3}
 252         aese.8  q0, q1
 253         aesmc.8 q0, q0
 254   .endr
 255
 256         // Final round.
 257         vldmia  r0!, {d2, d3}
 258         aese.8  q0, q1
 259
 260         // Final whitening.
             // The last round key is mixed in with a plain XOR.
 261         vldmia  r0!, {d2, d3}
 262         veor    q0, q1
 263
 264         // All done.
             // Byte-reverse each word again and store the result.
 265         vrev32.8 q0, q0
 266         vstmia  r2, {d0, d1}
 267         bx      r14
 268
 269 ENDFUNC
 270
 271 FUNC(rijndael_dblk_arm_crypto)
 272
 273         // Arguments:
 274         //      r0 = pointer to context
 275         //      r1 = pointer to input block
 276         //      r2 = pointer to output block
             //
             // Decrypts a single 16-byte block using the round count at offset
             // nr and the decryption schedule at offset wi of the context.
             // Clobbers r0, r3, q0, q1 and the flags.
 277
 278         // Set things up ready.
             // Load the block and byte-reverse each 32-bit word of it.
 279         ldr     r3, [r0, #nr]
 280         add     r0, r0, #wi
 281         vldmia  r1, {d0, d1}
 282         vrev32.8 q0, q0
 283
 284         // Dispatch according to the number of rounds.
             //
             // r3 becomes 3*nr and then 42 - 3*nr, with carry set exactly when
             // nr <= 14.  In that case the add skips 4*(42 - 3*nr) bytes, i.e.
             // 14 - nr of the unrolled rounds below at three 4-byte
             // instructions each, leaving nr - 1 full rounds before the final
             // one.  This relies on pc reading as the address of the add plus
             // eight, and so on callext occupying exactly one instruction.
             // NOTE(review): callext comes from asm-common.h -- confirm that it
             // expands to a single instruction in every configuration.
             //
             // If nr > 14 the add is skipped and we fall into the call to
             // abort.  No lower bound on nr is checked here.
 285         add     r3, r3, r3, lsl #1
 286         rsbs    r3, r3, #3*14
 287         addcs   pc, pc, r3, lsl #2
 288         callext F(abort)
 289
 290         // Full rounds.  (The last round doesn't have MixColumns, so it is
             // done separately below.)  Each repetition must stay exactly three
             // instructions long, because the dispatch above counts on it.
 291   .rept 13
 292         vldmia  r0!, {d2, d3}
 293         aesd.8  q0, q1
 294         aesimc.8 q0, q0
 295   .endr
 296
 297         // Final round.
 298         vldmia  r0!, {d2, d3}
 299         aesd.8  q0, q1
 300
 301         // Final whitening.
             // The last round key is mixed in with a plain XOR.
 302         vldmia  r0!, {d2, d3}
 303         veor    q0, q1
 304
 305         // All done.
             // Byte-reverse each word again and store the result.
 306         vrev32.8 q0, q0
 307         vstmia  r2, {d0, d1}
 308         bx      r14
 309
 310 ENDFUNC
 311
 312 ///----- That's all, folks --------------------------------------------------