sub r8, r2, r3 // minus what we've copied already
veor q1, q1 // all-zero register for the key
add r8, r9, r8, lsl #2 // limit of the key buffer
+ mov r12, #0 // position in current cycle
- // Main key expansion loop. The first word of each key-length chunk
- // needs special treatment.
-0: ldrb r14, [r5], #1 // next round constant
- ldr r6, [r9, -r3, lsl #2]
+ // Main key expansion loop. Dispatch according to the position in
+ // the cycle (a cycle is r3 words long, one key-length's worth).
+0: ldr r6, [r9, -r3, lsl #2] // word from previous cycle
+ cmp r12, #0 // first word of the cycle?
+ beq 1f // -> substitute/rotate/constant path
+ cmp r12, #4 // fourth word of the cycle?
+ bne 2f // otherwise only the common mixing
+ cmp r3, #7 // seven or eight words of key?
+ bcc 2f // shorter keys skip the substitution
+
+ // Fourth word of the cycle, seven or eight words of key. We must do
+ // the byte substitution.
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
- eor r4, r14, r4, ror #8
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // The next three words are simple.
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // (Word 2...)
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
+ b 2f // join the common ending
- // (Word 3...)
+ // First word of the cycle. Byte substitution, rotation, and round
+ // constant.
+1: ldrb r14, [r5], #1 // next round constant
ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // Word 4. If the key is /more/ than 6 words long, then we must
- // apply a substitution here.
- cmp r3, #5
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- cmp r3, #7
- bcc 1f
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
-1: eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
+ eor r4, r14, r4, ror #8 // RotWord; mix in round constant
- // (Word 5...)
- cmp r3, #6
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
+ // Common ending: mix in the word from the previous cycle and store.
+2: eor r4, r4, r6
str r4, [r9], #4
- cmp r9, r8
- bcs 9f
- // (Word 6...)
- cmp r3, #7
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
+ // Prepare for the next iteration. If we're done, then stop; if
+ // we've finished a cycle then reset the counter.
+ add r12, r12, #1 // advance position within the cycle
cmp r9, r8
bcs 9f
-
- // (Word 7...)
- cmp r3, #8
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // Must be done by now.
+ cmp r12, r3 // end of the cycle?
+ movcs r12, #0 // if so, restart the count
b 0b
// Next job is to construct the decryption keys. The keys for the
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
-FUNC(rijndael_eblk_arm_crypto)
+.macro encdec op, aes, mc, koff
+ FUNC(rijndael_\op\()_arm_crypto)
// Arguments:
// r0 = pointer to context
// Set things up ready.
ldr r3, [r0, #nr]
- add r0, r0, #w
+ add r0, r0, #\koff // advance r0 to the key schedule
vldmia r1, {d0, d1}
vrev32.8 q0, q0
- // Dispatch according to the number of rounds.
- add r3, r3, r3, lsl #1
- rsbs r3, r3, #3*14
- addcs pc, pc, r3, lsl #2
+ // Check the number of rounds (10--14) and dispatch via the table.
+ sub r3, r3, #10 // r3 = number of rounds - 10
+ cmp r3, #5 // in range?  valid indices are 0--4
+ addlo pc, pc, r3, lsl #2 // pc reads as .+8: index branch table
callext F(abort)
- // The last round doesn't have MixColumns, so do it separately.
- .rept 13
- vldmia r0!, {d2, d3}
- aese.8 q0, q1
- aesmc.8 q0, q0
- .endr
-
- // Final round.
- vldmia r0!, {d2, d3}
- aese.8 q0, q1
-
- // Final whitening.
- vldmia r0!, {d2, d3}
- veor q0, q1
+ b 10f // ten rounds
+ b 11f // eleven rounds
+ b 12f // twelve rounds
+ b 13f // thirteen rounds
+ b 14f // fourteen rounds
+
+ // Eleven rounds.
+11: vldmia r0!, {d16, d17} // one round key
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ b 10f
+
+ // Twelve rounds.
+12: vldmia r0!, {d16-d19} // two round keys
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ \aes\().8 q0, q9
+ \mc\().8 q0, q0
+ b 10f
+
+ // Thirteen rounds.
+13: vldmia r0!, {d16-d21} // three round keys
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ \aes\().8 q0, q9
+ \mc\().8 q0, q0
+ \aes\().8 q0, q10
+ \mc\().8 q0, q0
+ b 10f
+
+ // Fourteen rounds. (Drops through to the ten round case because
+ // this is the next most common.)
+14: vldmia r0!, {d16-d23} // four round keys
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ \aes\().8 q0, q9
+ \mc\().8 q0, q0
+ \aes\().8 q0, q10
+ \mc\().8 q0, q0
+ \aes\().8 q0, q11
+ \mc\().8 q0, q0
+ // Drop through...
+
+ // Ten rounds.
+10: vldmia r0!, {d16-d25} // five round keys
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ \aes\().8 q0, q9
+ \mc\().8 q0, q0
+ \aes\().8 q0, q10
+ \mc\().8 q0, q0
+ \aes\().8 q0, q11
+ \mc\().8 q0, q0
+ \aes\().8 q0, q12
+ \mc\().8 q0, q0
+
+ vldmia r0!, {d16-d27} // remaining keys, incl. whitening
+ \aes\().8 q0, q8
+ \mc\().8 q0, q0
+ \aes\().8 q0, q9
+ \mc\().8 q0, q0
+ \aes\().8 q0, q10
+ \mc\().8 q0, q0
+ \aes\().8 q0, q11
+ \mc\().8 q0, q0
+
+ // Final round has no MixColumns, but is followed by final whitening.
+ \aes\().8 q0, q12
+ veor q0, q0, q13
// All done.
vrev32.8 q0, q0
vstmia r2, {d0, d1}
bx r14
-ENDFUNC
-
-FUNC(rijndael_dblk_arm_crypto)
-
- // Arguments:
- // r0 = pointer to context
- // r1 = pointer to input block
- // r2 = pointer to output block
-
- // Set things up ready.
- ldr r3, [r0, #nr]
- add r0, r0, #wi
- vldmia r1, {d0, d1}
- vrev32.8 q0, q0
-
- // Dispatch according to the number of rounds.
- add r3, r3, r3, lsl #1
- rsbs r3, r3, #3*14
- addcs pc, pc, r3, lsl #2
- callext F(abort)
-
- // The last round doesn't have MixColumns, so do it separately.
- .rept 13
- vldmia r0!, {d2, d3}
- aesd.8 q0, q1
- aesimc.8 q0, q0
- .endr
+ ENDFUNC
+.endm
- // Final round.
- vldmia r0!, {d2, d3}
- aesd.8 q0, q1
-
- // Final whitening.
- vldmia r0!, {d2, d3}
- veor q0, q1
-
- // All done.
- vrev32.8 q0, q0
- vstmia r2, {d0, d1}
- bx r14
-
-ENDFUNC
+ encdec eblk, aese, aesmc, w // encryption: forward keys
+ encdec dblk, aesd, aesimc, wi // decryption: inverse keys
///----- That's all, folks --------------------------------------------------