Be more consistent about the numbering of local labels. Specifically, 0 is
usually a loop head, and 9 is usually the code to continue with afterwards.
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
-9: ldrb r14, [r5], #1 // next round constant
+0: ldrb r14, [r5], #1 // next round constant
ldr r6, [r9, -r3, lsl #2]
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
ldr r6, [r9, -r3, lsl #2]
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// The next three words are simple.
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// The next three words are simple.
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 2...)
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 2...)
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 3...)
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 3...)
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// Word 4. If the key is /more/ than 6 words long, then we must
// apply a substitution here.
cmp r3, #5
// Word 4. If the key is /more/ than 6 words long, then we must
// apply a substitution here.
cmp r3, #5
ldr r6, [r9, -r3, lsl #2]
cmp r3, #7
ldr r6, [r9, -r3, lsl #2]
cmp r3, #7
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
str r4, [r9], #4
cmp r9, r8
str r4, [r9], #4
cmp r9, r8
// (Word 5...)
cmp r3, #6
// (Word 5...)
cmp r3, #6
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 6...)
cmp r3, #7
// (Word 6...)
cmp r3, #7
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// (Word 7...)
cmp r3, #8
// (Word 7...)
cmp r3, #8
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
ldr r6, [r9, -r3, lsl #2]
eor r4, r4, r6
str r4, [r9], #4
cmp r9, r8
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two Q-class registers.
// there's easily enough buffer space for the over-enthusiastic reads
// and writes because the context has space for 32-byte blocks, which
// is our maximum and an exact fit for two Q-class registers.
add r4, r0, #w
add r4, r4, r2, lsl #2
sub r4, r4, r1, lsl #2 // last round's keys
add r4, r0, #w
add r4, r4, r2, lsl #2
sub r4, r4, r1, lsl #2 // last round's keys
vstmiane r5, {d0-d3}
// Update the loop variables and stop if we've finished.
vstmiane r5, {d0-d3}
// Update the loop variables and stop if we've finished.
-9: sub r4, r4, r1, lsl #2
+0: sub r4, r4, r1, lsl #2
add r5, r5, r1, lsl #2
subs r7, r7, #1
add r5, r5, r1, lsl #2
subs r7, r7, #1
// Do another middle round's keys...
teq r1, #4
// Do another middle round's keys...
teq r1, #4
vldmiane r4, {d0-d3}
aesimc.8 q0, q0
vstmiaeq r5, {d0, d1}
vldmiane r4, {d0-d3}
aesimc.8 q0, q0
vstmiaeq r5, {d0, d1}
aesimc.8 q1, q1
vstmia r5, {d0-d3}
aesimc.8 q1, q1
vstmia r5, {d0-d3}
// Finally do the first encryption round.
// Finally do the first encryption round.
vldmiaeq r4, {d0, d1}
vldmiane r4, {d0-d3}
vstmiaeq r5, {d0, d1}
vldmiaeq r4, {d0, d1}
vldmiane r4, {d0-d3}
vstmiaeq r5, {d0, d1}
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy NEON toys for this.
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy NEON toys for this.
// End-swap the encryption keys.
add r1, r0, #w
// End-swap the encryption keys.
add r1, r0, #w
bl endswap_block
// All done.
bl endswap_block
// All done.
-0: ldmfd sp!, {r4-r9, pc}
+9: ldmfd sp!, {r4-r9, pc}
endswap_block:
// End-swap R2 words starting at R1. R1 is clobbered; R2 is not.
endswap_block:
// End-swap R2 words starting at R1. R1 is clobbered; R2 is not.