From: Mark Wooding Date: Sat, 30 Jul 2016 10:48:16 +0000 (+0100) Subject: symm/rijndael-arm-crypto.S: More aggressive loading of subkey data. X-Git-Tag: 2.3.0~46 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/catacomb/commitdiff_plain/6717fd12f186b68e677fb83f647092cd2407284c?hp=f71dd54d995dcffeb73fb41132a60b6ae62d2ea3 symm/rijndael-arm-crypto.S: More aggressive loading of subkey data. Rewrite the block-encryption primitives so that they load key data in multiple-round chunks. There's now a separate prefix piece for each number of rounds other than ten which does the extra rounds and flows into the main sequence. Because the code is now rather more complicated, there's only one copy of it, in a macro, as for the AESNI version. --- diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S index 908faaa4..49aa0166 100644 --- a/symm/rijndael-arm-crypto.S +++ b/symm/rijndael-arm-crypto.S @@ -227,7 +227,8 @@ ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. -FUNC(rijndael_eblk_arm_crypto) +.macro encdec op, aes, mc, koff + FUNC(rijndael_\op\()_arm_crypto) // Arguments: // r0 = pointer to context @@ -236,77 +237,95 @@ FUNC(rijndael_eblk_arm_crypto) // Set things up ready. ldr r3, [r0, #nr] - add r0, r0, #w + add r0, r0, #\koff vldmia r1, {d0, d1} vrev32.8 q0, q0 - // Dispatch according to the number of rounds. - add r3, r3, r3, lsl #1 - rsbs r3, r3, #3*14 - addcs pc, pc, r3, lsl #2 + // Check the number of rounds and dispatch. + sub r3, r3, #10 + cmp r3, #5 + addlo pc, pc, r3, lsl #2 callext F(abort) - // The last round doesn't have MixColumns, so do it separately. - .rept 13 - vldmia r0!, {d2, d3} - aese.8 q0, q1 - aesmc.8 q0, q0 - .endr - - // Final round. - vldmia r0!, {d2, d3} - aese.8 q0, q1 - - // Final whitening. - vldmia r0!, {d2, d3} - veor q0, q1 + b 10f + b 11f + b 12f + b 13f + b 14f + + // Eleven rounds. 
+11: vldmia r0!, {d16, d17} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + b 10f + + // Twelve rounds. +12: vldmia r0!, {d16-d19} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + b 10f + + // Thirteen rounds. +13: vldmia r0!, {d16-d21} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + b 10f + + // Fourteen rounds. (Drops through to the ten round case because + // this is the next most common.) +14: vldmia r0!, {d16-d23} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + // Drop through... + + // Ten rounds. +10: vldmia r0!, {d16-d25} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + \aes\().8 q0, q12 + \mc\().8 q0, q0 + + vldmia r0!, {d16-d27} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + + // Final round has no MixColumns, but is followed by final whitening. + \aes\().8 q0, q12 + veor q0, q0, q13 // All done. vrev32.8 q0, q0 vstmia r2, {d0, d1} bx r14 -ENDFUNC - -FUNC(rijndael_dblk_arm_crypto) - - // Arguments: - // r0 = pointer to context - // r1 = pointer to input block - // r2 = pointer to output block - - // Set things up ready. - ldr r3, [r0, #nr] - add r0, r0, #wi - vldmia r1, {d0, d1} - vrev32.8 q0, q0 - - // Dispatch according to the number of rounds. - add r3, r3, r3, lsl #1 - rsbs r3, r3, #3*14 - addcs pc, pc, r3, lsl #2 - callext F(abort) - - // The last round doesn't have MixColumns, so do it separately. - .rept 13 - vldmia r0!, {d2, d3} - aesd.8 q0, q1 - aesimc.8 q0, q0 - .endr - - // Final round. - vldmia r0!, {d2, d3} - aesd.8 q0, q1 - - // Final whitening. - vldmia r0!, {d2, d3} - veor q0, q1 + ENDFUNC +.endm - // All done. 
- vrev32.8 q0, q0 - vstmia r2, {d0, d1} - bx r14 - -ENDFUNC + encdec eblk, aese, aesmc, w + encdec dblk, aesd, aesimc, wi ///----- That's all, folks --------------------------------------------------