[catacomb] / symm / rijndael-arm-crypto.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// ARM crypto-extension-based implementation of Rijndael
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.  These size the two key-schedule buffers in the
	// context: each buffer holds one maximum-size round key per round,
	// plus one more (maxrounds + 1 round keys in total).
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
					// (32*17 = 544 bytes)

	// Context structure.  These are byte offsets from the context
	// pointer passed in r0 to every function below.
	// NOTE(review): they presumably mirror the C-side context structure
	// declared elsewhere in the project -- confirm if that changes.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm_crypto)

	// Expand a Rijndael key into the context's encryption (`w') and
	// decryption (`wi') key schedules.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = block size in words
	//	r2 = pointer to key material
	//	r3 = key size in words
	//
	// The number of rounds is read from the context (offset `nr'), so
	// it must already have been stored there.  The key pointer need not
	// be word-aligned.  r4--r9 are preserved; r1, r2, r14, d0--d3 and
	// the flags are clobbered.

	stmfd	sp!, {r4-r9, r14}

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way, so we must
	// sort this out.
	add	r9, r0, #w		// r9 = output cursor in w
	mov	r14, r3			// r14 = key words left to copy
	ands	r4, r2, #3		// byte misalignment of the key
	beq	1f			// aligned: use the simple copy

	// Unaligned copy.  Read aligned words and stitch each output word
	// together from two neighbouring ones: r4 is the bit offset of the
	// key within an aligned word and r5 = 32 - r4 is the complementary
	// shift.  The low/high combination below is the little-endian one.
	mov	r4, r4, lsl #3
	rsb	r5, r4, #32
	bic	r2, r2, #3
	ldr	r6, [r2], #4		// first (partial) word

0:	ldr	r7, [r2], #4		// next aligned word
	mov	r6, r6, lsr r4		// tail of the previous word...
	orr	r6, r7, lsl r5		// ... joined to head of this one
	str	r6, [r9], #4
	mov	r6, r7			// carry this word over
	subs	r14, r14, #1
	bhi	0b

	// Join the common setup code.  This must /not/ branch straight into
	// the expansion loop: the registers used above as shift counts and
	// scratch (r4, r5, r7) and the key pointer (r2) all have to be
	// reloaded with their loop values first.
	b	2f

	// Aligned copy: plain word-by-word transfer.
1:	ldr	r6, [r2], #4
	str	r6, [r9], #4
	subs	r14, r14, #1
	bhi	1b

	// Find out other useful things and prepare for the main loop.  Both
	// copy loops arrive here with r9 pointing just past the copied key.
2:	ldr	r7, [r0, #nr]		// number of rounds
	mla	r2, r1, r7, r1		// total key size in words
	ldr	r4, [r9, #-4]		// most recent key word
	leaextq	r5, rijndael_rcon	// round constants
	sub	r8, r2, r3		// minus what we've copied already
	veor	q1, q1			// all-zero register for the key
	add	r8, r9, r8, lsl #2	// limit of the key buffer

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// Loop invariants: r4 = the key word most recently written, r9 =
	// address of the next word to write, r8 = address at which to stop,
	// r3 = key size in words, so [r9, -r3, lsl #2] is the word one key
	// length back.  With a zero round key and all four columns equal,
	// AESE leaves SubBytes of the input word in every column.
9:	ldrb	r14, [r5], #1		// next round constant
	ldr	r6, [r9, -r3, lsl #2]
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	eor	r4, r14, r4, ror #8	// rotate, and mix in the constant
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// The next three words are simple.
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 2...)
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 3...)
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.
	cmp	r3, #5
	bcc	9b			// four-word key: chunk finished
	ldr	r6, [r9, -r3, lsl #2]
	cmp	r3, #7
	bcc	0f			// five or six words: no SubBytes
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
0:	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 5...)
	cmp	r3, #6
	bcc	9b
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 6...)
	cmp	r3, #7
	bcc	9b
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 7...)
	cmp	r3, #8
	bcc	9b
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// Must be done by now.
	b	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with NEON registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two Q-class registers.
	//
	// Here r4 walks backwards through w, r5 walks forwards through wi,
	// and r7 (still the number of rounds) counts the keys remaining.
8:	add	r5, r0, #wi
	add	r4, r0, #w
	add	r4, r4, r2, lsl #2
	sub	r4, r4, r1, lsl #2		// last round's keys

	// Copy the last encryption round's keys.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// Update the loop variables and stop if we've finished.
9:	sub	r4, r4, r1, lsl #2
	add	r5, r5, r1, lsl #2
	subs	r7, r7, #1
	beq	0f

	// Do another middle round's keys...  The flags set by `teq' survive
	// the NEON instructions, so `beq' still tests the block size.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	aesimc.8 q0, q0
	vstmiaeq r5, {d0, d1}
	beq	9b
	aesimc.8 q1, q1
	vstmia	r5, {d0-d3}
	b	9b

	// Finally do the first encryption round.
0:	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy NEON toys for this.
	beq	0f

	// End-swap the encryption keys.
	add	r1, r0, #w
	bl	endswap_block

	// And the decryption keys
	add	r1, r0, #wi
	bl	endswap_block

	// All done.
0:	ldmfd	sp!, {r4-r9, pc}

endswap_block:
	// End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
	// It's OK to work in 16-byte chunks.
	//
	// Also clobbers r4, q0 and the flags.  The word count is effectively
	// rounded up to a multiple of four, so up to three words beyond the
	// requested range may be swapped too.
	mov	r4, r2
0:	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0			// reverse bytes within each word
	vstmia	r1!, {d0, d1}
	subs	r4, r4, #4
	bhi	0b
	bx	r14

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_arm_crypto)

	// Encrypt a single 16-byte block using the encryption key schedule
	// stored in the context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block
	//
	// Only 16 bytes are read and written, so this handles four-word
	// blocks only.  Clobbers r0, r3, d0--d3 and the flags.  If the
	// context's round count exceeds 14, control falls into the call to
	// abort.

	// Set things up ready.
	ldr	r3, [r0, #nr]		// r3 = number of rounds
	add	r0, r0, #w		// r0 = cursor over the round keys
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0			// end-swap each word of the input

	// Dispatch according to the number of rounds.
	//
	// Each full round below is three instructions, and the unrolled code
	// is sized for 14 rounds, so skip 3*(14 - nr) instructions.  In ARM
	// state, reading pc yields the address of the current instruction
	// plus 8, i.e., two instructions on.
	// NOTE(review): this assumes `callext' expands to exactly one
	// instruction so that pc + 8 is the start of the unrolled rounds --
	// confirm against asm-common.h.
	add	r3, r3, r3, lsl #1	// r3 = 3*nr
	rsbs	r3, r3, #3*14		// r3 = 3*(14 - nr); C set iff nr <= 14
	addcs	pc, pc, r3, lsl #2	// jump into the unrolled rounds
	callext	F(abort)		// too many rounds

	// The full rounds: add the round key, SubBytes and ShiftRows (AESE),
	// then MixColumns (AESMC).  The last round doesn't have MixColumns,
	// so do it separately.
  .rept	13
	vldmia	r0!, {d2, d3}
	aese.8	q0, q1
	aesmc.8	q0, q0
  .endr

	// Final round.
	vldmia	r0!, {d2, d3}
	aese.8	q0, q1

	// Final whitening.
	vldmia	r0!, {d2, d3}
	veor	q0, q1

	// All done.
	vrev32.8 q0, q0			// end-swap back for the caller
	vstmia	r2, {d0, d1}
	bx	r14

ENDFUNC

FUNC(rijndael_dblk_arm_crypto)

	// Decrypt a single 16-byte block using the decryption key schedule
	// stored in the context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block
	//
	// Only 16 bytes are read and written, so this handles four-word
	// blocks only.  Clobbers r0, r3, d0--d3 and the flags.  If the
	// context's round count exceeds 14, control falls into the call to
	// abort.

	// Set things up ready.
	ldr	r3, [r0, #nr]		// r3 = number of rounds
	add	r0, r0, #wi		// r0 = cursor over the round keys
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0			// end-swap each word of the input

	// Dispatch according to the number of rounds.
	//
	// Each full round below is three instructions, and the unrolled code
	// is sized for 14 rounds, so skip 3*(14 - nr) instructions.  In ARM
	// state, reading pc yields the address of the current instruction
	// plus 8, i.e., two instructions on.
	// NOTE(review): this assumes `callext' expands to exactly one
	// instruction so that pc + 8 is the start of the unrolled rounds --
	// confirm against asm-common.h.
	add	r3, r3, r3, lsl #1	// r3 = 3*nr
	rsbs	r3, r3, #3*14		// r3 = 3*(14 - nr); C set iff nr <= 14
	addcs	pc, pc, r3, lsl #2	// jump into the unrolled rounds
	callext	F(abort)		// too many rounds

	// The full rounds: add the round key and apply the inverse SubBytes
	// and ShiftRows (AESD), then the inverse MixColumns (AESIMC).  The
	// last round doesn't have MixColumns, so do it separately.
  .rept	13
	vldmia	r0!, {d2, d3}
	aesd.8	q0, q1
	aesimc.8 q0, q0
  .endr

	// Final round.
	vldmia	r0!, {d2, d3}
	aesd.8	q0, q1

	// Final whitening.
	vldmia	r0!, {d2, d3}
	veor	q0, q1

	// All done.
	vrev32.8 q0, q0			// end-swap back for the caller
	vstmia	r2, {d0, d1}
	bx	r14

ENDFUNC

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
26e182fc MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// ARM crypto-extension-based implementation of Rijndael
	4	///
	5	/// (c) 2016 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.globl F(abort)
	34	.globl F(rijndael_rcon)
	35
	36	///--------------------------------------------------------------------------
	37	/// Main code.
	38
	39	.arch armv8-a
	40	.fpu crypto-neon-fp-armv8
	41
	42	/// The ARM crypto extension implements a little-endian version of AES
	43	/// (though the manual doesn't actually spell this out and you have to
	44	/// experiment), but Catacomb's internal interface presents as big-endian so
	45	/// as to work better with things like GCM. We therefore maintain the round
	46	/// keys in little-endian form, and have to end-swap blocks in and out.
	47	///
	48	/// For added amusement, the crypto extension doesn't implement the larger-
	49	/// block versions of Rijndael, so we have to end-swap the keys if we're
	50	/// preparing for one of those.
	51
	52	// Useful constants.
	53	.equ maxrounds, 16 // maximum number of rounds
	54	.equ maxblksz, 32 // maximum block size, in bytes
	55	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
	56
	57	// Context structure.
	58	.equ nr, 0 // number of rounds
	59	.equ w, nr + 4 // encryption key words
	60	.equ wi, w + kbufsz // decryption key words
	61
	62	///--------------------------------------------------------------------------
	63	/// Key setup.
	64
65	FUNC(rijndael_setup_arm_crypto)
66
67	// Arguments:
68	// r0 = pointer to context
69	// r1 = block size in words
70	// r2 = pointer to key material
71	// r3 = key size in words
72
73	stmfd sp!, {r4-r9, r14}
74
75	// The initial round key material is taken directly from the input
76	// key, so copy it over. Unfortunately, the key material is not
77	// guaranteed to be aligned in any especially useful way, so we must
78	// sort this out.
79	add r9, r0, #w
80	mov r14, r3
81	ands r4, r2, #3
82	beq 1f
83	mov r4, r4, lsl #3
84	rsb r5, r4, #32
85	bic r2, r2, #3
86	ldr r6, [r2], #4
87
88	0: ldr r7, [r2], #4
89	mov r6, r6, lsr r4
90	orr r6, r7, lsl r5
91	str r6, [r9], #4
92	mov r6, r7
93	subs r14, r14, #1
94	bhi 0b
95	b 9f
96
97	1: ldr r6, [r2], #4
98	str r6, [r9], #4
99	subs r14, r14, #1
100	bhi 1b
101
102	// Find out other useful things and prepare for the main loop.
103	ldr r7, [r0, #nr] // number of rounds
104	mla r2, r1, r7, r1 // total key size in words
105	ldr r4, [r9, #-4] // most recent key word
106	leaextq r5, rijndael_rcon // round constants
107	sub r8, r2, r3 // minus what we've copied already
108	veor q1, q1 // all-zero register for the key
109	add r8, r9, r8, lsl #2 // limit of the key buffer
110
111	// Main key expansion loop. The first word of each key-length chunk
112	// needs special treatment.
113	9: ldrb r14, [r5], #1 // next round constant
114	ldr r6, [r9, -r3, lsl #2]
115	vdup.32 q0, r4
116	aese.8 q0, q1 // effectively, just SubBytes
117	vmov.32 r4, d0[0]
118	eor r4, r14, r4, ror #8
119	eor r4, r4, r6
120	str r4, [r9], #4
121	cmp r9, r8
122	bcs 8f
123
124	// The next three words are simple.
125	ldr r6, [r9, -r3, lsl #2]
126	eor r4, r4, r6
127	str r4, [r9], #4
128	cmp r9, r8
129	bcs 8f
130
131	// (Word 2...)
132	ldr r6, [r9, -r3, lsl #2]
133	eor r4, r4, r6
134	str r4, [r9], #4
135	cmp r9, r8
136	bcs 8f
137
138	// (Word 3...)
139	ldr r6, [r9, -r3, lsl #2]
140	eor r4, r4, r6
141	str r4, [r9], #4
142	cmp r9, r8
143	bcs 8f
144
145	// Word 4. If the key is /more/ than 6 words long, then we must
146	// apply a substitution here.
147	cmp r3, #5
148	bcc 9b
149	ldr r6, [r9, -r3, lsl #2]
150	cmp r3, #7
151	bcc 0f
152	vdup.32 q0, r4
153	aese.8 q0, q1 // effectively, just SubBytes
154	vmov.32 r4, d0[0]
155	0: eor r4, r4, r6
156	str r4, [r9], #4
157	cmp r9, r8
158	bcs 8f
159
160	// (Word 5...)
161	cmp r3, #6
162	bcc 9b
163	ldr r6, [r9, -r3, lsl #2]
164	eor r4, r4, r6
165	str r4, [r9], #4
166	cmp r9, r8
167	bcs 8f
168
169	// (Word 6...)
170	cmp r3, #7
171	bcc 9b
172	ldr r6, [r9, -r3, lsl #2]
173	eor r4, r4, r6
174	str r4, [r9], #4
175	cmp r9, r8
176	bcs 8f
177
178	// (Word 7...)
179	cmp r3, #8
180	bcc 9b
181	ldr r6, [r9, -r3, lsl #2]
182	eor r4, r4, r6
183	str r4, [r9], #4
184	cmp r9, r8
185	bcs 8f
186
187	// Must be done by now.
188	b 9b
189
190	// Next job is to construct the decryption keys. The keys for the
191	// first and last rounds don't need to be mangled, but the remaining
192	// ones do -- and they all need to be reordered too.
193	//
194	// The plan of action, then, is to copy the final encryption round's
195	// keys into place first, then to do each of the intermediate rounds
196	// in reverse order, and finally do the first round.
197	//
198	// Do all the heavy lifting with NEON registers. The order we're
199	// doing this in means that it's OK if we read or write too much, and
200	// there's easily enough buffer space for the over-enthusiastic reads
201	// and writes because the context has space for 32-byte blocks, which
202	// is our maximum and an exact fit for two Q-class registers.
203	8: add r5, r0, #wi
204	add r4, r0, #w
205	add r4, r4, r2, lsl #2
206	sub r4, r4, r1, lsl #2 // last round's keys
207
208	// Copy the last encryption round's keys.
209	teq r1, #4
210	vldmiaeq r4, {d0, d1}
211	vldmiane r4, {d0-d3}
212	vstmiaeq r5, {d0, d1}
213	vstmiane r5, {d0-d3}
214
215	// Update the loop variables and stop if we've finished.
216	9: sub r4, r4, r1, lsl #2
217	add r5, r5, r1, lsl #2
218	subs r7, r7, #1
219	beq 0f
220
221	// Do another middle round's keys...
222	teq r1, #4
223	vldmiaeq r4, {d0, d1}
224	vldmiane r4, {d0-d3}
225	aesimc.8 q0, q0
226	vstmiaeq r5, {d0, d1}
227	beq 9b
228	aesimc.8 q1, q1
229	vstmia r5, {d0-d3}
230	b 9b
231
232	// Finally do the first encryption round.
233	0: teq r1, #4
234	vldmiaeq r4, {d0, d1}
235	vldmiane r4, {d0-d3}
236	vstmiaeq r5, {d0, d1}
237	vstmiane r5, {d0-d3}
238
239	// If the block size is not exactly four words then we must end-swap
240	// everything. We can use fancy NEON toys for this.
241	beq 0f
242
243	// End-swap the encryption keys.
244	add r1, r0, #w
245	bl endswap_block
246
247	// And the decryption keys
248	add r1, r0, #wi
249	bl endswap_block
250
251	// All done.
252	0: ldmfd sp!, {r4-r9, pc}
253
254	endswap_block:
255	// End-swap R2 words starting at R1. R1 is clobbered; R2 is not.
256	// It's OK to work in 16-byte chunks.
257	mov r4, r2
258	0: vldmia r1, {d0, d1}
259	vrev32.8 q0, q0
260	vstmia r1!, {d0, d1}
261	subs r4, r4, #4
262	bhi 0b
263	bx r14
264
265	ENDFUNC
266
267	///--------------------------------------------------------------------------
268	/// Encrypting and decrypting blocks.
269
270	FUNC(rijndael_eblk_arm_crypto)
271
272	// Arguments:
273	// r0 = pointer to context
274	// r1 = pointer to input block
275	// r2 = pointer to output block
276
277	// Set things up ready.
278	ldr r3, [r0, #nr]
279	add r0, r0, #w
280	vldmia r1, {d0, d1}
281	vrev32.8 q0, q0
282
283	// Dispatch according to the number of rounds.
284	add r3, r3, r3, lsl #1
285	rsbs r3, r3, #3*14
286	addcs pc, pc, r3, lsl #2
287	callext F(abort)
288
289	// The last round doesn't have MixColumns, so do it separately.
9ba8a1d0	290	.rept 13
26e182fc MW	291	vldmia r0!, {d2, d3}
	292	aese.8 q0, q1
	293	aesmc.8 q0, q0
9ba8a1d0	294	.endr
26e182fc MW	295
	296	// Final round.
	297	vldmia r0!, {d2, d3}
	298	aese.8 q0, q1
	299
	300	// Final whitening.
	301	vldmia r0!, {d2, d3}
	302	veor q0, q1
	303
	304	// All done.
	305	vrev32.8 q0, q0
	306	vstmia r2, {d0, d1}
	307	bx r14
	308
	309	ENDFUNC
	310
	311	FUNC(rijndael_dblk_arm_crypto)
	312
	313	// Arguments:
	314	// r0 = pointer to context
	315	// r1 = pointer to input block
	316	// r2 = pointer to output block
	317
	318	// Set things up ready.
	319	ldr r3, [r0, #nr]
	320	add r0, r0, #wi
	321	vldmia r1, {d0, d1}
	322	vrev32.8 q0, q0
	323
	324	// Dispatch according to the number of rounds.
	325	add r3, r3, r3, lsl #1
	326	rsbs r3, r3, #3*14
	327	addcs pc, pc, r3, lsl #2
	328	callext F(abort)
	329
	330	// The last round doesn't have MixColumns, so do it separately.
9ba8a1d0	331	.rept 13
26e182fc MW	332	vldmia r0!, {d2, d3}
	333	aesd.8 q0, q1
	334	aesimc.8 q0, q0
9ba8a1d0	335	.endr
26e182fc MW	336
	337	// Final round.
	338	vldmia r0!, {d2, d3}
	339	aesd.8 q0, q1
	340
	341	// Final whitening.
	342	vldmia r0!, {d2, d3}
	343	veor q0, q1
	344
	345	// All done.
	346	vrev32.8 q0, q0
	347	vstmia r2, {d0, d1}
	348	bx r14
	349
	350	ENDFUNC
	351
	352	///----- That's all, folks --------------------------------------------------