[catacomb] / symm / rijndael-arm64-crypto.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.  The key-schedule buffer is sized for the worst
	// case: one round key of the maximum block size for each round, plus
	// one more for the final whitening step, i.e. 32*(16 + 1) = 544
	// bytes.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.  These are byte offsets from the context
	// pointer: a 4-byte round count followed by two key-schedule
	// buffers of kbufsz bytes each.
	// NOTE(review): presumably these must agree with a context
	// structure declared on the C side -- verify against that
	// declaration if either changes.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Expand a key into the encryption (w) and decryption (wi) round-key
	// tables of a context.
	//
	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words
	//
	// The round count is read from the nr slot of the context, so that
	// slot must already be filled in when this is called.  Nothing is
	// returned in a register; the w and wi areas of the context are
	// written.
	//
	// Three stages follow: (1) copy the key and run the key expansion
	// into w; (2) build wi from w, in reverse round order; (3) for block
	// sizes other than four words, end-swap both tables.
	//
	// NOTE(review): x3 is used below as a full 64-bit quantity (in mov,
	// sub and cmp).  If the C prototype declares the key size with a
	// 32-bit type, AAPCS64 leaves the top half of x3 unspecified --
	// confirm against the caller.
	//
	// NOTE(review): the copy loop decrements before it tests, so it
	// assumes x3 is nonzero -- confirm that callers never pass an empty
	// key.

	// The bl instructions further down overwrite x30, so the link
	// register has to be saved.  (pushreg and popreg are not defined in
	// this file; presumably they come from asm-common.h.)
	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	//
	// Two values deliberately survive this loop and are picked up by the
	// expansion loop: x4, the write pointer into w, and w14, the most
	// recently stored key word.
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
	//
	// x2 is reused from here on: the key pointer is no longer needed,
	// and w2 becomes the total schedule length, (rounds + 1) blocks'
	// worth of words.  x7 starts at the beginning of w while x4 is
	// already x3 words further on, so x7 always addresses the word one
	// key-length behind the one being written.
	//
	// (Nothing visible in this file branches to this `9' label; the
	// `9f' in the loop below resolves to the next `9' after it.)
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
	//
	// Loop state: w14 = previous word written; w15 = word one key-length
	// back; x8 = index within the current key-length cycle; x6 = words
	// still to produce; x5 = next round-constant byte.
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f			// unsigned: skip if fewer than seven

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
	//
	// The word is replicated into all four columns, so the row shift
	// that aese also performs moves bytes only between identical
	// columns and has no visible effect; the zero key in v1 makes the
	// key addition a no-op too.
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.  The round constants are read a byte at a time, and the
	// constant lands in the low byte of the rotated word.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	//
	// cbz does not alter the flags, so the conditional move still sees
	// the result of the cmp.  (cmov.cs is not defined in this file --
	// presumably a macro from asm-common.h; going by the comment above,
	// it zeroes x8 once x8 has reached x3.)
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	cmov.cs	x8, xzr
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
	//
	// Here x5 walks forwards through wi and x4 walks backwards through
	// w, both one block (w1 words) at a time.
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2		// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.  w9 still
	// holds the round count loaded earlier, so the body below runs once
	// for each round strictly between the first and the last.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...  Each store always covers 32
	// bytes; when the block is shorter than that, the excess is
	// overwritten by the store of the following iteration (or by the
	// final copy below).
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b		// inverse MixColumns on the key
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.  endswap_block takes its pointer in
	// x1 and its word count in w2; w2 still holds the total schedule
	// length computed before the expansion loop, and the block size in
	// w1 is not needed again.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret

ENDFUNC

INTFUNC(endswap_block)
	// Reverse the byte order of each 32-bit word in a buffer, in place.
	//
	// On entry:
	//	x1 = address of the first word
	//	w2 = number of words to process
	//
	// On exit x1 has been advanced past the processed data, w2 is
	// unchanged, and w3, v0 and the flags are clobbered.
	//
	// Data is handled a whole 16-byte vector at a time, so a count which
	// is not a multiple of four is effectively rounded up; the caller
	// must have room for that.  At least one vector is always processed.

	mov	w3, w2			// w3 = words not yet handled

	// Swap one vector's worth of words.
0:	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x1], #16

	// Account for the four words just done, and go round again if
	// strictly more than four were outstanding beforehand.
	subs	w3, w3, #4
	b.hi	0b
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

// One full cipher round applied to the state in v0: the `aes' instruction
// (key addition with round key `k', byte substitution and row shift)
// followed by the column-mixing instruction `mc'.
.macro	rj_fullround aes, mc, k
	\aes	v0.16b, \k\().16b
	\mc	v0.16b, v0.16b
.endm

// Define the block function rijndael_<op>_arm64_crypto.  `aes' and `mc' are
// the round and column-mixing instructions to use, and `koff' is the offset
// within the context of the key schedule to read.
.macro	encdec	op, aes, mc, koff
  FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block
	//
	// Only 16-byte blocks are handled here.  The state lives in v0
	// throughout; round keys pass through v16--v23.

	// Pick up the round count, point x0 at the key schedule, and load
	// the block, end-swapping it on the way in.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Dispatch on the number of rounds.  Every case other than ten
	// performs its extra leading rounds and then joins the ten-round
	// code; any other round count is a caller error.
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds: one extra.
11:	ld1	{v16.4s}, [x0], #16
	rj_fullround \aes, \mc, v16
	b	10f

	// Twelve rounds: two extra.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	rj_fullround \aes, \mc, v16
	rj_fullround \aes, \mc, v17
	b	10f

	// Thirteen rounds: three extra.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	rj_fullround \aes, \mc, v16
	rj_fullround \aes, \mc, v17
	rj_fullround \aes, \mc, v18
	b	10f

	// Fourteen rounds: four extra.  This case is placed last so that it
	// can fall straight into the ten-round code without a branch.
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	rj_fullround \aes, \mc, v16
	rj_fullround \aes, \mc, v17
	rj_fullround \aes, \mc, v18
	rj_fullround \aes, \mc, v19
	// Fall through...

	// Ten rounds.  Fetch the first eight round keys and work through
	// the first four of them.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	rj_fullround \aes, \mc, v16
	rj_fullround \aes, \mc, v17
	rj_fullround \aes, \mc, v18
	rj_fullround \aes, \mc, v19

	// Fetch the last three round keys into the registers just freed,
	// and work through the next four rounds.
	ld1	{v16.4s-v18.4s}, [x0], #48
	rj_fullround \aes, \mc, v20
	rj_fullround \aes, \mc, v21
	rj_fullround \aes, \mc, v22
	rj_fullround \aes, \mc, v23

	// The last-but-one round is an ordinary full round.
	rj_fullround \aes, \mc, v16

	// The final round omits the column mix, and is followed by the
	// closing whitening with the last round key.
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// End-swap the result and write it out.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret

  ENDFUNC
.endm

	// Instantiate the block-encryption and block-decryption functions.
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
e492db88 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// AArch64 crypto-extension-based implementation of Rijndael
	4	///
	5	/// (c) 2018 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
df07f2c0	28	/// Preliminaries.
e492db88 MW	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
df07f2c0 MW	33	.arch armv8-a+crypto
df07f2c0 MW	34
e492db88 MW	35	.extern F(abort)
	36	.extern F(rijndael_rcon)
	37
df07f2c0 MW	38	.text
df07f2c0 MW	39
e492db88 MW	40	///--------------------------------------------------------------------------
	41	/// Main code.
	42
e492db88 MW	43	/// The ARM crypto extension implements a little-endian version of AES
	44	/// (though the manual doesn't actually spell this out and you have to
	45	/// experiment), but Catacomb's internal interface presents as big-endian so
	46	/// as to work better with things like GCM. We therefore maintain the round
	47	/// keys in little-endian form, and have to end-swap blocks in and out.
	48	///
	49	/// For added amusement, the crypto extension doesn't implement the larger-
	50	/// block versions of Rijndael, so we have to end-swap the keys if we're
	51	/// preparing for one of those.
	52
	53	// Useful constants.
	54	.equ maxrounds, 16 // maximum number of rounds
	55	.equ maxblksz, 32 // maximum block size, in bytes
	56	.equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
	57
	58	// Context structure.
	59	.equ nr, 0 // number of rounds
	60	.equ w, nr + 4 // encryption key words
	61	.equ wi, w + kbufsz // decryption key words
	62
	63	///--------------------------------------------------------------------------
	64	/// Key setup.
	65
	66	FUNC(rijndael_setup_arm64_crypto)
	67
	68	// Arguments:
	69	// x0 = pointer to context
	70	// w1 = block size in 32-bit words
	71	// x2 = pointer to key material
	72	// x3 = key size in words
	73
	74	pushreg x29, x30
	75	mov x29, sp
	76
	77	// The initial round key material is taken directly from the input
	78	// key, so copy it over. Unfortunately, the key material is not
	79	// guaranteed to be aligned in any especially useful way. Assume
	80	// that alignment traps are not enabled. (Why would they be? On
	81	// A32, alignment traps were part of a transition plan which changed
	82	// the way unaligned loads and stores behaved, but there's never been
	83	// any other behaviour on A64.)
	84	mov x15, x3
	85	add x4, x0, #w
	86	0: sub x15, x15, #1
	87	ldr w14, [x2], #4
	88	str w14, [x4], #4
	89	cbnz x15, 0b
	90
	91	// Find out other useful things and prepare for the main loop.
	92	9: ldr w9, [x0, #nr] // number of rounds
	93	madd w2, w1, w9, w1 // total key size in words
	94	leaext x5, rijndael_rcon // round constants
	95	sub x6, x2, x3 // minus what we've copied already
	96	add x7, x0, #w // position in previous cycle
	97	movi v1.4s, #0 // all-zero register for the key
	98	mov x8, #0 // position in current cycle
	99
	100	// Main key expansion loop. Dispatch according to the position in
	101	// the cycle.
	102	0: ldr w15, [x7], #4 // word from previous cycle
	103	cbz x8, 1f // first word of the cycle?
	104	cmp x8, #4 // fourth word of the cycle?
	105	b.ne 2f
	106	cmp x3, #7 // seven or eight words of key?
107	b.cc 2f
108
109	// Fourth word of the cycle, seven or eight words of key. We must do
110	// the byte substitution.
111	dup v0.4s, w14
112	aese v0.16b, v1.16b // effectively, just SubBytes
cead42fc	113	mov w14, v0.s[0]
e492db88 MW	114	b 2f
	115
	116	// First word of the cycle. Byte substitution, rotation, and round
	117	// constant.
	118	1: ldrb w13, [x5], #1 // next round constant
	119	dup v0.4s, w14
	120	aese v0.16b, v1.16b // effectively, just SubBytes
cead42fc	121	mov w14, v0.s[0]
e492db88 MW	122	eor w14, w13, w14, ror #8
	123
	124	// Common ending: mix in the word from the previous cycle and store.
	125	2: eor w14, w14, w15
	126	str w14, [x4], #4
	127
	128	// Prepare for the next iteration. If we're done, then stop; if
	129	// we've finished a cycle then reset the counter.
	130	add x8, x8, #1
	131	sub x6, x6, #1
	132	cmp x8, x3
	133	cbz x6, 9f
5f49478b	134	cmov.cs x8, xzr
e492db88 MW	135	b 0b
	136
	137	// Next job is to construct the decryption keys. The keys for the
	138	// first and last rounds don't need to be mangled, but the remaining
	139	// ones do -- and they all need to be reordered too.
	140	//
	141	// The plan of action, then, is to copy the final encryption round's
	142	// keys into place first, then to do each of the intermediate rounds
	143	// in reverse order, and finally do the first round.
	144	//
	145	// Do all the heavy lifting with the vector registers. The order
	146	// we're doing this in means that it's OK if we read or write too
	147	// much, and there's easily enough buffer space for the
	148	// over-enthusiastic reads and writes because the context has space
	149	// for 32-byte blocks, which is our maximum and an exact fit for two
	150	// full-width registers.
	151	9: add x5, x0, #wi
	152	add x4, x0, #w
	153	add x4, x4, w2, uxtw #2
	154	sub x4, x4, w1, uxtw #2 // last round's keys
	155
	156	// Copy the last encryption round's keys.
	157	ld1 {v0.4s, v1.4s}, [x4]
	158	st1 {v0.4s, v1.4s}, [x5]
	159
	160	// Update the loop variables and stop if we've finished.
	161	0: sub w9, w9, #1
	162	add x5, x5, w1, uxtw #2
	163	sub x4, x4, w1, uxtw #2
	164	cbz w9, 9f
	165
	166	// Do another middle round's keys...
	167	ld1 {v0.4s, v1.4s}, [x4]
	168	aesimc v0.16b, v0.16b
	169	aesimc v1.16b, v1.16b
	170	st1 {v0.4s, v1.4s}, [x5]
	171	b 0b
	172
	173	// Finally do the first encryption round.
	174	9: ld1 {v0.4s, v1.4s}, [x4]
	175	st1 {v0.4s, v1.4s}, [x5]
	176
	177	// If the block size is not exactly four words then we must end-swap
	178	// everything. We can use fancy vector toys for this.
	179	cmp w1, #4
	180	b.eq 9f
	181
	182	// End-swap the encryption keys.
	183	add x1, x0, #w
	184	bl endswap_block
	185
	186	// And the decryption keys
	187	add x1, x0, #wi
	188	bl endswap_block
	189
	190	// All done.
	191	9: popreg x29, x30
	192	ret
	193
	194	ENDFUNC
	195
	196	INTFUNC(endswap_block)
	197	// End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
	198	// It's OK to work in 16-byte chunks.
199
200	mov w3, w2
201	0: subs w3, w3, #4
202	ld1 {v0.4s}, [x1]
203	rev32 v0.16b, v0.16b
204	st1 {v0.4s}, [x1], #16
205	b.hi 0b
206	ret
207
208	ENDFUNC
209
210	///--------------------------------------------------------------------------
211	/// Encrypting and decrypting blocks.
212
213	.macro encdec op, aes, mc, koff
214	FUNC(rijndael_\op\()_arm64_crypto)
215
216	// Arguments:
217	// x0 = pointer to context
218	// x1 = pointer to input block
219	// x2 = pointer to output block
220
221	// Set things up ready.
222	ldr w3, [x0, #nr]
223	add x0, x0, #\koff
224	ld1 {v0.4s}, [x1]
225	rev32 v0.16b, v0.16b
226
227	// Check the number of rounds and dispatch.
228	cmp w3, #14
229	b.eq 14f
230	cmp w3, #10
231	b.eq 10f
232	cmp w3, #12
233	b.eq 12f
234	cmp w3, #13
235	b.eq 13f
236	cmp w3, #11
237	b.eq 11f
238	callext F(abort)
239
240	// Eleven rounds.
241	11: ld1 {v16.4s}, [x0], #16
242	\aes v0.16b, v16.16b
243	\mc v0.16b, v0.16b
244	b 10f
245
246	// Twelve rounds.
247	12: ld1 {v16.4s, v17.4s}, [x0], #32
248	\aes v0.16b, v16.16b
249	\mc v0.16b, v0.16b
250	\aes v0.16b, v17.16b
251	\mc v0.16b, v0.16b
252	b 10f
253
254	// Thirteen rounds.
255	13: ld1 {v16.4s-v18.4s}, [x0], #48
256	\aes v0.16b, v16.16b
257	\mc v0.16b, v0.16b
258	\aes v0.16b, v17.16b
259	\mc v0.16b, v0.16b
260	\aes v0.16b, v18.16b
261	\mc v0.16b, v0.16b
262	b 10f
263
264	// Fourteen rounds. (Drops through to the ten round case because
265	// this is the next most common.)
266	14: ld1 {v16.4s-v19.4s}, [x0], #64
267	\aes v0.16b, v16.16b
268	\mc v0.16b, v0.16b
269	\aes v0.16b, v17.16b
270	\mc v0.16b, v0.16b
271	\aes v0.16b, v18.16b
272	\mc v0.16b, v0.16b
273	\aes v0.16b, v19.16b
274	\mc v0.16b, v0.16b
275	// Drop through...
276
277	// Ten rounds.
278	10: ld1 {v16.4s-v19.4s}, [x0], #64
279	ld1 {v20.4s-v23.4s}, [x0], #64
280	\aes v0.16b, v16.16b
281	\mc v0.16b, v0.16b
282	\aes v0.16b, v17.16b
283	\mc v0.16b, v0.16b
284	\aes v0.16b, v18.16b
285	\mc v0.16b, v0.16b
286	\aes v0.16b, v19.16b
287	\mc v0.16b, v0.16b
288
289	ld1 {v16.4s-v18.4s}, [x0], #48
290	\aes v0.16b, v20.16b
291	\mc v0.16b, v0.16b
292	\aes v0.16b, v21.16b
293	\mc v0.16b, v0.16b
294	\aes v0.16b, v22.16b
295	\mc v0.16b, v0.16b
296	\aes v0.16b, v23.16b
297	\mc v0.16b, v0.16b
298
299	// Final round has no MixColumns, but is followed by final whitening.
300	\aes v0.16b, v16.16b
301	\mc v0.16b, v0.16b
302	\aes v0.16b, v17.16b
303	eor v0.16b, v0.16b, v18.16b
304
305	// All done.
306	rev32 v0.16b, v0.16b
307	st1 {v0.4s}, [x2]
308	ret
309
310	ENDFUNC
311	.endm
312
313	encdec eblk, aese, aesmc, w
314	encdec dblk, aesd, aesimc, wi
315
316	///----- That's all, folks --------------------------------------------------