/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// ARM crypto-extension-based implementation of Rijndael
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.  `kbufsz' is measured in bytes: one maximum-size
	// block's worth of key material for each round, plus one more for
	// the extra (whitening) round key.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure: byte offsets of the fields, as used by the
	// code below.  The round count is a single 32-bit word, followed
	// immediately by the two key-schedule buffers.
	// NOTE(review): presumably this mirrors the C-side context
	// structure -- confirm the layouts agree.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm_crypto)

	// Expand a Rijndael key into the context's encryption (`w') and
	// decryption (`wi') round-key tables.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = block size in words
	//	r2 = pointer to key material
	//	r3 = key size in words
	//
	// The number of rounds is read from the context's `nr' field; this
	// function never writes it.
	//
	// Register use during key expansion:
	//	r4 = most recently generated key word
	//	r5 = pointer to next round constant
	//	r6 = scratch (key word from one key-length earlier)
	//	r7 = number of rounds
	//	r8 = limit of the encryption key buffer
	//	r9 = pointer to next key word to write
	//	r14 = scratch (loop counter, round constant)
	//	q0 = scratch for SubBytes; q1 = all-zero key
	//
	// r4--r9 and r14 are saved on entry and restored on exit.  r1--r3,
	// q0, q1 and the flags are clobbered.

	stmfd	sp!, {r4-r9, r14}

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way, so we must
	// sort this out.
	add	r9, r0, #w		// destination for key words
	mov	r14, r3			// words remaining to copy
	ands	r4, r2, #3		// byte offset within a word
	beq	1f			// aligned: use the simple loop

	// Unaligned case.  Read the aligned words which contain the key and
	// stitch adjacent pairs together with shifts.  Only words which
	// contain at least one key byte are read.
	mov	r4, r4, lsl #3		// right shift for low word, in bits
	rsb	r5, r4, #32		// left shift for high word, in bits
	bic	r2, r2, #3		// round the pointer down
	ldr	r6, [r2], #4		// prime with the first partial word

0:	ldr	r7, [r2], #4		// next aligned word
	mov	r6, r6, lsr r4		// tail of the previous word...
	orr	r6, r7, lsl r5		// ... joined to head of this one
	str	r6, [r9], #4
	mov	r6, r7			// this word is the next `previous'
	subs	r14, r14, #1
	bhi	0b

	// r4, r5 and r7 were used as scratch above, so we must go through
	// the preparation code below before entering the expansion loop.
	b	2f

	// Aligned case: a straightforward word-by-word copy.
1:	ldr	r6, [r2], #4
	str	r6, [r9], #4
	subs	r14, r14, #1
	bhi	1b

	// Find out other useful things and prepare for the main loop.  Both
	// copy loops arrive here with r9 pointing just past the copied key.
2:	ldr	r7, [r0, #nr]		// number of rounds
	mla	r2, r1, r7, r1		// total key size in words
	ldr	r4, [r9, #-4]		// most recent key word
	leaextq	r5, rijndael_rcon	// round constants
	sub	r8, r2, r3		// minus what we've copied already
	veor	q1, q1			// all-zero register for the key
	add	r8, r9, r8, lsl #2	// limit of the key buffer

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment: substitute, rotate, and mix in the round
	// constant.
	//
	// The substitution is done by broadcasting the word to all four
	// columns and encrypting with a zero key: since all of the columns
	// are equal, the ShiftRows step has no visible effect.
9:	ldrb	r14, [r5], #1		// next round constant
	ldr	r6, [r9, -r3, lsl #2]	// word from one key-length ago
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	eor	r4, r14, r4, ror #8	// rotate and add round constant
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// The next three words are simple.
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 2...)
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 3...)
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.
	cmp	r3, #5
	bcc	9b			// four-word key: chunk complete
	ldr	r6, [r9, -r3, lsl #2]
	cmp	r3, #7
	bcc	0f			// five or six words: no substitution
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
0:	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 5...)
	cmp	r3, #6
	bcc	9b			// five-word key: chunk complete
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 6...)
	cmp	r3, #7
	bcc	9b			// six-word key: chunk complete
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// (Word 7...)
	cmp	r3, #8
	bcc	9b			// seven-word key: chunk complete
	ldr	r6, [r9, -r3, lsl #2]
	eor	r4, r4, r6
	str	r4, [r9], #4
	cmp	r9, r8
	bcs	8f

	// Must be done by now.
	b	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with NEON registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two Q-class registers.
	//
	// Here, r4 walks backwards through the encryption keys, r5 walks
	// forwards through the decryption keys, and r7 counts rounds.
8:	add	r5, r0, #wi
	add	r4, r0, #w
	add	r4, r4, r2, lsl #2
	sub	r4, r4, r1, lsl #2	// last round's keys

	// Copy the last encryption round's keys.  A four-word block fits in
	// one Q register; anything else gets the full two.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// Update the loop variables and stop if we've finished.
9:	sub	r4, r4, r1, lsl #2
	add	r5, r5, r1, lsl #2
	subs	r7, r7, #1
	beq	0f

	// Do another middle round's keys...
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	aesimc.8 q0, q0
	vstmiaeq r5, {d0, d1}
	beq	9b
	aesimc.8 q1, q1
	vstmia	r5, {d0-d3}
	b	9b

	// Finally do the first encryption round.
0:	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy NEON toys for this.  (The flags are
	// still those set by the `teq' above: the NEON loads and stores
	// don't disturb them.)
	beq	0f

	// End-swap the encryption keys.
	add	r1, r0, #w
	bl	endswap_block

	// And the decryption keys
	add	r1, r0, #wi
	bl	endswap_block

	// All done.
0:	ldmfd	sp!, {r4-r9, pc}

endswap_block:
	// End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
	// It's OK to work in 16-byte chunks.  Also clobbers r4, q0 and the
	// flags; returns through r14.
	mov	r4, r2
0:	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0
	vstmia	r1!, {d0, d1}
	subs	r4, r4, #4
	bhi	0b
	bx	r14

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_arm_crypto)

	// Encrypt a single 16-byte block using the encryption round keys
	// stored at offset `w' in the context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block
	//
	// Clobbers r0, r3, q0, q1 and the flags.  If the context's round
	// count is greater than 14 then `abort' is invoked instead.

	// Set things up ready.  Fetch the round count, point r0 at the
	// encryption keys, and load and end-swap the input block.
	ldr	r3, [r0, #nr]
	add	r0, r0, #w
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0

	// Dispatch according to the number of rounds.  Compute
	// r3 = 3*(14 - nr); the carry flag is set iff nr <= 14.  Each
	// unrolled round below is three instructions long, so adding 4*r3
	// bytes to the program counter skips the first 14 - nr rounds.
	//
	// NOTE(review): this arithmetic assumes ARM (not Thumb) encoding,
	// where reading pc yields the address of the `addcs' plus 8, and
	// that `callext' expands to exactly one four-byte instruction, so
	// that a zero offset lands on the first unrolled round -- confirm
	// against asm-common.h.
	add	r3, r3, r3, lsl #1
	rsbs	r3, r3, #3*14
	addcs	pc, pc, r3, lsl #2
	callext	F(abort)

	// The last round doesn't have MixColumns, so do it separately.
	// Each of these rounds loads the next round key into q1, then
	// applies it with `aese' followed by `aesmc'.
	.rept	13
	vldmia	r0!, {d2, d3}
	aese.8	q0, q1
	aesmc.8	q0, q0
	.endr

	// Final round.
	vldmia	r0!, {d2, d3}
	aese.8	q0, q1

	// Final whitening: just XOR in the last round key.
	vldmia	r0!, {d2, d3}
	veor	q0, q1

	// All done.  End-swap the result and store it.
	vrev32.8 q0, q0
	vstmia	r2, {d0, d1}
	bx	r14

ENDFUNC

FUNC(rijndael_dblk_arm_crypto)

	// Decrypt a single 16-byte block using the decryption round keys
	// stored at offset `wi' in the context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block
	//
	// Clobbers r0, r3, q0, q1 and the flags.  If the context's round
	// count is greater than 14 then `abort' is invoked instead.

	// Set things up ready.  Fetch the round count, point r0 at the
	// decryption keys, and load and end-swap the input block.
	ldr	r3, [r0, #nr]
	add	r0, r0, #wi
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0

	// Dispatch according to the number of rounds.  Compute
	// r3 = 3*(14 - nr); the carry flag is set iff nr <= 14.  Each
	// unrolled round below is three instructions long, so adding 4*r3
	// bytes to the program counter skips the first 14 - nr rounds.
	//
	// NOTE(review): this arithmetic assumes ARM (not Thumb) encoding,
	// where reading pc yields the address of the `addcs' plus 8, and
	// that `callext' expands to exactly one four-byte instruction, so
	// that a zero offset lands on the first unrolled round -- confirm
	// against asm-common.h.
	add	r3, r3, r3, lsl #1
	rsbs	r3, r3, #3*14
	addcs	pc, pc, r3, lsl #2
	callext	F(abort)

	// The last round doesn't have MixColumns, so do it separately.
	// Each of these rounds loads the next round key into q1, then
	// applies it with `aesd' followed by `aesimc'.
	.rept	13
	vldmia	r0!, {d2, d3}
	aesd.8	q0, q1
	aesimc.8 q0, q0
	.endr

	// Final round.
	vldmia	r0!, {d2, d3}
	aesd.8	q0, q1

	// Final whitening: just XOR in the last round key.
	vldmia	r0!, {d2, d3}
	veor	q0, q1

	// All done.  End-swap the result and store it.
	vrev32.8 q0, q0
	vstmia	r2, {d0, d1}
	bx	r14

ENDFUNC

///----- That's all, folks --------------------------------------------------