chiark - git - mdw - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.globl F(abort)
	34	.globl F(rijndael_rcon)
	35
	36	///--------------------------------------------------------------------------
	37	/// Main code.
	38
	39	.arch .aes // have the assembler accept the AES instructions
	40	.text // everything up to RODATA below is code
	41
	42	/// The AESNI instructions implement a little-endian version of AES, but
	43	/// Catacomb's internal interface presents as big-endian so as to work better
	44	/// with things like GCM. We therefore maintain the round keys in
	45	/// little-endian form, and have to end-swap blocks in and out.
	46	///
	47	/// For added amusement, the AESNI instructions don't implement the
	48	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	49	/// we're preparing for one of those.
	50
	51	// Useful constants.
	52	.equ maxrounds, 16 // maximum number of rounds
	53	.equ maxblksz, 32 // maximum block size, in bytes
		// A key schedule holds one more round key than there are rounds, and
		// each round key is as large as a block, so this comes to
		// 32*(16 + 1) = 544 bytes.
	54	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
	55
	56	// Context structure. These are byte offsets from the context base
		// pointer. NOTE(review): presumably they must agree with the layout
		// of the context as the callers declare it -- confirm.
	57	.equ nr, 0 // number of rounds
	58	.equ w, nr + 4 // encryption key words
	59	.equ wi, w + kbufsz // decryption key words
	60
	61	///--------------------------------------------------------------------------
	62	/// Key setup.
	63
	64	FUNC(rijndael_setup_x86ish_aesni)
		// Build the encryption and decryption key schedules in a context.
		//
		// Arguments, in the order the per-ABI prologues below collect them:
		//
		//   * the context base pointer (CTX): its `nr' field is read here
		//     but never written, so the caller must fill it in first;
		//   * the block size, in words (BLKSZ);
		//   * a pointer to the key material; and
		//   * the key size, in words (KSZ).
		//
		// On exit, `w' holds (nr + 1)*BLKSZ words of encryption round keys,
		// and `wi' holds the same round keys in reverse order, with all but
		// the first and last passed through AESIMC. If the block size isn't
		// exactly four words then both buffers are end-swapped as well.
		//
		// The two sizes are 32-bit quantities. On AMD64 the top halves of
		// the registers they arrive in can't be trusted, so the key size is
		// zero-extended before it's used as a REP count or as an index.
	65
	66	#define SI WHOLE(si)
	67	#define DI WHOLE(di)
	68
	69	#if CPUFAM_X86
	70	// Arguments are on the stack. We'll need to stack the caller's
	71	// register variables, but we'll manage.
	72
	73	# define CTX ebp // context pointer
		// The block size stays in its argument slot; this offset is only
		// right while exactly the four registers below are stacked.
	74	# define BLKSZ [esp + 24] // block size
	75
	76	# define KSZ ebx // key size
	77	# define NKW edx // total number of key words
	78	# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
	79	# define RCON ecx // round constants table
	80	# define LIM edx // limit pointer
	81	# define CYIX edi // index in shift-register cycle
	82
	83	# define NR ecx // number of rounds
	84	# define LRK eax // distance to last key
	85	# define BLKOFF edx // block size in bytes
	86
	87	// Stack the caller's registers.
	88	push ebp
	89	push ebx
	90	push esi
	91	push edi
	92
	93	// Set up our own variables.
	94	mov CTX, [esp + 20] // context base pointer
	95	mov SI, [esp + 28] // key material
	96	mov KSZ, [esp + 32] // key size, in words
	97	#endif
	98
	99	#if CPUFAM_AMD64 && ABI_SYSV
	100	// Arguments are in registers. We have plenty, but, to be honest,
	101	// the initial register allocation is a bit annoying.
	102
	103	# define CTX r8 // context pointer
	104	# define BLKSZ r9d // block size
	105
	106	# define KSZ edx // key size
	107	# define NKW r10d // total number of key words
	108	# define RCON rdi // round constants table
	109	# define LIM rcx // limit pointer
	110	# define CYIX r11d // index in shift-register cycle
	111
	112	# define NR ecx // number of rounds
	113	# define LRK eax // distance to last key
	114	# define BLKOFF r9d // block size in bytes
	115
	116	// Move arguments to more useful places.
	117	mov CTX, rdi // context base pointer
	118	mov BLKSZ, esi // block size in words
	119	mov SI, rdx // key material
	120	mov KSZ, ecx // key size, in words
	121	#endif
	122
	123	#if CPUFAM_AMD64 && ABI_WIN
	124	// Arguments are in different registers, and they're a little tight.
	125
	126	# define CTX r8 // context pointer
	127	# define BLKSZ edx // block size
	128
	129	# define KSZ r9d // key size
	130	# define NKW r10d // total number of key words
	131	# define RCON rdi // round constants table
	132	# define LIM rcx // limit pointer
	133	# define CYIX r11d // index in shift-register cycle
	134
	135	# define NR ecx // number of rounds
	136	# define LRK eax // distance to last key
	137	# define BLKOFF edx // block size in bytes
	138
	139	// We'll need the index registers, which belong to the caller in this
	140	// ABI.
	141	push rsi
	142	.seh_pushreg rsi
	143	push rdi
	144	.seh_pushreg rdi
	145	.seh_endprologue
	146
	147	// Move arguments to more useful places.
	148	mov rsi, r8 // key material
	149	mov CTX, rcx // context base pointer
		// The key size is already in the right register, but only its low
		// half is meaningful and we index with the whole thing later.
		mov KSZ, KSZ // key size, in words (zero-extend)
	150	#endif
	151
	152	// The initial round key material is taken directly from the input
	153	// key, so copy it over.
	154	#if CPUFAM_AMD64 && ABI_SYSV
	155	// We've been lucky. We already have a copy of the context pointer
	156	// in rdi, and the key size in ecx. The caller needn't have cleared
		// the top half of rcx, though, and REP counts with all of it.
	157	add rdi, w
		mov ecx, ecx // zero-extend the word count
	158	#else
	159	lea DI, [CTX + w]
	160	mov ecx, KSZ
	161	#endif
	162	rep movsd
	163
	164	// Find out other useful things.
	165	mov NKW, [CTX + nr] // number of rounds
	166	add NKW, 1
	167	imul NKW, BLKSZ // total key size in words
	168	#if !NKW_NEEDS_REFRESH
	169	// If we can't keep NKW for later, then we use the same register for
	170	// it and LIM, so this move is unnecessary.
	171	mov DWORD(LIM), NKW
	172	#endif
	173	sub DWORD(LIM), KSZ // offset by the key size
	174
	175	// Find the round constants.
	176	ldgot WHOLE(c)
	177	leaext RCON, F(rijndael_rcon), WHOLE(c)
	178
	179	// Prepare for the main loop.
	180	lea SI, [CTX + w]
	181	mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
	182	lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
	183	xor CYIX, CYIX // start of new cycle
	184
	185	// Main key expansion loop. The first word of each key-length chunk
	186	// needs special treatment.
	187	//
	188	// This is rather tedious because the Intel `AESKEYGENASSIST'
	189	// instruction is very strangely shaped. Firstly, it wants to
	190	// operate on vast SSE registers, even though we're data-blocked from
	191	// doing more than one operation at a time unless we're doing two key
	192	// schedules simultaneously -- and even then we can't do more than
	193	// two, because the instruction ignores two of its input words
	194	// entirely, and produces two different outputs for each of the other
	195	// two. And secondly it insists on taking the magic round constant
	196	// as an immediate, so it's kind of annoying if you're not
	197	// open-coding the whole thing. It's much easier to leave that as
	198	// zero and XOR in the round constant by hand.
		//
		// Throughout the loop, EAX holds the key word most recently written,
		// SI points KSZ words behind the slot about to be filled, and CYIX
		// counts words within the current key-length chunk.
	199	0: cmp CYIX, 0 // first word of the cycle?
	200	je 1f
	201	cmp CYIX, 4 // fourth word of the cycle?
	202	jne 2f
	203	cmp KSZ, 7 // and a large key?
	204	jb 2f
	205
	206	// Fourth word of the cycle, and seven or eight words of key. Do a
	207	// byte substitution.
	208	movd xmm0, eax
	209	pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
	210	aeskeygenassist xmm1, xmm0, 0
	211	movd eax, xmm1
	212	jmp 2f
	213
	214	// First word of the cycle. This is the complicated piece.
	215	1: movd xmm0, eax
	216	pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
	217	aeskeygenassist xmm1, xmm0, 0
	218	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
	219	movd eax, xmm1
	220	xor al, [RCON] // the round constant, by hand
	221	inc RCON
	222
	223	// Common tail. Mix in the corresponding word from the previous
	224	// cycle and prepare for the next loop.
	225	2: xor eax, [SI]
	226	mov [SI + 4*WHOLE(KSZ)], eax
	227	add SI, 4
	228	inc CYIX
	229	cmp SI, LIM
	230	jae 9f // all of the key words are done
	231	cmp CYIX, KSZ
	232	jb 0b
	233	xor CYIX, CYIX // wrap round to start a new cycle
	234	jmp 0b
	235
	236	// Next job is to construct the decryption keys. The keys for the
	237	// first and last rounds don't need to be mangled, but the remaining
	238	// ones do -- and they all need to be reordered too.
	239	//
	240	// The plan of action, then, is to copy the final encryption round's
	241	// keys into place first, then to do each of the intermediate rounds
	242	// in reverse order, and finally do the first round.
	243	//
	244	// Do all of the heavy lifting with SSE registers. The order we're
	245	// doing this in means that it's OK if we read or write too much, and
	246	// there's easily enough buffer space for the over-enthusiastic reads
	247	// and writes because the context has space for 32-byte blocks, which
	248	// is our maximum and an exact fit for two SSE registers.
	249	9: mov NR, [CTX + nr] // number of rounds
	250	#if NKW_NEEDS_REFRESH
	251	mov BLKOFF, BLKSZ
	252	mov LRK, NR
	253	imul LRK, BLKOFF
	254	#else
	255	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	256	// because we won't need the former again.
	257	mov LRK, NKW
	258	sub LRK, BLKSZ
	259	#endif
	260	lea DI, [CTX + wi]
	261	lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
	262	shl BLKOFF, 2 // block size (in bytes now)
	263
	264	// Copy the last encryption round's keys.
	265	movdqu xmm0, [SI]
	266	movdqu [DI], xmm0
	267	cmp BLKOFF, 16
	268	jbe 0f
	269	movdqu xmm0, [SI + 16]
	270	movdqu [DI + 16], xmm0
	271
	272	// Update the loop variables and stop if we've finished.
	273	0: add DI, WHOLE(BLKOFF)
	274	sub SI, WHOLE(BLKOFF)
	275	sub NR, 1
	276	jbe 9f
	277
	278	// Do another middle round's keys...
	279	movdqu xmm0, [SI]
	280	aesimc xmm0, xmm0
	281	movdqu [DI], xmm0
	282	cmp BLKOFF, 16
	283	jbe 0b
	284	movdqu xmm0, [SI + 16]
	285	aesimc xmm0, xmm0
	286	movdqu [DI + 16], xmm0
	287	jmp 0b
	288
	289	// Finally do the first encryption round.
	290	9: movdqu xmm0, [SI]
	291	movdqu [DI], xmm0
	292	cmp BLKOFF, 16
	293	jbe 1f
	294	movdqu xmm0, [SI + 16]
	295	movdqu [DI + 16], xmm0
	296
	297	// If the block size is not exactly four words then we must end-swap
	298	// everything. We can use fancy SSE toys for this.
	299	1: cmp BLKOFF, 16
	300	je 9f
	301
	302	// Find the byte-reordering table.
	303	ldgot ecx
	304	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
	305
	306	#if NKW_NEEDS_REFRESH
	307	// Calculate the number of subkey words again. (It's a good job
	308	// we've got a fast multiplier.)
	309	mov NKW, [CTX + nr]
	310	add NKW, 1
	311	imul NKW, BLKSZ
	312	#endif
	313
	314	// End-swap the encryption keys.
	315	lea SI, [CTX + w]
	316	call endswap_block
	317
	318	// And the decryption keys.
	319	lea SI, [CTX + wi]
	320	call endswap_block
	321
	322	9: // All done.
	323	#if CPUFAM_X86
	324	pop edi
	325	pop esi
	326	pop ebx
	327	pop ebp
	328	#endif
	329	#if CPUFAM_AMD64 && ABI_WIN
	330	pop rdi
	331	pop rsi
	332	#endif
	333	ret
	334
	335	ENDFUNC
	336
	337	INTFUNC(endswap_block)
	338	// End-swap NKW words starting at SI. The end-swapping table is
	339	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
		//
		// The count is only tested after a chunk has been processed, and it
		// drops by four words each time round, so at least one chunk is
		// always swapped and a trailing partial chunk is swapped in full.
		// On exit, SI has been advanced past the last chunk processed; ECX,
		// XMM1 and the flags are clobbered. NKW itself is left alone, so
		// the caller can call again for another buffer of the same size.
	340	#if CPUFAM_AMD64 && ABI_WIN
	341	.seh_endprologue
	342	#endif
	343
	344	mov ecx, NKW // words still to do
	345	0: movdqu xmm1, [SI]
	346	pshufb xmm1, xmm5 // reorder the bytes as the table says
	347	movdqu [SI], xmm1
	348	add SI, 16
	349	sub ecx, 4 // that's four more words done
	350	ja 0b // go round again if any are left
	351
	352	ret
	353
	354	ENDFUNC
	355
	356	#undef CTX
	357	#undef BLKSZ
	358	#undef SI
	359	#undef DI
	360	#undef KSZ
	361	#undef RCON
	362	#undef LIM
	363	#undef NR
	364	#undef LRK
	365	#undef BLKOFF
	366
	367	///--------------------------------------------------------------------------
	368	/// Encrypting and decrypting blocks.
	369
	370	.macro encdec op, aes, koff
		// Expand to a function, named after the op argument, which pushes
		// one 16-byte block through the cipher. The aes argument is the
		// mnemonic of the ordinary round instruction -- the final round uses
		// the same name with `last' appended -- and koff is the offset
		// within the context of the round keys to use.
		//
		// The function takes the context pointer (K), the source block (SRC)
		// and the destination (DST), in that order. Only round counts from
		// 10 to 14 inclusive are handled: anything else branches to `bogus'.
	371	FUNC(rijndael_\op\()_x86ish_aesni)
	372
	373	#if CPUFAM_X86
	374	// Arguments come in on the stack, and need to be collected. We
	375	// don't have a shortage of registers.
		//
		// SRC and DST share a register: DST isn't fetched until the source
		// block has been loaded.
	376
	377	# define K eax
	378	# define SRC edx
	379	# define DST edx
	380	# define NR ecx
	381
	382	mov K, [esp + 4]
	383	mov SRC, [esp + 8]
	384	#endif
	385
	386	#if CPUFAM_AMD64 && ABI_SYSV
	387	// Arguments come in registers. All is good.
	388
	389	# define K rdi
	390	# define SRC rsi
	391	# define DST rdx
	392	# define NR eax
	393	#endif
	394
	395	#if CPUFAM_AMD64 && ABI_WIN
	396	// Arguments come in different registers.
	397
	398	# define K rcx
	399	# define SRC rdx
	400	# define DST r8
	401	# define NR eax
	402	.seh_endprologue
	403	#endif
	404
	405	// Find the magic endianness-swapping table.
	406	ldgot ecx
	407	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
	408
	409	// Initial setup.
	410	movdqu xmm0, [SRC]
	411	pshufb xmm0, xmm5 // end-swap the input block
	412	mov NR, [K + nr]
	413	add K, \koff // K now points at the first round key
	414
	415	// Initial whitening.
	416	movdqu xmm1, [K]
	417	add K, 16
	418	pxor xmm0, xmm1
	419	#if CPUFAM_X86
	420	mov DST, [esp + 12]
	421	#endif
	422
	423	// Dispatch to the correct code. Each label below is named for the
		// number of rounds still to be done on arrival there.
	424	cmp NR, 10
	425	je 10f
	426	jb bogus // fewer than 10 rounds
	427	cmp NR, 14
	428	je 14f
	429	ja bogus // more than 14 rounds
	430	cmp NR, 12
	431	je 12f
	432	jb 11f
	433	jmp 13f
	434
	435	.align 2
	436
	437	// 14 rounds...
	438	14: movdqu xmm1, [K]
	439	add K, 16
	440	\aes xmm0, xmm1
	441
	442	// 13 rounds...
	443	13: movdqu xmm1, [K]
	444	add K, 16
	445	\aes xmm0, xmm1
	446
	447	// 12 rounds...
	448	12: movdqu xmm1, [K]
	449	add K, 16
	450	\aes xmm0, xmm1
	451
	452	// 11 rounds...
	453	11: movdqu xmm1, [K]
	454	add K, 16
	455	\aes xmm0, xmm1
	456
	457	// 10 rounds... From here on K stays put, and the remaining round
		// keys are found at fixed offsets from it.
	458	10: movdqu xmm1, [K]
	459	\aes xmm0, xmm1
	460
	461	// 9 rounds...
	462	movdqu xmm1, [K + 16]
	463	\aes xmm0, xmm1
	464
	465	// 8 rounds...
	466	movdqu xmm1, [K + 32]
	467	\aes xmm0, xmm1
	468
	469	// 7 rounds...
	470	movdqu xmm1, [K + 48]
	471	\aes xmm0, xmm1
	472
	473	// 6 rounds...
	474	movdqu xmm1, [K + 64]
	475	\aes xmm0, xmm1
	476
	477	// 5 rounds...
	478	movdqu xmm1, [K + 80]
	479	\aes xmm0, xmm1
	480
	481	// 4 rounds...
	482	movdqu xmm1, [K + 96]
	483	\aes xmm0, xmm1
	484
	485	// 3 rounds...
	486	movdqu xmm1, [K + 112]
	487	\aes xmm0, xmm1
	488
	489	// 2 rounds...
	490	movdqu xmm1, [K + 128]
	491	\aes xmm0, xmm1
	492
	493	// Final round...
	494	movdqu xmm1, [K + 144]
	495	\aes\()last xmm0, xmm1
	496
	497	// Unpermute the output block and store it.
	498	pshufb xmm0, xmm5
	499	movdqu [DST], xmm0
	500
	501	// And we're done.
	502	ret
	503
	504	#undef K
	505	#undef SRC
	506	#undef DST
	507	#undef NR
	508
	509	ENDFUNC
	510	.endm
	511
		// One expansion for each direction: encryption runs AESENC over the
		// keys at `w', and decryption runs AESDEC over the keys at `wi'.
	512	encdec eblk, aesenc, w
	513	encdec dblk, aesdec, wi
	514
	515	///--------------------------------------------------------------------------
	516	/// Random utilities.
	517
	518	INTFUNC(bogus)
	519	// Abort the process because of a programming error. Indirecting
	520	// through this point serves several purposes: (a) by CALLing, rather
	521	// than branching to, `abort', we can save the return address, which
	522	// might at least provide a hint as to what went wrong; (b) we don't
	523	// have conditional CALLs (and they'd be big anyway); and (c) we can
	524	// write a HLT here as a backstop against `abort' being mad.
		//
		// The block functions branch here when the context's round count is
		// outside the range they handle. Control never leaves this code: if
		// `abort' does return, we just spin on the HLT below.
	525	#if CPUFAM_AMD64 && ABI_WIN
	526	.seh_endprologue
	527	#endif
	528
	529	callext F(abort)
	530	0: hlt // the backstop, if `abort' comes back
	531	jmp 0b // ... and again, if execution resumes
	532
	533	ENDFUNC
	534
	535	///--------------------------------------------------------------------------
	536	/// Data tables.
	537
	538	RODATA
	539
		// Control mask for PSHUFB, which reverses the order of the bytes
		// within each of the four 32-bit words of an SSE register. The code
		// above fetches it with MOVDQA, so it must stay 16-byte aligned.
	540	.align 16
	541	endswap_tab:
	542	.byte 3, 2, 1, 0
	543	.byte 7, 6, 5, 4
	544	.byte 11, 10, 9, 8
	545	.byte 15, 14, 13, 12
	546
	547	///----- That's all, folks --------------------------------------------------