From 160214515f6913d84e0e41253cf61281718bcd99 Mon Sep 17 00:00:00 2001 Message-Id: <160214515f6913d84e0e41253cf61281718bcd99.1716390302.git.mdw@distorted.org.uk> From: Mark Wooding Date: Thu, 26 May 2016 09:26:09 +0100 Subject: [PATCH] symm/rijndael-*.S (rijndael_setup_*): Roll up the inner loop. Organization: Straylight/Edgeware From: Mark Wooding Reduce code size by tracking position in the main key-schedule loop in a register and dispatching rather than tracking it in the program-counter. --- symm/rijndael-arm-crypto.S | 89 ++++++++++------------------------ symm/rijndael-x86ish-aesni.S | 93 +++++++++++------------------------- 2 files changed, 52 insertions(+), 130 deletions(-) diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S index b277bf78..908faaa4 100644 --- a/symm/rijndael-arm-crypto.S +++ b/symm/rijndael-arm-crypto.S @@ -106,84 +106,45 @@ FUNC(rijndael_setup_arm_crypto) sub r8, r2, r3 // minus what we've copied already veor q1, q1 // all-zero register for the key add r8, r9, r8, lsl #2 // limit of the key buffer + mov r12, #0 // position in current cycle - // Main key expansion loop. The first word of each key-length chunk - // needs special treatment. -0: ldrb r14, [r5], #1 // next round constant - ldr r6, [r9, -r3, lsl #2] + // Main key expansion loop. Dispatch according to the position in + // the cycle. +0: ldr r6, [r9, -r3, lsl #2] // word from previous cycle + cmp r12, #0 // first word of the cycle? + beq 1f + cmp r12, #4 // fourth word of the cycle? + bne 2f + cmp r3, #7 // seven or eight words of key? + bcc 2f + + // Fourth word of the cycle, seven or eight words of key. We must do + // the byte substitution. vdup.32 q0, r4 aese.8 q0, q1 // effectively, just SubBytes vmov.32 r4, d0[0] - eor r4, r14, r4, ror #8 - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f + b 2f - // The next three words are simple. + // First word of the cycle. Byte substitution, rotation, and round + // constant. +1: ldrb r14, [r5], #1 // next round constant ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f - - // (Word 2...) - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f - - // (Word 3...) - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f - - // Word 4. If the key is /more/ than 6 words long, then we must - // apply a substitution here. - cmp r3, #5 - bcc 0b - ldr r6, [r9, -r3, lsl #2] - cmp r3, #7 - bcc 1f vdup.32 q0, r4 aese.8 q0, q1 // effectively, just SubBytes vmov.32 r4, d0[0] -1: eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f - - // (Word 5...) - cmp r3, #6 - bcc 0b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 9f + eor r4, r14, r4, ror #8 - // (Word 6...) - cmp r3, #7 - bcc 0b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 + // Common ending: mix in the word from the previous cycle and store. +2: eor r4, r4, r6 str r4, [r9], #4 - cmp r9, r8 - bcs 9f - // (Word 7...) - cmp r3, #8 - bcc 0b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 + // Prepare for the next iteration. If we're done, then stop; if + // we've finished a cycle then reset the counter. + add r12, r12, #1 cmp r9, r8 bcs 9f - - // Must be done by now. + cmp r12, r3 + movcs r12, #0 b 0b // Next job is to construct the decryption keys. The keys for the diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 4196181d..5194c17d 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -88,6 +88,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON ecx // round constants table # define LIM edx // limit pointer # define LIMn edx // ... as integer offset from base +# define CYIX edi // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -123,6 +124,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON rdi // round constants table # define LIMn ecx // limit pointer # define LIM rcx // ... as integer offset from base +# define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -152,6 +154,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON rdi // round constants table # define LIMn ecx // limit pointer # define LIM rcx // ... as integer offset from base +# define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -200,6 +203,7 @@ FUNC(rijndael_setup_x86ish_aesni) lea SI, [CTX + w] mov eax, [SI + 4*KSZo - 4] // most recent key word lea LIM, [SI + 4*LIM] // limit, offset by one key expansion + xor CYIX, CYIX // start of new cycle // Main key expansion loop. The first word of each key-length chunk // needs special treatment. @@ -215,84 +219,41 @@ FUNC(rijndael_setup_x86ish_aesni) // as an immediate, so it's kind of annoying if you're not // open-coding the whole thing. It's much easier to leave that as // zero and XOR in the round constant by hand. -0: movd xmm0, eax +0: cmp CYIX, 0 // first word of the cycle? + je 1f + cmp CYIX, 4 // fourth word of the cycle? + jne 2f + cmp KSZ, 7 // and a large key? + jb 2f + + // Fourth word of the cycle, and seven or eight words of key. Do a + // byte substitution. + movd xmm0, eax + pshufd xmm0, xmm0, ROTL + aeskeygenassist xmm1, xmm0, 0 + movd eax, xmm1 + jmp 2f + + // First word of the cycle. This is the complicated piece. +1: movd xmm0, eax pshufd xmm0, xmm0, ROTR aeskeygenassist xmm1, xmm0, 0 pshufd xmm1, xmm1, ROTL movd eax, xmm1 - xor eax, [SI] xor al, [RCON] inc RCON - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // The next three words are simple... - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // (Word 2...) - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - // (Word 3...) - xor eax, [SI] + // Common tail. Mix in the corresponding word from the previous + // cycle and prepare for the next loop. +2: xor eax, [SI] mov [SI + 4*KSZo], eax add SI, 4 + inc CYIX cmp SI, LIM jae 9f - - // Word 4. If the key is /more/ than 6 words long, then we must - // apply a substitution here. - cmp KSZ, 5 + cmp CYIX, KSZ jb 0b - cmp KSZ, 7 - jb 1f - movd xmm0, eax - pshufd xmm0, xmm0, ROTL - aeskeygenassist xmm1, xmm0, 0 - movd eax, xmm1 -1: xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // (Word 5...) - cmp KSZ, 6 - jb 0b - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // (Word 6...) - cmp KSZ, 7 - jb 0b - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // (Word 7...) - cmp KSZ, 8 - jb 0b - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 9f - - // Must be done by now. + xor CYIX, CYIX jmp 0b // Next job is to construct the decryption keys. The keys for the -- [mdw]