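Eliminate a tiny bit of code duplication: the callers of endswap_block
each loaded NKW into ECX first, so load it inside the subroutine instead.
It's not like anyone else uses that subroutine.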
#endif
// End-swap the encryption keys.
- mov ecx, NKW
lea SI, [CTX + w]
call endswap_block
// And the decryption keys.
- mov ecx, NKW
lea SI, [CTX + wi]
call endswap_block
.align 16
endswap_block:
- // End-swap ECX words starting at SI. The end-swapping table is
+ // End-swap NKW words starting at SI. The end-swapping table is
// already loaded into XMM5; and it's OK to work in 16-byte chunks.
- movdqu xmm1, [SI]
+ mov ecx, NKW
+0: movdqu xmm1, [SI]
pshufb xmm1, xmm5
movdqu [SI], xmm1
add SI, 16
sub ecx, 4
- ja endswap_block
+ ja 0b
ret
#undef CTX