/// Main code.
.arch pentium4
- .section .text
+ .text
FUNC(salsa20_core_x86ish_sse2)
movdqu xmm2, [IN + 32]
movdqu xmm3, [IN + 48]
- ## Take a copy for later.
+ // Take a copy for later.
movdqa SAVE0, xmm0
movdqa SAVE1, xmm1
movdqa SAVE2, xmm2
movdqa SAVE3, xmm3
-loop:
+0:
// Apply a column quarterround to each of the columns simultaneously.
// Alas, there doesn't seem to be a packed doubleword rotate, so we
// have to synthesize it.
// Decrement the loop counter and see if we should go round again.
// Later processors fuse this pair into a single uop.
sub NR, 2
- ja loop
+ ja 0b
// Almost there. Firstly, the feedforward addition, and then we have
// to write out the result. Here we have to undo the permutation