This actually slightly reduces the amount of stack needed, but I don't
quite understand why. There's a knock-on rearrangement of the stack
frame in the test wrappers and C-interface subroutines.
There's also a slightly sneaky introduction of space for a later change.
But there shouldn't be any externally observable difference.
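
For concreteness, the arithmetic behind the new allocation (this worked
example is mine, not part of the patch): under the convention described
below, ESP is 12 modulo 16 on entry to mmul4 and friends, because the
caller keeps its own frame 16-byte aligned and `call' pushes a 4-byte
return address; allocating 48 + 12 bytes therefore lands ESP back on a
16-byte boundary, as the movdqa accesses to the local frame require.

    #include <assert.h>

    int main(void)
    {
        unsigned esp = 0x7fff0000;  /* caller's ESP: 16-byte aligned (made-up value) */
        esp -= 4;                   /* `call mmul4' pushes the return address */
        assert(esp % 16 == 12);     /* the entry convention in the new comments */
        esp -= 48 + 12;             /* stalloc 48 + 12 */
        assert(esp % 16 == 0);      /* movdqa [esp + ...] is now safe */
        return 0;
    }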
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
//
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
//
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue
movd xmm4, [edi + 0]
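
As a sanity check on the two contracts above, here is the single-digit
analogue in plain C, with b = 2^32 standing in for B = 2^128. The
particular values of u, v, n, and a are made up, and this model is mine
rather than part of the patch.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t n = 0xfffffffb;    /* low digit of the modulus; must be odd */
        uint32_t u = 0x12345678, v = 0x9abcdef1, a = 0x0f0f0f0f;

        /* m = -1/n mod b, by Hensel lifting: x *= 2 - n*x doubles the
         * number of correct low bits, starting from x = n (correct mod 8). */
        uint32_t x = n;
        for (int i = 0; i < 4; i++) x *= 2 - n*x;
        uint32_t m = -x;
        assert((uint32_t)(n*m + 1) == 0);   /* n m == -1 (mod b) */

        /* mmul4: y = u v m mod b makes the low digit of u v + n y vanish,
         * leaving only the carry (XMM4-6 in the assembly). */
        uint32_t y = u*v*m;
        assert((uint32_t)(u*v + n*y) == 0);

        /* mmla4 folds in an addend a: y = (a + u v) m mod b. */
        uint32_t y2 = (a + u*v)*m;
        assert((uint32_t)(a + u*v + n*y2) == 0);

        return 0;
    }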
paddq xmm6, [esp + 32]
// And, with that, we're done.
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
- // Locals are relative to ESP, which is 4 mod 16, as follows.
+ // Locals are relative to ESP, which is 16-byte aligned, as follows.
- // esp + 0 outer loop dv
- // esp + 4 outer loop bv
- // esp + 8 av limit (mostly in ESI)
- // esp + 12 expanded V (32 bytes)
- // esp + 44 expanded M (32 bytes)
- // esp + 76 expanded Y (32 bytes)
+ // esp + 0 expanded V (32 bytes)
+ // esp + 32 expanded M (32 bytes)
+ // esp + 64 expanded Y (32 bytes)
+ // esp + 96 outer loop dv
+ // esp + 100 outer loop bv
+ // esp + 104 av limit (mostly in ESI)
// esp + 108 bv limit
- // esp + 112 (gap)
- // esp + 124 (top of locals)
+ // esp + 112 (top of locals)
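
Rendered as a C struct, the new frame groups the three vectors at the
bottom, at 16-byte-aligned offsets, and stacks the scalar slots above
them; the bv-limit slot at 108 is the one the `cmp eax, [esp + 108]'
instructions read. This sketch is for orientation only: the type and
field names are invented, and the offsets only work out on a 32-bit
target, where pointers are 4 bytes wide.

    #include <stddef.h>
    #include <stdint.h>

    struct frame {                    /* locals at ESP, 16-byte aligned */
        _Alignas(16) uint8_t v[32];   /* esp +   0: expanded V */
        uint8_t m[32];                /* esp +  32: expanded M */
        uint8_t y[32];                /* esp +  64: expanded Y */
        uint32_t *dv;                 /* esp +  96: outer loop dv */
        const uint32_t *bv;           /* esp + 100: outer loop bv */
        const uint32_t *av_limit;     /* esp + 104: av limit (mostly in ESI) */
        const uint32_t *bv_limit;     /* esp + 108: bv limit */
    };                                /* esp + 112: top of locals */

    /* The vector offsets hold on any target; the pointer slots and the
     * total size match the comment above only when sizeof(void *) == 4. */
    _Static_assert(offsetof(struct frame, m) == 32, "M is movdqa-able");
    _Static_assert(offsetof(struct frame, y) == 64, "Y is movdqa-able");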
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
endprologue
// Establish the expanded operands.
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
expand xmm7, xmm0, xmm1, xmm2, xmm3
- movdqa [esp + 12], xmm0 // bv[0] expanded low
- movdqa [esp + 28], xmm1 // bv[0] expanded high
- movdqa [esp + 44], xmm2 // mi expanded low
- movdqa [esp + 60], xmm3 // mi expanded high
+ movdqa [esp + 0], xmm0 // bv[0] expanded low
+ movdqa [esp + 16], xmm1 // bv[0] expanded high
+ movdqa [esp + 32], xmm2 // mi expanded low
+ movdqa [esp + 48], xmm3 // mi expanded high
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
+ mov [esp + 96], edi
+ mov [esp + 104], edx
- mov [esp + 8], edx
- lea ecx, [esp + 12] // -> expanded V = bv[0]
- lea esi, [esp + 44] // -> expanded M = mi
- lea edx, [esp + 76] // -> space for Y
+ lea ecx, [esp + 0] // -> expanded V = bv[0]
+ lea esi, [esp + 32] // -> expanded M = mi
+ lea edx, [esp + 64] // -> space for Y
- mov esi, [esp + 8] // recover av limit
+ mov esi, [esp + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
.p2align 4
// Complete the first inner loop.
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
-1: mov eax, [esp + 4] // -> bv[i - 1]
- mov edi, [esp + 0] // -> Z = dv[i]
+1: mov eax, [esp + 100] // -> bv[i - 1]
+ mov edi, [esp + 96] // -> Z = dv[i]
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
- movdqu xmm0, [eax] // bv[i]
- mov [esp + 4], eax
cmp eax, [esp + 108] // done yet?
jae 9f
+ movdqu xmm0, [eax] // bv[i]
mov ebx, [ebp + 32] // -> X = nv[0]
- lea esi, [esp + 44] // -> expanded M = mi
+ lea esi, [esp + 32] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
expand xmm7, xmm0, xmm1
- movdqa [esp + 12], xmm0 // bv[i] expanded low
- movdqa [esp + 28], xmm1 // bv[i] expanded high
+ movdqa [esp + 0], xmm0 // bv[i] expanded low
+ movdqa [esp + 16], xmm1 // bv[i] expanded high
- mov esi, [esp + 8] // recover av limit
+ mov esi, [esp + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
.p2align 4
// Complete the next inner loop.
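
In outline, the outer-loop bookkeeping around labels 1 and 9 looks like
the following hypothetical C rendering (the expansion and the inner
multiply-accumulate loops are elided); note that the load of bv[i] now
sits after the limit check.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch of the outer loop's control flow.  Since n is a
     * nonzero multiple of 4, bv lands exactly on bv_limit at the end. */
    void outer(const uint32_t *bv, const uint32_t *bv_limit)
    {
        for (;;) {
            bv += 4;                    /* label 1: -> bv[i] */
            if (bv >= bv_limit) return; /* "done yet?" -> label 9 */
            uint32_t v[4];
            memcpy(v, bv, sizeof v);    /* movdqu: load bv[i] after the check */
            (void)v;                    /* ... expand, then the inner loop ... */
        }
    }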
pushreg edi
setfp ebp
and esp, ~15
- // esp + 0 = cycles
- // esp + 12 = v expanded
- // esp + 44 = y expanded
- // esp + 72 = ? expanded
+ // esp + 0 = v expanded
+ // esp + 32 = y expanded
+ // esp + 64 = ? expanded
+ // esp + 96 = cycles
.endm
.macro testepilogue
mov ecx, \v
movdqu xmm0, [ecx]
expand xmm7, xmm0, xmm1
- movdqa [esp + 12], xmm0
- movdqa [esp + 28], xmm1
+ movdqa [esp + 0], xmm0
+ movdqa [esp + 16], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
expand xmm7, xmm2, xmm3
- movdqa [esp + 44], xmm2
- movdqa [esp + 60], xmm3
+ movdqa [esp + 32], xmm2
+ movdqa [esp + 48], xmm3
.p2align 4
0:
.ifnes "\u", "nil"
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
.endif
.endm
.macro testtail cyv, n
- cystore esp + 0, \cyv, \n
+ cystore esp + 96, \cyv, \n
call mmul4
testtail [ebp + 52], [ebp + 48]
mov edi, [ebp + 28]
- movdqa xmm0, [esp + 76]
- movdqa xmm1, [esp + 92]
+ movdqa xmm0, [esp + 64]
+ movdqa xmm1, [esp + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
call mmla4
testtail [ebp + 52], [ebp + 48]
mov edi, [ebp + 28]
- movdqa xmm0, [esp + 76]
- movdqa xmm1, [esp + 92]
+ movdqa xmm0, [esp + 64]
+ movdqa xmm1, [esp + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
call mont4
testtail [ebp + 44], [ebp + 40]
mov edi, [ebp + 28]
- movdqa xmm0, [esp + 76]
- movdqa xmm1, [esp + 92]
+ movdqa xmm0, [esp + 64]
+ movdqa xmm1, [esp + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
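
A final note on the wrappers' shared tail: the expanded Y is fetched
from its new home at offset 64 in the frame, which is 16-byte aligned
(hence movdqa), and stored through EDI, which need not be (hence
movdqu). The same copy in C, as a model only, with invented names:

    #include <string.h>

    /* Model of the shared epilogue: copy the 32-byte expanded Y out of the
     * frame.  frame + 64 is 16-byte aligned; dst may be unaligned. */
    void copy_y(const unsigned char *frame, unsigned char *dst)
    {
        memcpy(dst +  0, frame + 64, 16);   /* movdqa xmm0, [esp + 64] */
        memcpy(dst + 16, frame + 80, 16);   /* movdqa xmm1, [esp + 80] */
    }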