// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
+ endprologue
+
propout [edi + 0], xmm4, xmm5
propout [edi + 4], xmm5, xmm6
propout [edi + 8], xmm6, nil
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
propout [edi + 0], xmm4, xmm5
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
// and update the carry registers with the carry out. The registers
// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
propout [edi + 0], xmm4, xmm5
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- sub esp, 64 // space for the carries
+ stalloc 48 // space for the carries
+ endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// carry pieces for later.
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- sub esp, 64 // space for the carries
+ stalloc 48 // space for the carries
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
paddq xmm6, [esp + 32]
// And, with that, we're done.
- add esp, 64
+ stfree 48
ret
ENDFUNC
// of the sum W + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
+ endprologue
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
//
// esp + 0 expanded Y (32 bytes)
// esp + 32 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 32
+ endprologue
// Prepare for the first iteration.
mov esi, [ebp + 32] // -> bv[0]
jb 1b
// All over.
// NOTE(review): this epilogue restores the saved registers with plain
// `pop`, whereas the prologues in this change were converted to `pushreg`
// and the other epilogues use `popreg` after `dropfp`.  The macro
// definitions are not visible here; if `popreg` carries unwind
// annotations, they are missing from this exit path -- confirm, and
// convert these (and any following `pop ebp`) for consistency.
-9: mov esp, ebp
+9: dropfp
pop edi
pop esi
pop ebx
// esp + 108 bv limit
// esp + 112 (gap)
// esp + 124 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 124
+ endprologue
// Establish the expanded operands.
pxor xmm7, xmm7
movd [edi + 16], xmm4
// All done.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
// esp + 12 expanded M (32 bytes)
// esp + 44 expanded Y (32 bytes)
// esp + 76 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 76
+ endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
mov edi, [ebp + 20] // -> Z = dv[0]
jmp 5b
// All over.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
.endm
// Common entry sequence for the test_* wrappers: save EBP, EBX, ESI and
// EDI, make EBP the frame pointer, then realign ESP and reserve the
// locals.  With four saved registers and the return address above EBP,
// the first stack argument is at [ebp + 20] (assuming a conventional
// stack-argument call -- confirm against the callers).
// NOTE(review): pushreg/setfp/endprologue are defined outside this view;
// presumably they wrap the replaced push/mov instructions with unwind
// annotations.
.macro testprologue
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
// Round ESP down to a 16-byte boundary ...
and esp, ~15
// ... then reserve 108 bytes, which leaves esp + 12 16-byte aligned.
sub esp, 3*32 + 12
+ endprologue
// vars:
// esp + 0 = cycles
// esp + 12 = v expanded
.endm
// Common exit sequence for the test_* wrappers: discard the locals by
// restoring ESP from the frame pointer, restore EDI, ESI, EBX and EBP in
// the reverse of the order in which testprologue saved them, and return.
.macro testepilogue
- mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+ dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
.endm
movdqu [ecx + 32], xmm6
.endm
// test_dmul4: test entry point, now bracketed by FUNC/ENDFUNC in place of
// the bare `.globl` + label.  The same argument slot, [ebp + 24], is
// handed to testldcarry on the way in and to testcarryout on the way out.
// NOTE(review): only the macro invocations are visible in this view; the
// code between them and the macro bodies should be checked in the full
// file.
- .globl test_dmul4
-test_dmul4:
+FUNC(test_dmul4)
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_dmla4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  The visible macro arguments are the same as those of
// test_dmul4: [ebp + 24] for both testldcarry and testcarryout,
// [ebp + 36]/[ebp + 40] for testexpand, [ebp + 48]/[ebp + 44] for testtail.
// NOTE(review): the instructions between the macro invocations are not
// visible in this view.
- .globl test_dmla4
-test_dmla4:
+FUNC(test_dmla4)
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_mul4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  testexpand is given `nil` as its first operand and
// [ebp + 32] as its second; [ebp + 24] is used for both testldcarry and
// testcarryout.
// NOTE(review): the instructions between the macro invocations are not
// visible in this view.
- .globl test_mul4
-test_mul4:
+FUNC(test_mul4)
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_mla4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  The visible macro arguments match those of test_mul4:
// `nil` and [ebp + 32] for testexpand, [ebp + 40]/[ebp + 36] for testtail,
// and [ebp + 24] for both testldcarry and testcarryout.
// NOTE(review): the instructions between the macro invocations are not
// visible in this view.
- .globl test_mla4
-test_mla4:
+FUNC(test_mla4)
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_mmul4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  No testldcarry invocation is visible here; the carry is
// only written out, via testcarryout [ebp + 24].
// NOTE(review): the instructions between the lines shown are not visible
// in this view.
- .globl test_mmul4
-test_mmul4:
+FUNC(test_mmul4)
testprologue
testexpand [ebp + 40], [ebp + 44]
// EDI is loaded from the first argument slot.
mov edi, [ebp + 20]
// Unaligned 16-byte store of XMM1 at offset 16 from EDI.
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_mmla4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  The visible lines match those of test_mmul4: testexpand
// takes [ebp + 40]/[ebp + 44], and the carry is written out via
// testcarryout [ebp + 24].
// NOTE(review): the instructions between the lines shown are not visible
// in this view.
- .globl test_mmla4
-test_mmla4:
+FUNC(test_mmla4)
testprologue
testexpand [ebp + 40], [ebp + 44]
// EDI is loaded from the first argument slot.
mov edi, [ebp + 20]
// Unaligned 16-byte store of XMM1 at offset 16 from EDI.
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
// test_mont4: test entry point, converted from `.globl` + label to
// FUNC/ENDFUNC.  testexpand is given `nil` as its first operand and
// [ebp + 36] as its second; the carry is written out via
// testcarryout [ebp + 24].
// NOTE(review): the instructions between the lines shown are not visible
// in this view.
- .globl test_mont4
-test_mont4:
+FUNC(test_mont4)
testprologue
testexpand nil, [ebp + 36]
// EDI is loaded from the first argument slot.
mov edi, [ebp + 20]
// Unaligned 16-byte store of XMM1 at offset 16 from EDI.
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
#endif