psrldq \c1, 8 // (0, y^*_1, 0, 0)
psrldq \c2, 4 // (0, 0, y^*_2, 0)
.ifnes "\h", "nil"
- movdqu \t, \c3
+ movdqa \t, \c3
pxor \u, \u
.endif
pslldq \c3, 12 // (0, 0, 0, y^*_3)
movd xmm1, [edi + 4] // (a_1, 0)
movd xmm2, [edi + 8] // (a_2, 0)
movd xmm7, [edi + 12] // (a_3, 0)
+
paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- stalloc 64 // space for the carries
+ stalloc 48 // space for the carries
endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- stalloc 64 // space for the carries
+ stalloc 48 // space for the carries
endprologue
movd xmm4, [edi + 0]
paddq xmm6, [esp + 32]
// And, with that, we're done.
- stfree 64
+ stfree 48
ret
ENDFUNC
INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
- // value A; EBX points to a packed operand N; ESI points to an
+ // value W; EBX points to a packed operand N; ESI points to an
// expanded operand M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary).
//