Commit | Line | Data |
---|---|---|
444083ae MW |
1 | /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*- |
2 | /// | |
3 | /// Large SIMD-based multiplications | |
4 | /// | |
5 | /// (c) 2016 Straylight/Edgeware | |
6 | ||
7 | ///----- Licensing notice --------------------------------------------------- | |
8 | /// | |
9 | /// This file is part of Catacomb. | |
10 | /// | |
11 | /// Catacomb is free software; you can redistribute it and/or modify | |
12 | /// it under the terms of the GNU Library General Public License as | |
13 | /// published by the Free Software Foundation; either version 2 of the | |
14 | /// License, or (at your option) any later version. | |
15 | /// | |
16 | /// Catacomb is distributed in the hope that it will be useful, | |
17 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | /// GNU Library General Public License for more details. | |
20 | /// | |
21 | /// You should have received a copy of the GNU Library General Public | |
22 | /// License along with Catacomb; if not, write to the Free | |
23 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
24 | /// MA 02111-1307, USA. | |
25 | ||
26 | ///-------------------------------------------------------------------------- | |
df07f2c0 | 27 | /// Preliminaries. |
444083ae MW |
28 | |
29 | #include "config.h" | |
30 | #include "asm-common.h" | |
31 | ||
444083ae | 32 | .arch pentium4 |
df07f2c0 | 33 | |
444083ae MW |
34 | .text |
35 | ||
36 | ///-------------------------------------------------------------------------- | |
37 | /// Theory. | |
38 | /// | |
39 | /// We define a number of primitive fixed-size multipliers from which we can | |
40 | /// construct more general variable-length multipliers. | |
41 | /// | |
42 | /// The basic trick is the same throughout. In an operand-scanning | |
9599917f MW |
43 | /// multiplication, the inner multiplication loop multiplies a multiple- |
44 | /// precision operand by a single-precision factor, and adds the product, | |
45 | /// appropriately shifted, to the result. A `finely integrated operand | |
46 | /// scanning' implementation of Montgomery multiplication also adds the | |
47 | /// product of a single-precision `Montgomery factor' and the modulus, | |
444083ae MW |
48 | /// calculated in the same pass. The more common `coarsely integrated |
49 | /// operand scanning' alternates main multiplication and Montgomery passes, | |
50 | /// which requires additional carry propagation. | |
51 | /// | |
52 | /// Throughout both plain-multiplication and Montgomery stages, then, one of | |
53 | /// the factors remains constant throughout the operation, so we can afford | |
54 | /// to take a little time to preprocess it. The transformation we perform is | |
55 | /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a | |
56 | /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into | |
57 | /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit | |
58 | /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE | |
59 | /// operands, as follows. | |
60 | /// | |
981a9e5d MW |
61 | /// Offset 12 8 4 0 |
62 | /// 0 v''_1 v''_0 v'_1 v'_0 | |
63 | /// 16 v''_3 v''_2 v'_3 v'_2 | |
444083ae | 64 | /// |
2aaa07f8 | 65 | /// A `pmuludq' instruction ignores the odd positions in its operands; thus, |
444083ae MW |
66 | /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting |
67 | /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can | |
68 | /// multiply such a vector by a full 32-bit scalar to produce two 48-bit | |
69 | /// results in 64-bit fields. The sixteen bits of headroom allows us to add | |
70 | /// many products together before we must deal with carrying; it also allows | |
71 | /// for some calculations to be performed on the above expanded form. | |
72 | /// | |
9599917f MW |
73 | /// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate |
74 | /// results. The registers' precise roles rotate during the computation; we | |
75 | /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two | |
76 | /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and | |
77 | /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the | |
78 | /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 + | |
79 | /// c_3 B^3. The `pmuludq' instruction acting on a scalar operand (broadcast | |
80 | /// across all lanes of its vector) and an operand in the expanded form above | |
81 | /// produces a result which can be added directly to the appropriate carry | |
82 | /// register. Following a pass of four multiplications, we perform some | |
83 | /// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b; | |
84 | /// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and | |
85 | /// cycle the carry registers around, so that c1 becomes c0, and the old | |
86 | /// (implicitly) zeroed c0 becomes c3. | |
87 | /// | |
444083ae | 88 | /// On 32-bit x86, we are register starved: the expanded operands are kept in |
9599917f MW |
89 | /// memory, typically in warm L1 cache. The packed operands are read from |
90 | /// memory into working registers XMM0--XMM3 and processed immediately. | |
91 | /// The following conventional argument names and locations are used | |
92 | /// throughout. | |
93 | /// | |
94 | /// Arg Format Location Notes | |
95 | /// | |
96 | /// U packed [EAX] | |
97 | /// X packed [EBX] In Montgomery multiplication, X = N | |
98 | /// V expanded [ECX] | |
99 | /// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M | |
100 | /// M expanded [ESI] -N^{-1} (mod B^4) | |
101 | /// N Modulus, for Montgomery multiplication | |
102 | /// A packed [EDI] Destination/accumulator | |
103 | /// C carry XMM4--XMM7 | |
104 | /// | |
105 | /// The calculation is some variant of | |
106 | /// | |
107 | /// A' + C' B^4 <- U V + X Y + A + C | |
108 | /// | |
109 | /// The low-level functions fit into a fairly traditional (finely-integrated) | |
110 | /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) | |
111 | /// (indexed by i). | |
112 | /// | |
113 | /// The variants are as follows. | |
114 | /// | |
115 | /// Function Variant Use i j | |
116 | /// | |
117 | /// mmul4 A = C = 0 Montgomery 0 0 | |
118 | /// dmul4 A = 0 Montgomery 0 + | |
119 | /// mmla4 C = 0 Montgomery + 0 | |
120 | /// dmla4 exactly as shown Montgomery + + | |
121 | /// mont4 U = V = C = 0 Montgomery any 0 | |
122 | /// | |
123 | /// mul4zc U = V = A = C = 0 Plain 0 0 | |
124 | /// mul4 U = V = A = 0 Plain 0 + | |
125 | /// mla4zc U = V = C = 0 Plain + 0 | |
126 | /// mla4 U = V = 0 Plain + + | |
127 | /// | |
128 | /// The `mmul4' and `mmla4' functions are also responsible for calculating | |
129 | /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the | |
130 | /// inner loop. | |
444083ae MW |
131 | |
132 | ///-------------------------------------------------------------------------- | |
133 | /// Macro definitions. | |
134 | ||
71ac8e5e | 135 | .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil |
444083ae MW |
136 | // Load a word r_i from R, multiply by the expanded operand [S], and |
137 | // leave the pieces of the product in registers D0, D1, D2, D3. | |
981a9e5d | 138 | movd \d0, \r // (0, 0; 0, r_i) |
444083ae | 139 | .ifnes "\d1", "nil" |
981a9e5d | 140 | movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0) |
444083ae MW |
141 | .endif |
142 | .ifnes "\d3", "nil" | |
981a9e5d | 143 | movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2) |
444083ae | 144 | .endif |
981a9e5d | 145 | pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i) |
444083ae | 146 | .ifnes "\d1", "nil" |
981a9e5d | 147 | psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) |
444083ae MW |
148 | .endif |
149 | .ifnes "\d2", "nil" | |
150 | .ifnes "\d3", "nil" | |
8e91d6e5 | 151 | movdqa \d2, \d3 // another copy of (s''_3, s''_2; s'_3, s'_2) |
444083ae | 152 | .else |
981a9e5d | 153 | movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) |
444083ae MW |
154 | .endif |
155 | .endif | |
156 | .ifnes "\d3", "nil" | |
981a9e5d | 157 | psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) |
444083ae MW |
158 | .endif |
159 | .ifnes "\d1", "nil" | |
981a9e5d | 160 | pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) |
444083ae MW |
161 | .endif |
162 | .ifnes "\d3", "nil" | |
981a9e5d | 163 | pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) |
444083ae MW |
164 | .endif |
165 | .ifnes "\d2", "nil" | |
166 | .ifnes "\d3", "nil" | |
981a9e5d | 167 | pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2) |
444083ae | 168 | .else |
2aaa07f8 | 169 | pmuludq \d2, [\s + 16] |
444083ae MW |
170 | .endif |
171 | .endif | |
981a9e5d | 172 | pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0) |
444083ae MW |
173 | .endm |
174 | ||
71ac8e5e MW |
175 | .macro accum c0, c1=nil, c2=nil, c3=nil |
176 | // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding | |
177 | // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip | |
178 | // updating that register. | |
444083ae MW |
179 | paddq \c0, xmm0 |
180 | .ifnes "\c1", "nil" | |
181 | paddq \c1, xmm1 | |
182 | .endif | |
183 | .ifnes "\c2", "nil" | |
184 | paddq \c2, xmm2 | |
185 | .endif | |
186 | .ifnes "\c3", "nil" | |
187 | paddq \c3, xmm3 | |
188 | .endif | |
189 | .endm | |
190 | ||
71ac8e5e | 191 | .macro mulacc r, s, c0, c1, c2, c3, z3p=nil |
444083ae MW |
192 | // Load a word r_i from R, multiply by the expanded operand [S], |
193 | // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' | |
194 | // then C3 notionally contains zero, but needs clearing; in practice, | |
195 | // we store the product directly rather than attempting to add. On | |
196 | // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P | |
197 | // is not `t'. | |
198 | .ifeqs "\z3p", "t" | |
199 | mulcore \r, \s, xmm0, xmm1, xmm2, \c3 | |
71ac8e5e | 200 | accum \c0, \c1, \c2 |
444083ae MW |
201 | .else |
202 | mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 | |
203 | accum \c0, \c1, \c2, \c3 | |
204 | .endif | |
205 | .endm | |
206 | ||
71ac8e5e | 207 | .macro propout d, c, cc=nil |
444083ae MW |
208 | // Calculate an output word from C, and store it in D; propagate |
209 | // carries out from C to CC in preparation for a rotation of the | |
210 | // carry registers. On completion, XMM3 is clobbered. If CC is | |
211 | // `nil', then the contribution which would have been added to it is | |
212 | // left in C. | |
981a9e5d MW |
213 | pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) |
214 | psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) | |
215 | pslldq xmm3, 2 // (0; t b) | |
216 | paddq \c, xmm3 // (c''; c' + t b) | |
444083ae MW |
217 | movd \d, \c |
218 | psrlq \c, 32 // floor(c/B) | |
219 | .ifnes "\cc", "nil" | |
220 | paddq \cc, \c // propagate up | |
221 | .endif | |
222 | .endm | |
223 | ||
224 | .macro endprop d, c, t | |
225 | // On entry, C contains a carry register. On exit, the low 32 bits | |
226 | // of the value represented in C are written to D, and the remaining | |
227 | // bits are left at the bottom of T. | |
228 | movdqa \t, \c | |
981a9e5d MW |
229 | psllq \t, 16 // (c'' b; ?) |
230 | pslldq \c, 8 // (c'; 0) | |
231 | paddq \t, \c // (c' + c'' b; ?) | |
232 | psrldq \t, 8 // (0; c' + c'' b) = (0; c) | |
444083ae | 233 | movd \d, \t |
8e91d6e5 | 234 | psrldq \t, 4 // (0; floor(c/B)) |
444083ae MW |
235 | .endm |
236 | ||
71ac8e5e | 237 | .macro expand z, a, b, c=nil, d=nil |
444083ae MW |
238 | // On entry, A and C hold packed 128-bit values, and Z is zero. On |
239 | // exit, A:B and C:D together hold the same values in expanded | |
240 | // form. If C is `nil', then only expand A to A:B. | |
981a9e5d | 241 | movdqa \b, \a // (a_3, a_2; a_1, a_0) |
444083ae | 242 | .ifnes "\c", "nil" |
981a9e5d | 243 | movdqa \d, \c // (c_3, c_2; c_1, c_0) |
444083ae | 244 | .endif |
981a9e5d MW |
245 | punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) |
246 | punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) | |
444083ae | 247 | .ifnes "\c", "nil" |
981a9e5d MW |
248 | punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) |
249 | punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) | |
444083ae | 250 | .endif |
981a9e5d MW |
251 | pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) |
252 | pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) | |
444083ae | 253 | .ifnes "\c", "nil" |
981a9e5d MW |
254 | pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) |
255 | pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) | |
444083ae MW |
256 | .endif |
257 | .endm | |
258 | ||
71ac8e5e | 259 | .macro squash c0, c1, c2, c3, t, u, lo, hi=nil |
444083ae | 260 | // On entry, C0, C1, C2, C3 are carry registers representing a value |
4b30aca5 | 261 | // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, |
444083ae | 262 | // C3, T, and U are clobbered; and the high bits of Y are stored in |
4b30aca5 | 263 | // HI, if this is not `nil'. |
444083ae MW |
264 | |
265 | // The first step is to eliminate the `double-prime' pieces -- i.e., | |
266 | // the ones offset by 16 bytes from a 32-bit boundary -- by carrying | |
267 | // them into the 32-bit-aligned pieces above and below. But before | |
268 | // we can do that, we must gather them together. | |
269 | movdqa \t, \c0 | |
270 | movdqa \u, \c1 | |
981a9e5d MW |
271 | punpcklqdq \t, \c2 // (y'_2; y'_0) |
272 | punpckhqdq \c0, \c2 // (y''_2; y''_0) | |
273 | punpcklqdq \u, \c3 // (y'_3; y'_1) | |
274 | punpckhqdq \c1, \c3 // (y''_3; y''_1) | |
444083ae MW |
275 | |
276 | // Now split the double-prime pieces. The high (up to) 48 bits will | |
277 | // go up; the low 16 bits go down. | |
278 | movdqa \c2, \c0 | |
279 | movdqa \c3, \c1 | |
280 | psllq \c2, 48 | |
281 | psllq \c3, 48 | |
981a9e5d MW |
282 | psrlq \c0, 16 // high parts of (y''_2; y''_0) |
283 | psrlq \c1, 16 // high parts of (y''_3; y''_1) | |
284 | psrlq \c2, 32 // low parts of (y''_2; y''_0) | |
285 | psrlq \c3, 32 // low parts of (y''_3; y''_1) | |
4b30aca5 MW |
286 | .ifnes "\hi", "nil" |
287 | movdqa \hi, \c1 | |
444083ae | 288 | .endif |
981a9e5d | 289 | pslldq \c1, 8 // high part of (y''_1; 0) |
444083ae MW |
290 | |
291 | paddq \t, \c2 // propagate down | |
292 | paddq \u, \c3 | |
981a9e5d MW |
293 | paddq \t, \c1 // and up: (y_2; y_0) |
294 | paddq \u, \c0 // (y_3; y_1) | |
4b30aca5 | 295 | .ifnes "\hi", "nil" |
981a9e5d | 296 | psrldq \hi, 8 // high part of (0; y''_3) |
444083ae MW |
297 | .endif |
298 | ||
299 | // Finally extract the answer. This complicated dance is better than | |
300 | // storing to memory and loading, because the piecemeal stores | |
301 | // inhibit store forwarding. | |
981a9e5d MW |
302 | movdqa \c3, \t // (?; y_0) |
303 | movdqa \lo, \t // (?, ?; ?, y^*_0) | |
304 | psrldq \t, 8 // (0; y_2) | |
8e91d6e5 MW |
305 | psrlq \c3, 32 // (?; floor(y_0/B)) |
306 | paddq \c3, \u // (?; y_1 + floor(y_0/B)) | |
981a9e5d MW |
307 | movdqa \c1, \c3 // (?, ?; ?, y^*_1) |
308 | psrldq \u, 8 // (0; y_3) | |
8e91d6e5 MW |
309 | psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2)) |
310 | paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2)) | |
981a9e5d | 311 | punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) |
8e91d6e5 MW |
312 | psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3)) |
313 | paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3)) | |
4b30aca5 | 314 | .ifnes "\hi", "nil" |
d2269179 | 315 | movdqa \t, \c3 |
444083ae MW |
316 | pxor \u, \u |
317 | .endif | |
981a9e5d | 318 | punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) |
4b30aca5 | 319 | .ifnes "\hi", "nil" |
444083ae | 320 | psrlq \t, 32 // very high bits of y |
4b30aca5 MW |
321 | paddq \hi, \t |
322 | punpcklqdq \hi, \u // carry up | |
444083ae | 323 | .endif |
4b30aca5 | 324 | punpckldq \lo, \c1 // y mod B^4 |
444083ae MW |
325 | .endm |
326 | ||
327 | .macro carryadd | |
328 | // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6 | |
329 | // hold the incoming carry registers c0, c1, and c2 representing a | |
330 | // carry-in C. | |
331 | // | |
332 | // On exit, the carry registers, including XMM7, are updated to hold | |
333 | // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other | |
334 | // registers are preserved. | |
981a9e5d MW |
335 | movd xmm0, [edi + 0] // (0; a_0) |
336 | movd xmm1, [edi + 4] // (0; a_1) | |
337 | movd xmm2, [edi + 8] // (0; a_2) | |
338 | movd xmm7, [edi + 12] // (0; a_3) | |
339 | ||
340 | paddq xmm4, xmm0 // (c''_0; c'_0 + a_0) | |
341 | paddq xmm5, xmm1 // (c''_1; c'_1 + a_1) | |
342 | paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) | |
444083ae MW |
343 | .endm |
344 | ||
345 | ///-------------------------------------------------------------------------- | |
346 | /// Primitive multipliers and related utilities. | |
347 | ||
1a517bb3 | 348 | INTFUNC(carryprop) |
444083ae MW |
349 | // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded |
350 | // form. Store the low 128 bits of the represented carry to [EDI] as | |
351 | // a packed 128-bit value, and leave the remaining 16 bits in the low | |
352 | // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. | |
0923a413 MW |
353 | endprologue |
354 | ||
444083ae MW |
355 | propout [edi + 0], xmm4, xmm5 |
356 | propout [edi + 4], xmm5, xmm6 | |
357 | propout [edi + 8], xmm6, nil | |
358 | endprop [edi + 12], xmm6, xmm4 | |
359 | ret | |
1a517bb3 MW |
360 | ENDFUNC |
361 | ||
362 | INTFUNC(dmul4) | |
444083ae MW |
363 | // On entry, EDI points to the destination buffer; EAX and EBX point |
364 | // to the packed operands U and X; ECX and EDX point to the expanded | |
365 | // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry | |
366 | // registers c0, c1, and c2; c3 is assumed to be zero. | |
367 | // | |
368 | // On exit, we write the low 128 bits of the sum C + U V + X Y to | |
369 | // [EDI], and update the carry registers with the carry out. The | |
370 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
371 | // general-purpose registers are preserved. | |
0923a413 MW |
372 | endprologue |
373 | ||
444083ae | 374 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t |
71ac8e5e | 375 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
376 | propout [edi + 0], xmm4, xmm5 |
377 | ||
378 | mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
71ac8e5e | 379 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 |
444083ae MW |
380 | propout [edi + 4], xmm5, xmm6 |
381 | ||
382 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
71ac8e5e | 383 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 |
444083ae MW |
384 | propout [edi + 8], xmm6, xmm7 |
385 | ||
386 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
71ac8e5e | 387 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 |
444083ae MW |
388 | propout [edi + 12], xmm7, xmm4 |
389 | ||
390 | ret | |
1a517bb3 MW |
391 | ENDFUNC |
392 | ||
393 | INTFUNC(dmla4) | |
444083ae MW |
394 | // On entry, EDI points to the destination buffer, which also |
395 | // contains an addend A to accumulate; EAX and EBX point to the | |
396 | // packed operands U and X; ECX and EDX point to the expanded | |
397 | // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry | |
398 | // registers c0, c1, and c2 representing a carry-in C; c3 is assumed | |
399 | // to be zero. | |
400 | // | |
401 | // On exit, we write the low 128 bits of the sum A + C + U V + X Y to | |
402 | // [EDI], and update the carry registers with the carry out. The | |
403 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
404 | // general-purpose registers are preserved. | |
0923a413 MW |
405 | endprologue |
406 | ||
444083ae MW |
407 | carryadd |
408 | ||
71ac8e5e MW |
409 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 |
410 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 | |
444083ae MW |
411 | propout [edi + 0], xmm4, xmm5 |
412 | ||
413 | mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
71ac8e5e | 414 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 |
444083ae MW |
415 | propout [edi + 4], xmm5, xmm6 |
416 | ||
417 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
71ac8e5e | 418 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 |
444083ae MW |
419 | propout [edi + 8], xmm6, xmm7 |
420 | ||
421 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
71ac8e5e | 422 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 |
444083ae MW |
423 | propout [edi + 12], xmm7, xmm4 |
424 | ||
425 | ret | |
1a517bb3 MW |
426 | ENDFUNC |
427 | ||
428 | INTFUNC(mul4zc) | |
444083ae MW |
429 | // On entry, EDI points to the destination buffer; EBX points to a |
430 | // packed operand X; and EDX points to an expanded operand Y. | |
431 | // | |
432 | // On exit, we write the low 128 bits of the product X Y to [EDI], | |
433 | // and set the carry registers XMM4, XMM5, XMM6 to the carry out. | |
434 | // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
435 | // general-purpose registers are preserved. | |
0923a413 MW |
436 | endprologue |
437 | ||
444083ae MW |
438 | mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
439 | propout [edi + 0], xmm4, xmm5 | |
440 | ||
441 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
442 | propout [edi + 4], xmm5, xmm6 | |
443 | ||
444 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
445 | propout [edi + 8], xmm6, xmm7 | |
446 | ||
447 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
448 | propout [edi + 12], xmm7, xmm4 | |
449 | ||
450 | ret | |
1a517bb3 MW |
451 | ENDFUNC |
452 | ||
453 | INTFUNC(mul4) | |
444083ae MW |
454 | // On entry, EDI points to the destination buffer; EBX points to a |
455 | // packed operand X; EDX points to an expanded operand Y; and XMM4, | |
456 | // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2, | |
457 | // representing a carry-in C; c3 is assumed to be zero. | |
458 | // | |
459 | // On exit, we write the low 128 bits of the sum C + X Y to [EDI], | |
460 | // and update the carry registers with the carry out. The registers | |
461 | // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
462 | // general-purpose registers are preserved. | |
0923a413 MW |
463 | endprologue |
464 | ||
444083ae MW |
465 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t |
466 | propout [edi + 0], xmm4, xmm5 | |
467 | ||
468 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
469 | propout [edi + 4], xmm5, xmm6 | |
470 | ||
471 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
472 | propout [edi + 8], xmm6, xmm7 | |
473 | ||
474 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
475 | propout [edi + 12], xmm7, xmm4 | |
476 | ||
477 | ret | |
1a517bb3 MW |
478 | ENDFUNC |
479 | ||
480 | INTFUNC(mla4zc) | |
444083ae MW |
481 | // On entry, EDI points to the destination buffer, which also |
482 | // contains an addend A to accumulate; EBX points to a packed operand | |
483 | // X; and EDX points to an expanded operand Y. | |
484 | // | |
485 | // On exit, we write the low 128 bits of the sum A + X Y to [EDI], | |
486 | // and set the carry registers XMM4, XMM5, XMM6 to the carry out. | |
487 | // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
488 | // general-purpose registers are preserved. | |
0923a413 MW |
489 | endprologue |
490 | ||
444083ae MW |
491 | movd xmm4, [edi + 0] |
492 | movd xmm5, [edi + 4] | |
493 | movd xmm6, [edi + 8] | |
494 | movd xmm7, [edi + 12] | |
495 | ||
71ac8e5e | 496 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
497 | propout [edi + 0], xmm4, xmm5 |
498 | ||
499 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
500 | propout [edi + 4], xmm5, xmm6 | |
501 | ||
502 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
503 | propout [edi + 8], xmm6, xmm7 | |
504 | ||
505 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
506 | propout [edi + 12], xmm7, xmm4 | |
507 | ||
508 | ret | |
1a517bb3 MW |
509 | ENDFUNC |
510 | ||
511 | INTFUNC(mla4) | |
444083ae MW |
512 | // On entry, EDI points to the destination buffer, which also |
513 | // contains an addend A to accumulate; EBX points to a packed operand | |
514 | // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold | |
515 | // the incoming carry registers c0, c1, and c2, representing a | |
516 | // carry-in C; c3 is assumed to be zero. | |
517 | // | |
518 | // On exit, we write the low 128 bits of the sum A + C + X Y to | |
519 | // [EDI], and update the carry registers with the carry out. The | |
520 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
521 | // general-purpose registers are preserved. | |
0923a413 MW |
522 | endprologue |
523 | ||
444083ae MW |
524 | carryadd |
525 | ||
71ac8e5e | 526 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
527 | propout [edi + 0], xmm4, xmm5 |
528 | ||
529 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
530 | propout [edi + 4], xmm5, xmm6 | |
531 | ||
532 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
533 | propout [edi + 8], xmm6, xmm7 | |
534 | ||
535 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
536 | propout [edi + 12], xmm7, xmm4 | |
537 | ||
538 | ret | |
1a517bb3 MW |
539 | ENDFUNC |
540 | ||
541 | INTFUNC(mmul4) | |
444083ae MW |
542 | // On entry, EDI points to the destination buffer; EAX and EBX point |
543 | // to the packed operands U and N; ECX and ESI point to the expanded | |
544 | // operands V and M; and EDX points to a place to store an expanded | |
545 | // result Y (32 bytes, at a 16-byte boundary). The stack pointer | |
6ecc0b8f | 546 | // must be 12 modulo 16, as is usual for modern x86 ABIs. |
444083ae MW |
547 | // |
548 | // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits | |
549 | // of the sum U V + N Y to [EDI], leaving the remaining carry in | |
550 | // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and | |
551 | // XMM7 are clobbered; the general-purpose registers are preserved. | |
6ecc0b8f | 552 | stalloc 48 + 12 // space for the carries |
0923a413 | 553 | endprologue |
444083ae MW |
554 | |
555 | // Calculate W = U V, and leave it in the destination. Stash the | |
556 | // carry pieces for later. | |
557 | mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 | |
558 | propout [edi + 0], xmm4, xmm5 | |
559 | jmp 5f | |
1a517bb3 MW |
560 | ENDFUNC |
561 | ||
562 | INTFUNC(mmla4) | |
444083ae | 563 | // On entry, EDI points to the destination buffer, which also |
14e7b1f5 MW |
564 | // contains an addend A to accumulate; EAX and EBX point to the |
565 | // packed operands U and N; ECX and ESI point to the expanded | |
444083ae MW |
566 | // operands V and M; and EDX points to a place to store an expanded |
567 | // result Y (32 bytes, at a 16-byte boundary). The stack pointer | |
6ecc0b8f | 568 | // must be 12 modulo 16, as is usual for modern x86 ABIs. |
444083ae MW |
569 | // |
570 | // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128 | |
571 | // bits of the sum A + U V + N Y to [EDI], leaving the remaining | |
572 | // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, | |
573 | // XMM3, and XMM7 are clobbered; the general-purpose registers are | |
574 | // preserved. | |
6ecc0b8f | 575 | stalloc 48 + 12 // space for the carries |
0923a413 MW |
576 | endprologue |
577 | ||
444083ae MW |
578 | movd xmm4, [edi + 0] |
579 | movd xmm5, [edi + 4] | |
580 | movd xmm6, [edi + 8] | |
581 | movd xmm7, [edi + 12] | |
ba12677b MW |
582 | |
583 | // Calculate W = A + U V, and leave it in the destination. Stash the | |
584 | // carry pieces for later. | |
71ac8e5e | 585 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
586 | propout [edi + 0], xmm4, xmm5 |
587 | ||
588 | 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
589 | propout [edi + 4], xmm5, xmm6 | |
590 | ||
591 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
592 | propout [edi + 8], xmm6, xmm7 | |
593 | ||
594 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
595 | propout [edi + 12], xmm7, xmm4 | |
596 | ||
a90d420c MW |
597 | movdqa [SP + 0], xmm4 |
598 | movdqa [SP + 16], xmm5 | |
599 | movdqa [SP + 32], xmm6 | |
444083ae MW |
600 | |
601 | // Calculate Y = W M. | |
602 | mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 | |
603 | ||
71ac8e5e MW |
604 | mulcore [edi + 4], esi, xmm0, xmm1, xmm2 |
605 | accum xmm5, xmm6, xmm7 | |
444083ae | 606 | |
71ac8e5e MW |
607 | mulcore [edi + 8], esi, xmm0, xmm1 |
608 | accum xmm6, xmm7 | |
444083ae | 609 | |
71ac8e5e MW |
610 | mulcore [edi + 12], esi, xmm0 |
611 | accum xmm7 | |
444083ae MW |
612 | |
613 | // That's lots of pieces. Now we have to assemble the answer. | |
71ac8e5e | 614 | squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 |
444083ae MW |
615 | |
616 | // Expand it. | |
617 | pxor xmm2, xmm2 | |
71ac8e5e | 618 | expand xmm2, xmm4, xmm1 |
444083ae MW |
619 | movdqa [edx + 0], xmm4 |
620 | movdqa [edx + 16], xmm1 | |
621 | ||
622 | // Initialize the carry from the value for W we calculated earlier. | |
623 | movd xmm4, [edi + 0] | |
624 | movd xmm5, [edi + 4] | |
625 | movd xmm6, [edi + 8] | |
626 | movd xmm7, [edi + 12] | |
627 | ||
628 | // Finish the calculation by adding the Montgomery product. | |
71ac8e5e | 629 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
630 | propout [edi + 0], xmm4, xmm5 |
631 | ||
632 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
633 | propout [edi + 4], xmm5, xmm6 | |
634 | ||
635 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
636 | propout [edi + 8], xmm6, xmm7 | |
637 | ||
638 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
639 | propout [edi + 12], xmm7, xmm4 | |
640 | ||
641 | // Add on the carry we calculated earlier. | |
a90d420c MW |
642 | paddq xmm4, [SP + 0] |
643 | paddq xmm5, [SP + 16] | |
644 | paddq xmm6, [SP + 32] | |
444083ae MW |
645 | |
646 | // And, with that, we're done. | |
6ecc0b8f | 647 | stfree 48 + 12 |
444083ae | 648 | ret |
1a517bb3 MW |
649 | ENDFUNC |
650 | ||
651 | INTFUNC(mont4) | |
444083ae | 652 | // On entry, EDI points to the destination buffer holding a packed |
8e5386aa | 653 | // value W; EBX points to a packed operand N; ESI points to an |
444083ae MW |
654 | // expanded operand M; and EDX points to a place to store an expanded |
655 | // result Y (32 bytes, at a 16-byte boundary). | |
656 | // | |
657 | // On exit, we write Y = W M mod B to [EDX], and the low 128 bits | |
658 | // of the sum W + N Y to [EDI], leaving the remaining carry in | |
659 | // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and | |
660 | // XMM7 are clobbered; the general-purpose registers are preserved. | |
0923a413 | 661 | endprologue |
444083ae MW |
662 | |
663 | // Calculate Y = W M. | |
664 | mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 | |
665 | ||
71ac8e5e MW |
666 | mulcore [edi + 4], esi, xmm0, xmm1, xmm2 |
667 | accum xmm5, xmm6, xmm7 | |
444083ae | 668 | |
71ac8e5e MW |
669 | mulcore [edi + 8], esi, xmm0, xmm1 |
670 | accum xmm6, xmm7 | |
444083ae | 671 | |
71ac8e5e MW |
672 | mulcore [edi + 12], esi, xmm0 |
673 | accum xmm7 | |
444083ae MW |
674 | |
675 | // That's lots of pieces. Now we have to assemble the answer. | |
71ac8e5e | 676 | squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 |
444083ae MW |
677 | |
678 | // Expand it. | |
679 | pxor xmm2, xmm2 | |
71ac8e5e | 680 | expand xmm2, xmm4, xmm1 |
444083ae MW |
681 | movdqa [edx + 0], xmm4 |
682 | movdqa [edx + 16], xmm1 | |
683 | ||
684 | // Initialize the carry from W. | |
685 | movd xmm4, [edi + 0] | |
686 | movd xmm5, [edi + 4] | |
687 | movd xmm6, [edi + 8] | |
688 | movd xmm7, [edi + 12] | |
689 | ||
690 | // Finish the calculation by adding the Montgomery product. | |
71ac8e5e | 691 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
692 | propout [edi + 0], xmm4, xmm5 |
693 | ||
694 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
695 | propout [edi + 4], xmm5, xmm6 | |
696 | ||
697 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
698 | propout [edi + 8], xmm6, xmm7 | |
699 | ||
700 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
701 | propout [edi + 12], xmm7, xmm4 | |
702 | ||
703 | // And, with that, we're done. | |
704 | ret | |
1a517bb3 MW |
705 | ENDFUNC |
706 | ||
444083ae MW |
707 | ///-------------------------------------------------------------------------- |
708 | /// Bulk multipliers. | |
709 | ||
b9b279b4 MW |
710 | FUNC(mpx_umul4_x86_avx) |
711 | .arch .avx | |
712 | vzeroupper | |
713 | endprologue | |
714 | // and drop through... | |
715 | .arch pentium4 | |
716 | ENDFUNC | |
717 | ||
444083ae MW |
718 | FUNC(mpx_umul4_x86_sse2) |
719 | // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, | |
720 | // const mpw *bv, const mpw *bvl); | |
721 | ||
a90d420c | 722 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
723 | // follows. |
724 | // | |
a90d420c MW |
725 | // BP + 20 dv |
726 | // BP + 24 av | |
727 | // BP + 28 avl | |
728 | // BP + 32 bv | |
729 | // BP + 36 bvl | |
444083ae | 730 | // |
a90d420c | 731 | // Locals are relative to SP, as follows. |
444083ae | 732 | // |
a90d420c MW |
733 | // SP + 0 expanded Y (32 bytes) |
734 | // SP + 32 (top of locals) | |
735 | pushreg BP | |
0923a413 MW |
736 | pushreg ebx |
737 | pushreg esi | |
738 | pushreg edi | |
42c44b27 | 739 | setfp |
6d2bd7f1 | 740 | stalloc 32 |
a90d420c | 741 | and SP, ~15 |
0923a413 | 742 | endprologue |
444083ae MW |
743 | |
744 | // Prepare for the first iteration. | |
a90d420c | 745 | mov esi, [BP + 32] // -> bv[0] |
444083ae MW |
746 | pxor xmm7, xmm7 |
747 | movdqu xmm0, [esi] // bv[0] | |
a90d420c | 748 | mov edi, [BP + 20] // -> dv[0] |
444083ae | 749 | mov ecx, edi // outer loop dv cursor |
71ac8e5e | 750 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
751 | mov ebx, [BP + 24] // -> av[0] |
752 | mov eax, [BP + 28] // -> av[m] = av limit | |
753 | mov edx, SP // -> expanded Y = bv[0] | |
754 | movdqa [SP + 0], xmm0 // bv[0] expanded low | |
755 | movdqa [SP + 16], xmm1 // bv[0] expanded high | |
444083ae MW |
756 | call mul4zc |
757 | add ebx, 16 | |
758 | add edi, 16 | |
759 | add ecx, 16 | |
760 | add esi, 16 | |
761 | cmp ebx, eax // all done? | |
762 | jae 8f | |
763 | ||
764 | .p2align 4 | |
765 | // Continue with the first iteration. | |
766 | 0: call mul4 | |
767 | add ebx, 16 | |
768 | add edi, 16 | |
769 | cmp ebx, eax // all done? | |
770 | jb 0b | |
771 | ||
772 | // Write out the leftover carry. There can be no tail here. | |
773 | 8: call carryprop | |
a90d420c | 774 | cmp esi, [BP + 36] // more passes to do? |
444083ae MW |
775 | jae 9f |
776 | ||
777 | .p2align 4 | |
778 | // Set up for the next pass. | |
779 | 1: movdqu xmm0, [esi] // bv[i] | |
780 | mov edi, ecx // -> dv[i] | |
781 | pxor xmm7, xmm7 | |
71ac8e5e | 782 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
783 | mov ebx, [BP + 24] // -> av[0] |
784 | movdqa [SP + 0], xmm0 // bv[i] expanded low | |
785 | movdqa [SP + 16], xmm1 // bv[i] expanded high | |
444083ae MW |
786 | call mla4zc |
787 | add edi, 16 | |
788 | add ebx, 16 | |
789 | add ecx, 16 | |
790 | add esi, 16 | |
791 | cmp ebx, eax // done yet? | |
792 | jae 8f | |
793 | ||
794 | .p2align 4 | |
795 | // Continue... | |
796 | 0: call mla4 | |
797 | add ebx, 16 | |
798 | add edi, 16 | |
799 | cmp ebx, eax | |
800 | jb 0b | |
801 | ||
802 | // Finish off this pass. There was no tail on the previous pass, and | |
803 | // there can be none on this pass. | |
804 | 8: call carryprop | |
a90d420c | 805 | cmp esi, [BP + 36] |
444083ae MW |
806 | jb 1b |
807 | ||
808 | // All over. | |
0923a413 | 809 | 9: dropfp |
444083ae MW |
810 | pop edi |
811 | pop esi | |
812 | pop ebx | |
a90d420c | 813 | pop BP |
444083ae | 814 | ret |
444083ae MW |
815 | ENDFUNC |
816 | ||
b9b279b4 MW |
817 | FUNC(mpxmont_mul4_x86_avx) |
818 | .arch .avx | |
819 | vzeroupper | |
820 | endprologue | |
821 | // and drop through... | |
822 | .arch pentium4 | |
823 | ENDFUNC | |
824 | ||
444083ae MW |
825 | FUNC(mpxmont_mul4_x86_sse2) |
826 | // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, | |
827 | // const mpw *nv, size_t n, const mpw *mi); | |
828 | ||
a90d420c | 829 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
830 | // follows. |
831 | // | |
a90d420c MW |
832 | // BP + 20 dv |
833 | // BP + 24 av | |
834 | // BP + 28 bv | |
835 | // BP + 32 nv | |
836 | // BP + 36 n (nonzero multiple of 4) | |
837 | // BP + 40 mi | |
444083ae | 838 | // |
a90d420c | 839 | // Locals are relative to SP, which 16-byte aligned, as follows. |
444083ae | 840 | // |
a90d420c MW |
841 | // SP + 0 expanded V (32 bytes) |
842 | // SP + 32 expanded M (32 bytes) | |
843 | // SP + 64 expanded Y (32 bytes) | |
844 | // SP + 96 outer loop dv | |
845 | // SP + 100 outer loop bv | |
846 | // SP + 104 av limit (mostly in ESI) | |
847 | // SP + 108 bv limit | |
848 | // SP + 112 (top of locals) | |
849 | pushreg BP | |
0923a413 MW |
850 | pushreg ebx |
851 | pushreg esi | |
852 | pushreg edi | |
42c44b27 | 853 | setfp |
6d2bd7f1 | 854 | stalloc 112 |
a90d420c | 855 | and SP, ~15 |
0923a413 | 856 | endprologue |
444083ae MW |
857 | |
858 | // Establish the expanded operands. | |
859 | pxor xmm7, xmm7 | |
a90d420c MW |
860 | mov ecx, [BP + 28] // -> bv |
861 | mov edx, [BP + 40] // -> mi | |
444083ae MW |
862 | movdqu xmm0, [ecx] // bv[0] |
863 | movdqu xmm2, [edx] // mi | |
71ac8e5e | 864 | expand xmm7, xmm0, xmm1, xmm2, xmm3 |
a90d420c MW |
865 | movdqa [SP + 0], xmm0 // bv[0] expanded low |
866 | movdqa [SP + 16], xmm1 // bv[0] expanded high | |
867 | movdqa [SP + 32], xmm2 // mi expanded low | |
868 | movdqa [SP + 48], xmm3 // mi expanded high | |
444083ae MW |
869 | |
870 | // Set up the outer loop state and prepare for the first iteration. | |
a90d420c MW |
871 | mov edx, [BP + 36] // n |
872 | mov eax, [BP + 24] // -> U = av[0] | |
873 | mov ebx, [BP + 32] // -> X = nv[0] | |
874 | mov edi, [BP + 20] // -> Z = dv[0] | |
875 | mov [SP + 100], ecx | |
444083ae MW |
876 | lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit |
877 | lea edx, [eax + 4*edx] // -> av[n/4] = av limit | |
a90d420c MW |
878 | mov [SP + 96], edi |
879 | mov [SP + 104], edx | |
880 | mov [SP + 108], ecx | |
881 | lea ecx, [SP + 0] // -> expanded V = bv[0] | |
882 | lea esi, [SP + 32] // -> expanded M = mi | |
883 | lea edx, [SP + 64] // -> space for Y | |
444083ae | 884 | call mmul4 |
a90d420c | 885 | mov esi, [SP + 104] // recover av limit |
444083ae MW |
886 | add edi, 16 |
887 | add eax, 16 | |
888 | add ebx, 16 | |
889 | cmp eax, esi // done already? | |
890 | jae 8f | |
a90d420c | 891 | mov [SP + 96], edi |
444083ae MW |
892 | |
893 | .p2align 4 | |
894 | // Complete the first inner loop. | |
895 | 0: call dmul4 | |
896 | add edi, 16 | |
897 | add eax, 16 | |
898 | add ebx, 16 | |
899 | cmp eax, esi // done yet? | |
900 | jb 0b | |
901 | ||
902 | // Still have carries left to propagate. | |
903 | call carryprop | |
904 | movd [edi + 16], xmm4 | |
905 | ||
906 | .p2align 4 | |
907 | // Embark on the next iteration. (There must be one. If n = 1, then | |
908 | // we would have bailed above, to label 8. Similarly, the subsequent | |
909 | // iterations can fall into the inner loop immediately.) | |
a90d420c MW |
910 | 1: mov eax, [SP + 100] // -> bv[i - 1] |
911 | mov edi, [SP + 96] // -> Z = dv[i] | |
444083ae MW |
912 | add eax, 16 // -> bv[i] |
913 | pxor xmm7, xmm7 | |
a90d420c MW |
914 | mov [SP + 100], eax |
915 | cmp eax, [SP + 108] // done yet? | |
444083ae | 916 | jae 9f |
6ecc0b8f | 917 | movdqu xmm0, [eax] // bv[i] |
a90d420c MW |
918 | mov ebx, [BP + 32] // -> X = nv[0] |
919 | lea esi, [SP + 32] // -> expanded M = mi | |
920 | mov eax, [BP + 24] // -> U = av[0] | |
71ac8e5e | 921 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
922 | movdqa [SP + 0], xmm0 // bv[i] expanded low |
923 | movdqa [SP + 16], xmm1 // bv[i] expanded high | |
444083ae | 924 | call mmla4 |
a90d420c | 925 | mov esi, [SP + 104] // recover av limit |
444083ae MW |
926 | add edi, 16 |
927 | add eax, 16 | |
928 | add ebx, 16 | |
a90d420c | 929 | mov [SP + 96], edi |
444083ae MW |
930 | |
931 | .p2align 4 | |
932 | // Complete the next inner loop. | |
933 | 0: call dmla4 | |
934 | add edi, 16 | |
935 | add eax, 16 | |
936 | add ebx, 16 | |
937 | cmp eax, esi | |
938 | jb 0b | |
939 | ||
940 | // Still have carries left to propagate, and they overlap the | |
941 | // previous iteration's final tail, so read that in and add it. | |
942 | movd xmm0, [edi] | |
943 | paddq xmm4, xmm0 | |
944 | call carryprop | |
945 | movd [edi + 16], xmm4 | |
946 | ||
947 | // Back again. | |
948 | jmp 1b | |
949 | ||
950 | // First iteration was short. Write out the carries and we're done. | |
951 | // (This could be folded into the main loop structure, but that would | |
952 | // penalize small numbers more.) | |
953 | 8: call carryprop | |
954 | movd [edi + 16], xmm4 | |
955 | ||
956 | // All done. | |
0923a413 MW |
957 | 9: dropfp |
958 | popreg edi | |
959 | popreg esi | |
960 | popreg ebx | |
a90d420c | 961 | popreg BP |
444083ae | 962 | ret |
444083ae MW |
963 | ENDFUNC |
964 | ||
b9b279b4 MW |
965 | FUNC(mpxmont_redc4_x86_avx) |
966 | .arch .avx | |
967 | vzeroupper | |
968 | endprologue | |
969 | // and drop through... | |
970 | .arch pentium4 | |
971 | ENDFUNC | |
972 | ||
444083ae MW |
973 | FUNC(mpxmont_redc4_x86_sse2) |
974 | // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, | |
975 | // size_t n, const mpw *mi); | |
976 | ||
a90d420c | 977 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
978 | // follows. |
979 | // | |
a90d420c MW |
980 | // BP + 20 dv |
981 | // BP + 24 dvl | |
982 | // BP + 28 nv | |
983 | // BP + 32 n (nonzero multiple of 4) | |
984 | // BP + 36 mi | |
444083ae | 985 | // |
a90d420c | 986 | // Locals are relative to SP, as follows. |
444083ae | 987 | // |
a90d420c MW |
988 | // SP + 0 outer loop dv |
989 | // SP + 4 outer dv limit | |
990 | // SP + 8 blocks-of-4 dv limit | |
991 | // SP + 12 expanded M (32 bytes) | |
992 | // SP + 44 expanded Y (32 bytes) | |
993 | // SP + 76 (top of locals) | |
994 | pushreg BP | |
0923a413 MW |
995 | pushreg ebx |
996 | pushreg esi | |
997 | pushreg edi | |
42c44b27 | 998 | setfp |
a90d420c | 999 | and SP, ~15 |
6d2bd7f1 | 1000 | stalloc 76 |
0923a413 | 1001 | endprologue |
444083ae MW |
1002 | |
1003 | // Establish the expanded operands and the blocks-of-4 dv limit. | |
a90d420c | 1004 | mov edi, [BP + 20] // -> Z = dv[0] |
444083ae | 1005 | pxor xmm7, xmm7 |
a90d420c | 1006 | mov eax, [BP + 24] // -> dv[n] = dv limit |
444083ae | 1007 | sub eax, edi // length of dv in bytes |
a90d420c | 1008 | mov edx, [BP + 36] // -> mi |
444083ae MW |
1009 | movdqu xmm0, [edx] // mi |
1010 | and eax, ~15 // mask off the tail end | |
71ac8e5e | 1011 | expand xmm7, xmm0, xmm1 |
444083ae | 1012 | add eax, edi // find limit |
a90d420c MW |
1013 | movdqa [SP + 12], xmm0 // mi expanded low |
1014 | movdqa [SP + 28], xmm1 // mi expanded high | |
1015 | mov [SP + 8], eax | |
444083ae MW |
1016 | |
1017 | // Set up the outer loop state and prepare for the first iteration. | |
a90d420c MW |
1018 | mov ecx, [BP + 32] // n |
1019 | mov ebx, [BP + 28] // -> X = nv[0] | |
444083ae MW |
1020 | lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit |
1021 | lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit | |
a90d420c MW |
1022 | mov [SP + 0], edi |
1023 | mov [SP + 4], edx | |
1024 | lea esi, [SP + 12] // -> expanded M = mi | |
1025 | lea edx, [SP + 44] // -> space for Y | |
444083ae | 1026 | call mont4 |
444083ae | 1027 | add ebx, 16 |
a87d6f26 | 1028 | add edi, 16 |
444083ae MW |
1029 | cmp ebx, ecx // done already? |
1030 | jae 8f | |
1031 | ||
1032 | .p2align 4 | |
1033 | // Complete the first inner loop. | |
1034 | 5: call mla4 | |
1035 | add ebx, 16 | |
1036 | add edi, 16 | |
1037 | cmp ebx, ecx // done yet? | |
1038 | jb 5b | |
1039 | ||
1040 | // Still have carries left to propagate. | |
1041 | 8: carryadd | |
a90d420c MW |
1042 | mov esi, [SP + 8] // -> dv blocks limit |
1043 | mov edx, [BP + 24] // dv limit | |
444083ae MW |
1044 | psllq xmm7, 16 |
1045 | pslldq xmm7, 8 | |
1046 | paddq xmm6, xmm7 | |
1047 | call carryprop | |
1048 | movd eax, xmm4 | |
1049 | add edi, 16 | |
1050 | cmp edi, esi | |
1051 | jae 7f | |
1052 | ||
1053 | .p2align 4 | |
1054 | // Continue carry propagation until the end of the buffer. | |
1055 | 0: add [edi], eax | |
1056 | mov eax, 0 // preserves flags | |
bd6d65e3 MW |
1057 | adc dword ptr [edi + 4], 0 |
1058 | adc dword ptr [edi + 8], 0 | |
1059 | adc dword ptr [edi + 12], 0 | |
444083ae MW |
1060 | adc eax, 0 |
1061 | add edi, 16 | |
1062 | cmp edi, esi | |
1063 | jb 0b | |
1064 | ||
6966e7a6 MW |
1065 | // Deal with the tail end. Note that the actual destination length |
1066 | // won't be an exact number of blocks of four, so it's safe to just | |
1067 | // drop through here. | |
444083ae | 1068 | 7: add [edi], eax |
92edc356 | 1069 | mov eax, 0 |
444083ae MW |
1070 | add edi, 4 |
1071 | adc eax, 0 | |
1072 | cmp edi, edx | |
1073 | jb 7b | |
1074 | ||
6966e7a6 | 1075 | // All done for this iteration. Start the next. |
a90d420c MW |
1076 | 8: mov edi, [SP + 0] // -> dv[i - 1] |
1077 | mov ebx, [BP + 28] // -> X = nv[0] | |
1078 | lea edx, [SP + 44] // -> space for Y | |
1079 | lea esi, [SP + 12] // -> expanded M = mi | |
444083ae | 1080 | add edi, 16 // -> Z = dv[i] |
a90d420c | 1081 | cmp edi, [SP + 4] // all done yet? |
444083ae | 1082 | jae 9f |
a90d420c | 1083 | mov [SP + 0], edi |
444083ae MW |
1084 | call mont4 |
1085 | add edi, 16 | |
1086 | add ebx, 16 | |
1087 | jmp 5b | |
1088 | ||
1089 | // All over. | |
0923a413 MW |
1090 | 9: dropfp |
1091 | popreg edi | |
1092 | popreg esi | |
1093 | popreg ebx | |
a90d420c | 1094 | popreg BP |
444083ae | 1095 | ret |
444083ae MW |
1096 | ENDFUNC |
1097 | ||
1098 | ///-------------------------------------------------------------------------- | |
1099 | /// Testing and performance measurement. | |
1100 | ||
1101 | #ifdef TEST_MUL4 | |
1102 | ||
1103 | .macro cysetup c | |
1104 | rdtsc | |
1105 | mov [\c], eax | |
1106 | mov [\c + 4], edx | |
1107 | .endm | |
1108 | ||
1109 | .macro cystore c, v, n | |
1110 | rdtsc | |
1111 | sub eax, [\c] | |
1112 | sbb edx, [\c + 4] | |
1113 | mov ebx, [\v] | |
1114 | mov ecx, [\n] | |
1115 | dec ecx | |
1116 | mov [\n], ecx | |
1117 | mov [ebx + ecx*8], eax | |
1118 | mov [ebx + ecx*8 + 4], edx | |
1119 | .endm | |
1120 | ||
6d19758a | 1121 | .macro testprologue n |
a90d420c | 1122 | pushreg BP |
0923a413 MW |
1123 | pushreg ebx |
1124 | pushreg esi | |
1125 | pushreg edi | |
42c44b27 | 1126 | setfp |
6d2bd7f1 | 1127 | stalloc 3*32 + 4*4 |
a90d420c | 1128 | and SP, ~15 |
0923a413 | 1129 | endprologue |
6d19758a | 1130 | mov eax, \n |
a90d420c | 1131 | mov [SP + 104], eax |
444083ae | 1132 | // vars: |
a90d420c MW |
1133 | // SP + 0 = v expanded |
1134 | // SP + 32 = y expanded | |
1135 | // SP + 64 = ? expanded | |
1136 | // SP + 96 = cycles | |
1137 | // SP + 104 = count | |
444083ae MW |
1138 | .endm |
1139 | ||
1140 | .macro testepilogue | |
0923a413 MW |
1141 | dropfp |
1142 | popreg edi | |
1143 | popreg esi | |
1144 | popreg ebx | |
a90d420c | 1145 | popreg BP |
444083ae MW |
1146 | ret |
1147 | .endm | |
1148 | ||
1149 | .macro testldcarry c | |
1150 | mov ecx, \c // -> c | |
981a9e5d MW |
1151 | movdqu xmm4, [ecx + 0] // (c''_0; c'_0) |
1152 | movdqu xmm5, [ecx + 16] // (c''_1; c'_1) | |
1153 | movdqu xmm6, [ecx + 32] // (c''_2; c'_2) | |
444083ae MW |
1154 | .endm |
1155 | ||
71ac8e5e | 1156 | .macro testexpand v=nil, y=nil |
444083ae MW |
1157 | pxor xmm7, xmm7 |
1158 | .ifnes "\v", "nil" | |
1159 | mov ecx, \v | |
1160 | movdqu xmm0, [ecx] | |
71ac8e5e | 1161 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
1162 | movdqa [SP + 0], xmm0 |
1163 | movdqa [SP + 16], xmm1 | |
444083ae MW |
1164 | .endif |
1165 | .ifnes "\y", "nil" | |
1166 | mov edx, \y | |
1167 | movdqu xmm2, [edx] | |
71ac8e5e | 1168 | expand xmm7, xmm2, xmm3 |
a90d420c MW |
1169 | movdqa [SP + 32], xmm2 |
1170 | movdqa [SP + 48], xmm3 | |
444083ae MW |
1171 | .endif |
1172 | .endm | |
1173 | ||
71ac8e5e | 1174 | .macro testtop u=nil, x=nil, mode=nil |
444083ae MW |
1175 | .p2align 4 |
1176 | 0: | |
1177 | .ifnes "\u", "nil" | |
a90d420c | 1178 | lea ecx, [SP + 0] |
444083ae MW |
1179 | .endif |
1180 | mov ebx, \x | |
1181 | .ifeqs "\mode", "mont" | |
a90d420c | 1182 | lea esi, [SP + 32] |
444083ae | 1183 | .endif |
a90d420c | 1184 | cysetup SP + 96 |
444083ae MW |
1185 | .ifnes "\u", "nil" |
1186 | mov eax, \u | |
1187 | .endif | |
1188 | .ifeqs "\mode", "mont" | |
a90d420c | 1189 | lea edx, [SP + 64] |
444083ae | 1190 | .else |
a90d420c | 1191 | lea edx, [SP + 32] |
444083ae MW |
1192 | .endif |
1193 | .endm | |
1194 | ||
6d19758a | 1195 | .macro testtail cyv |
a90d420c | 1196 | cystore SP + 96, \cyv, SP + 104 |
444083ae MW |
1197 | jnz 0b |
1198 | .endm | |
1199 | ||
1200 | .macro testcarryout c | |
1201 | mov ecx, \c | |
1202 | movdqu [ecx + 0], xmm4 | |
1203 | movdqu [ecx + 16], xmm5 | |
1204 | movdqu [ecx + 32], xmm6 | |
1205 | .endm | |
1206 | ||
0923a413 | 1207 | FUNC(test_dmul4) |
a90d420c MW |
1208 | testprologue [BP + 44] |
1209 | testldcarry [BP + 24] | |
1210 | testexpand [BP + 36], [BP + 40] | |
1211 | mov edi, [BP + 20] | |
1212 | testtop [BP + 28], [BP + 32] | |
444083ae | 1213 | call dmul4 |
a90d420c MW |
1214 | testtail [BP + 48] |
1215 | testcarryout [BP + 24] | |
444083ae | 1216 | testepilogue |
0923a413 | 1217 | ENDFUNC |
444083ae | 1218 | |
0923a413 | 1219 | FUNC(test_dmla4) |
a90d420c MW |
1220 | testprologue [BP + 44] |
1221 | testldcarry [BP + 24] | |
1222 | testexpand [BP + 36], [BP + 40] | |
1223 | mov edi, [BP + 20] | |
1224 | testtop [BP + 28], [BP + 32] | |
444083ae | 1225 | call dmla4 |
a90d420c MW |
1226 | testtail [BP + 48] |
1227 | testcarryout [BP + 24] | |
444083ae | 1228 | testepilogue |
0923a413 | 1229 | ENDFUNC |
444083ae | 1230 | |
0923a413 | 1231 | FUNC(test_mul4) |
a90d420c MW |
1232 | testprologue [BP + 36] |
1233 | testldcarry [BP + 24] | |
1234 | testexpand nil, [BP + 32] | |
1235 | mov edi, [BP + 20] | |
1236 | testtop nil, [BP + 28] | |
444083ae | 1237 | call mul4 |
a90d420c MW |
1238 | testtail [BP + 40] |
1239 | testcarryout [BP + 24] | |
444083ae | 1240 | testepilogue |
0923a413 | 1241 | ENDFUNC |
444083ae | 1242 | |
d0d41c6e | 1243 | FUNC(test_mul4zc) |
a90d420c MW |
1244 | testprologue [BP + 36] |
1245 | testldcarry [BP + 24] | |
1246 | testexpand nil, [BP + 32] | |
1247 | mov edi, [BP + 20] | |
1248 | testtop nil, [BP + 28] | |
d0d41c6e | 1249 | call mul4zc |
a90d420c MW |
1250 | testtail [BP + 40] |
1251 | testcarryout [BP + 24] | |
d0d41c6e MW |
1252 | testepilogue |
1253 | ENDFUNC | |
1254 | ||
0923a413 | 1255 | FUNC(test_mla4) |
a90d420c MW |
1256 | testprologue [BP + 36] |
1257 | testldcarry [BP + 24] | |
1258 | testexpand nil, [BP + 32] | |
1259 | mov edi, [BP + 20] | |
1260 | testtop nil, [BP + 28] | |
444083ae | 1261 | call mla4 |
a90d420c MW |
1262 | testtail [BP + 40] |
1263 | testcarryout [BP + 24] | |
444083ae | 1264 | testepilogue |
0923a413 | 1265 | ENDFUNC |
444083ae | 1266 | |
d0d41c6e | 1267 | FUNC(test_mla4zc) |
a90d420c MW |
1268 | testprologue [BP + 36] |
1269 | testldcarry [BP + 24] | |
1270 | testexpand nil, [BP + 32] | |
1271 | mov edi, [BP + 20] | |
1272 | testtop nil, [BP + 28] | |
d0d41c6e | 1273 | call mla4zc |
a90d420c MW |
1274 | testtail [BP + 40] |
1275 | testcarryout [BP + 24] | |
d0d41c6e MW |
1276 | testepilogue |
1277 | ENDFUNC | |
1278 | ||
0923a413 | 1279 | FUNC(test_mmul4) |
a90d420c MW |
1280 | testprologue [BP + 48] |
1281 | testexpand [BP + 40], [BP + 44] | |
1282 | mov edi, [BP + 20] | |
1283 | testtop [BP + 32], [BP + 36], mont | |
444083ae | 1284 | call mmul4 |
a90d420c MW |
1285 | testtail [BP + 52] |
1286 | mov edi, [BP + 28] | |
1287 | movdqa xmm0, [SP + 64] | |
1288 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1289 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1290 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1291 | movdqu [edi], xmm0 |
1292 | movdqu [edi + 16], xmm1 | |
a90d420c | 1293 | testcarryout [BP + 24] |
444083ae | 1294 | testepilogue |
0923a413 | 1295 | ENDFUNC |
444083ae | 1296 | |
0923a413 | 1297 | FUNC(test_mmla4) |
a90d420c MW |
1298 | testprologue [BP + 48] |
1299 | testexpand [BP + 40], [BP + 44] | |
1300 | mov edi, [BP + 20] | |
1301 | testtop [BP + 32], [BP + 36], mont | |
444083ae | 1302 | call mmla4 |
a90d420c MW |
1303 | testtail [BP + 52] |
1304 | mov edi, [BP + 28] | |
1305 | movdqa xmm0, [SP + 64] | |
1306 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1307 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1308 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1309 | movdqu [edi], xmm0 |
1310 | movdqu [edi + 16], xmm1 | |
a90d420c | 1311 | testcarryout [BP + 24] |
444083ae | 1312 | testepilogue |
0923a413 | 1313 | ENDFUNC |
444083ae | 1314 | |
0923a413 | 1315 | FUNC(test_mont4) |
a90d420c MW |
1316 | testprologue [BP + 40] |
1317 | testexpand nil, [BP + 36] | |
1318 | mov edi, [BP + 20] | |
1319 | testtop nil, [BP + 32], mont | |
444083ae | 1320 | call mont4 |
a90d420c MW |
1321 | testtail [BP + 44] |
1322 | mov edi, [BP + 28] | |
1323 | movdqa xmm0, [SP + 64] | |
1324 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1325 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1326 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1327 | movdqu [edi], xmm0 |
1328 | movdqu [edi + 16], xmm1 | |
a90d420c | 1329 | testcarryout [BP + 24] |
444083ae | 1330 | testepilogue |
0923a413 | 1331 | ENDFUNC |
444083ae MW |
1332 | |
1333 | #endif | |
1334 | ||
1335 | ///----- That's all, folks -------------------------------------------------- |