chiark - git - mdw - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
	2	///
	3	/// Large SIMD-based multiplications
	4	///
	5	/// (c) 2016 Straylight/Edgeware
	6
	7	///----- Licensing notice ---------------------------------------------------
	8	///
	9	/// This file is part of Catacomb.
	10	///
	11	/// Catacomb is free software; you can redistribute it and/or modify
	12	/// it under the terms of the GNU Library General Public License as
	13	/// published by the Free Software Foundation; either version 2 of the
	14	/// License, or (at your option) any later version.
	15	///
	16	/// Catacomb is distributed in the hope that it will be useful,
	17	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	/// GNU Library General Public License for more details.
	20	///
	21	/// You should have received a copy of the GNU Library General Public
	22	/// License along with Catacomb; if not, write to the Free
	23	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	24	/// MA 02111-1307, USA.
	25
	26	///--------------------------------------------------------------------------
	27	/// External definitions.
	28
	29	#include "config.h"
	30	#include "asm-common.h"
	31
	32	///--------------------------------------------------------------------------
	33	/// Prologue.
	34
	35	.arch pentium4
	36	.text
	37
	38	///--------------------------------------------------------------------------
	39	/// Theory.
	40	///
	41	/// We define a number of primitive fixed-size multipliers from which we can
	42	/// construct more general variable-length multipliers.
	43	///
	44	/// The basic trick is the same throughout. In an operand-scanning
	45	/// multiplication, the inner multiplication loop multiplies a
	46	/// multiple-precision operand by a single precision factor, and adds the
	47	/// result, appropriately shifted, to the result. A `finely integrated
	48	/// operand scanning' implementation of Montgomery multiplication also adds
	49	/// the product of a single-precision `Montgomery factor' and the modulus,
	50	/// calculated in the same pass. The more common `coarsely integrated
	51	/// operand scanning' alternates main multiplication and Montgomery passes,
	52	/// which requires additional carry propagation.
	53	///
	54	/// Throughout both plain-multiplication and Montgomery stages, then, one of
	55	/// the factors remains constant throughout the operation, so we can afford
	56	/// to take a little time to preprocess it. The transformation we perform is
	57	/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
	58	/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
	59	/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
	60	/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
	61	/// operands, as follows.
	62	///
	63	/// Offset 0 4 8 12
	64	/// 0 v'_0 v'_1 v''_0 v''_1
	65	/// 16 v'_2 v'_3 v''_2 v''_3
	66	///
	67	/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
	68	/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
	69	/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
	70	/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
	71	/// results in 64-bit fields. The sixteen bits of headroom allow us to add
	72	/// many products together before we must deal with carrying; it also allows
	73	/// for some calculations to be performed on the above expanded form.
	74	///
	75	/// On 32-bit x86, we are register starved: the expanded operands are kept in
	76	/// memory, typically in warm L1 cache.
	77	///
	78	/// We maintain four `carry' registers accumulating intermediate results.
	79	/// The registers' precise roles rotate during the computation; we name them
	80	/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
	81	/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
	82	/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
	83	/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
	84	/// `pmuludq' instruction acting on a scalar operand (broadcast across all
	85	/// lanes of its vector) and an operand in the expanded form above produces a
	86	/// result which can be added directly to the appropriate carry register.
	87	/// Following a pass of four multiplications, we perform some limited carry
	88	/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
	89	/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
	90	/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
	91	/// zeroed and becomes c3.
	92
	93	///--------------------------------------------------------------------------
	94	/// Macro definitions.
	95
	96	.macro mulcore r, s, d0, d1, d2, d3
	97	// Load a word r_i from R, multiply by the expanded operand [S], and leave the pieces of the product in registers D0, D1, D2, D3.
	98	// Any of D1, D2, D3 given as `nil' is not computed. The multiplies use the architectural mnemonic `pmuludq'.
	99	movd \d0, \r // (r_i, 0, 0, 0)
	100	.ifnes "\d1", "nil"
	101	movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
	102	.endif
	103	.ifnes "\d3", "nil"
	104	movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
	105	.endif
	106	pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
	107	.ifnes "\d1", "nil"
	108	psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
	109	.endif
	110	.ifnes "\d2", "nil"
	111	.ifnes "\d3", "nil"
	112	movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
	113	.else
	114	movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
	115	.endif
	116	.endif
	117	.ifnes "\d3", "nil"
	118	psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
	119	.endif
	120	.ifnes "\d1", "nil"
	121	pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
	122	.endif
	123	.ifnes "\d3", "nil"
	124	pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
	125	.endif
	126	.ifnes "\d2", "nil"
	127	.ifnes "\d3", "nil"
	128	pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
	129	.else
	130	pmuludq \d2, [\s + 16] // (r_i s'_2, r_i s''_2), straight from memory
	131	.endif
	132	.endif
	133	pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
	134	.endm
	135
	136	.macro accum c0, c1, c2, c3
	137	paddq \c0, xmm0 // C0 += XMM0 (always)
	138	.ifnes "\c1", "nil"
	139	paddq \c1, xmm1 // C1 += XMM1, unless C1 is `nil'
	140	.endif
	141	.ifnes "\c2", "nil"
	142	paddq \c2, xmm2 // C2 += XMM2, unless C2 is `nil'
	143	.endif
	144	.ifnes "\c3", "nil"
	145	paddq \c3, xmm3 // C3 += XMM3, unless C3 is `nil'
	146	.endif
	147	.endm
	148
	149	.macro mulacc r, s, c0, c1, c2, c3, z3p
	150	// Load a word r_i from R, multiply by the expanded operand [S],
	151	// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
	152	// then C3 notionally contains zero, but needs clearing; in practice,
	153	// we store the product directly rather than attempting to add. On
	154	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	155	// is not `t'.
	156	.ifeqs "\z3p", "t"
	157	mulcore \r, \s, xmm0, xmm1, xmm2, \c3 // top piece is written straight into C3
	158	accum \c0, \c1, \c2, nil
	159	.else
	160	mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 // all four pieces land in scratch registers
	161	accum \c0, \c1, \c2, \c3
	162	.endif
	163	.endm
	164
	165	.macro propout d, c, cc
	166	// Calculate an output word from C, and store it in D; propagate
	167	// carries out from C to CC in preparation for a rotation of the
	168	// carry registers. On completion, XMM3 is clobbered. If CC is
	169	// `nil', then the contribution which would have been added to it is
	170	// left in C.
	171	pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
	172	psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
	173	pslldq xmm3, 2 // (t b, 0)
	174	paddq \c, xmm3 // (c' + t b, c'')
	175	movd \d, \c // store z = (c' + t b) mod B
	176	psrlq \c, 32 // (floor((c' + t b)/B), floor(c''/B))
	177	.ifnes "\cc", "nil"
	178	paddq \cc, \c // propagate up
	179	.endif
	180	.endm
	181
	182	.macro endprop d, c, t
	183	// On entry, C contains a carry register. On exit, the low 32 bits
	184	// of the value represented in C are written to D, and the remaining
	185	// bits are left at the bottom of T. C itself is clobbered.
	186	movdqa \t, \c // (c', c'')
	187	psllq \t, 16 // (?, c'' b)
	188	pslldq \c, 8 // (0, c')
	189	paddq \t, \c // (?, c' + c'' b)
	190	psrldq \t, 8 // c' + c'' b
	191	movd \d, \t // store (c' + c'' b) mod B
	192	psrldq \t, 4 // floor((c' + c'' b)/B)
	193	.endm
	194
	195	.macro expand a, b, c, d, z
	196	// On entry, A and C hold packed 128-bit values, and Z is zero. On
	197	// exit, A:B and C:D together hold the same values in expanded
	198	// form. If C is `nil', then only expand A to A:B (D is not touched).
	199	movdqa \b, \a // (a_0, a_1, a_2, a_3)
	200	.ifnes "\c", "nil"
	201	movdqa \d, \c // (c_0, c_1, c_2, c_3)
	202	.endif
	203	punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
	204	punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
	205	.ifnes "\c", "nil"
	206	punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
	207	punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
	208	.endif
	209	pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
	210	pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
	211	.ifnes "\c", "nil"
	212	pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
	213	pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
	214	.endif
	215	.endm
	216
	217	.macro squash c0, c1, c2, c3, h, t, u
	218	// On entry, C0, C1, C2, C3 are carry registers representing a value
	219	// Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
	220	// C3, T, and U are clobbered; and the high bits of Y are stored in
	221	// H, if this is not `nil'.
	222
	223	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	224	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
	225	// them into the 32-bit-aligned pieces above and below. But before
	226	// we can do that, we must gather them together.
	227	movdqa \t, \c0
	228	movdqa \u, \c1
	229	punpcklqdq \t, \c2 // (y'_0, y'_2)
	230	punpckhqdq \c0, \c2 // (y''_0, y''_2)
	231	punpcklqdq \u, \c3 // (y'_1, y'_3)
	232	punpckhqdq \c1, \c3 // (y''_1, y''_3)
	233
	234	// Now split the double-prime pieces. The high (up to) 48 bits will
	235	// go up; the low 16 bits go down.
	236	movdqa \c2, \c0
	237	movdqa \c3, \c1
	238	psllq \c2, 48 // discard all but the low 16 bits of each half
	239	psllq \c3, 48 // likewise
	240	psrlq \c0, 16 // high parts of (y''_0, y''_2)
	241	psrlq \c1, 16 // high parts of (y''_1, y''_3)
	242	psrlq \c2, 32 // low parts of (y''_0, y''_2)
	243	psrlq \c3, 32 // low parts of (y''_1, y''_3)
	244	.ifnes "\h", "nil"
	245	movdqa \h, \c1 // save high parts of (y''_1, y''_3) for H
	246	.endif
	247	pslldq \c1, 8 // high part of (0, y''_1)
	248
	249	paddq \t, \c2 // propagate down
	250	paddq \u, \c3
	251	paddq \t, \c1 // and up: (y_0, y_2)
	252	paddq \u, \c0 // (y_1, y_3)
	253	.ifnes "\h", "nil"
	254	psrldq \h, 8 // high part of (y''_3, 0)
	255	.endif
	256
	257	// Finally extract the answer. This complicated dance is better than
	258	// storing to memory and loading, because the piecemeal stores
	259	// inhibit store forwarding.
	260	movdqa \c3, \t // (y_0, y_2)
	261	movdqa \c0, \t // (y^*_0, ?, ?, ?)
	262	psrldq \t, 8 // (y_2, 0)
	263	psrlq \c3, 32 // (floor(y_0/B), ?)
	264	paddq \c3, \u // (y_1 + floor(y_0/B), ?)
	265	pslldq \c0, 12 // (0, 0, 0, y^*_0)
	266	movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
	267	psrldq \u, 8 // (y_3, 0)
	268	psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
	269	paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
	270	pslldq \c1, 12 // (0, 0, 0, y^*_1)
	271	psrldq \c0, 12 // (y^*_0, 0, 0, 0)
	272	movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
	273	psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	274	paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	275	pslldq \c2, 12 // (0, 0, 0, y^*_2)
	276	psrldq \c1, 8 // (0, y^*_1, 0, 0)
	277	psrldq \c2, 4 // (0, 0, y^*_2, 0)
	278	.ifnes "\h", "nil"
	279	movdqu \t, \c3 // keep y_3 plus incoming carry for the high bits
	280	pxor \u, \u // zero, used to clear the top half of H
	281	.endif
	282	pslldq \c3, 12 // (0, 0, 0, y^*_3)
	283	por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
	284	por \c2, \c3 // (0, 0, y^*_2, y^*_3)
	285	por \c0, \c2 // y mod B^4
	286	.ifnes "\h", "nil"
	287	psrlq \t, 32 // very high bits of y
	288	paddq \h, \t
	289	punpcklqdq \h, \u // carry up
	290	.endif
	291	.endm
	292
	293	.macro carryadd
	294	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	295	// hold the incoming carry registers c0, c1, and c2 representing a
	296	// carry-in C.
	297	//
	298	// On exit, the carry registers, including XMM7, are updated to hold
	299	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
	300	// registers are preserved.
	301	movd xmm0, [edi + 0] // (a_0, 0)
	302	movd xmm1, [edi + 4] // (a_1, 0)
	303	movd xmm2, [edi + 8] // (a_2, 0)
	304	movd xmm7, [edi + 12] // (a_3, 0): loaded directly as c3
	305
	306	paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
	307	paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
	308	paddq xmm6, xmm2 // (c'_2 + a_2, c''_2); a_3 stays in XMM7
	309	.endm
	310
	311	///--------------------------------------------------------------------------
	312	/// Primitive multipliers and related utilities.
	313
	314	INTFUNC(carryprop)
	315	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	316	// form. Store the low 128 bits of the represented carry to [EDI] as
	317	// a packed 128-bit value, and leave the remaining 16 bits in the low
	318	// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
	319	endprologue
	320
	321	propout [edi + 0], xmm4, xmm5 // word 0; carry on into XMM5
	322	propout [edi + 4], xmm5, xmm6 // word 1; carry on into XMM6
	323	propout [edi + 8], xmm6, nil
	324	endprop [edi + 12], xmm6, xmm4 // word 3; leftover bits end up in XMM4
	325	ret
	326
	327	ENDFUNC
	328
	329	INTFUNC(dmul4)
	330	// On entry, EDI points to the destination buffer; EAX and EBX point
	331	// to the packed operands U and X; ECX and EDX point to the expanded
	332	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	333	// registers c0, c1, and c2; c3 is assumed to be zero.
	334	//
	335	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	336	// [EDI], and update the carry registers with the carry out. The
	337	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	338	// general-purpose registers are preserved.
	339	endprologue
	340
	341	mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
	342	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	343	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	344
	345	mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	346	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
	347	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	348
	349	mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	350	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
	351	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	352
	353	mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	354	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
	355	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are back in XMM4..XMM6
	356
	357	ret
	358
	359	ENDFUNC
	360
	361	INTFUNC(dmla4)
	362	// On entry, EDI points to the destination buffer, which also
	363	// contains an addend A to accumulate; EAX and EBX point to the
	364	// packed operands U and X; ECX and EDX point to the expanded
	365	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	366	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	367	// to be zero.
	368	//
	369	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	370	// [EDI], and update the carry registers with the carry out. The
	371	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	372	// general-purpose registers are preserved.
	373	endprologue
	374
	375	carryadd
	376
	377	mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
	378	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	379	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	380
	381	mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	382	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
	383	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	384
	385	mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	386	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
	387	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	388
	389	mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	390	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
	391	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are back in XMM4..XMM6
	392
	393	ret
	394
	395	ENDFUNC
	396
	397	INTFUNC(mul4zc)
	398	// On entry, EDI points to the destination buffer; EBX points to a
	399	// packed operand X; and EDX points to an expanded operand Y.
	400	//
	401	// On exit, we write the low 128 bits of the product X Y to [EDI],
	402	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	403	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	404	// general-purpose registers are preserved.
	405	endprologue
	406
	407	mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 // no carry-in: the product itself initializes c0..c3
	408	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	409
	410	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	411	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	412
	413	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	414	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	415
	416	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	417	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are left in XMM4..XMM6
	418
	419	ret
	420
	421	ENDFUNC
	422
	423	INTFUNC(mul4)
	424	// On entry, EDI points to the destination buffer; EBX points to a
	425	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	426	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	427	// representing a carry-in C; c3 is assumed to be zero.
	428	//
	429	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	430	// and update the carry registers with the carry out. The registers
	431	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	432	// general-purpose registers are preserved.
	433	endprologue
	434
	435	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
	436	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	437
	438	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	439	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	440
	441	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	442	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	443
	444	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	445	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are back in XMM4..XMM6
	446
	447	ret
	448
	449	ENDFUNC
	450
	451	INTFUNC(mla4zc)
	452	// On entry, EDI points to the destination buffer, which also
	453	// contains an addend A to accumulate; EBX points to a packed operand
	454	// X; and EDX points to an expanded operand Y.
	455	//
	456	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	457	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	458	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	459	// general-purpose registers are preserved.
	460	endprologue
	461
	462	movd xmm4, [edi + 0] // c0 = (a_0, 0)
	463	movd xmm5, [edi + 4] // c1 = (a_1, 0)
	464	movd xmm6, [edi + 8] // c2 = (a_2, 0)
	465	movd xmm7, [edi + 12] // c3 = (a_3, 0)
	466
	467	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	468	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	469
	470	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	471	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	472
	473	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	474	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	475
	476	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	477	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are left in XMM4..XMM6
	478
	479	ret
	480
	481	ENDFUNC
	482
	483	INTFUNC(mla4)
	484	// On entry, EDI points to the destination buffer, which also
	485	// contains an addend A to accumulate; EBX points to a packed operand
	486	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	487	// the incoming carry registers c0, c1, and c2, representing a
	488	// carry-in C; c3 is assumed to be zero.
	489	//
	490	// On exit, we write the low 128 bits of the sum A + C + X Y to
	491	// [EDI], and update the carry registers with the carry out. The
	492	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	493	// general-purpose registers are preserved.
	494	endprologue
	495
	496	carryadd
	497
	498	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	499	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	500
	501	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	502	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	503
	504	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	505	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	506
	507	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	508	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are back in XMM4..XMM6
	509
	510	ret
	511
	512	ENDFUNC
	513
	514	INTFUNC(mmul4)
	515	// On entry, EDI points to the destination buffer; EAX and EBX point
	516	// to the packed operands U and N; ECX and ESI point to the expanded
	517	// operands V and M; and EDX points to a place to store an expanded
	518	// result Y (32 bytes, at a 16-byte boundary). The stack pointer
	519	// must be 16-byte aligned. (This is not the usual convention, which
	520	// requires alignment before the call.)
	521	//
	522	// On exit, we write Y = U V M mod B^4 to [EDX], and the low 128 bits
	523	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	524	// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
	525	// XMM7 are clobbered; the general-purpose registers are preserved.
	526	stalloc 48 // space for the carries
	527	endprologue
	528
	529	// Calculate W = U V, and leave it in the destination. Stash the
	530	// carry pieces for later.
	531	mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 // no addend: the product itself initializes c0..c3
	532	propout [edi + 0], xmm4, xmm5 // word 0 of W out
	533	jmp 5f // the rest is shared with mmla4, which also frees the stack space
	534
	535	ENDFUNC
	536
	537	INTFUNC(mmla4)
	538	// On entry, EDI points to the destination buffer, which also
	539	// contains an addend A to accumulate; EAX and EBX point
	540	// to the packed operands U and N; ECX and ESI point to the expanded
	541	// operands V and M; and EDX points to a place to store an expanded
	542	// result Y (32 bytes, at a 16-byte boundary). The stack pointer
	543	// must be 16-byte aligned. (This is not the usual convention, which
	544	// requires alignment before the call.)
	545	//
	546	// On exit, we write Y = (A + U V) M mod B^4 to [EDX], and the low 128
	547	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	548	// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
	549	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	550	// preserved.
	551	stalloc 48 // space for the carries
	552	endprologue
	553
	554	movd xmm4, [edi + 0] // c0 = (a_0, 0)
	555	movd xmm5, [edi + 4] // c1 = (a_1, 0)
	556	movd xmm6, [edi + 8] // c2 = (a_2, 0)
	557	movd xmm7, [edi + 12] // c3 = (a_3, 0)
	558	mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
	559	propout [edi + 0], xmm4, xmm5 // word 0 of W out
	560
	561	5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	562	propout [edi + 4], xmm5, xmm6 // word 1 of W out
	563
	564	mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	565	propout [edi + 8], xmm6, xmm7 // word 2 of W out
	566
	567	mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	568	propout [edi + 12], xmm7, xmm4 // word 3 of W out
	569
	570	movdqa [esp + 0], xmm4 // stash carry c0 from the W pass
	571	movdqa [esp + 16], xmm5 // stash carry c1
	572	movdqa [esp + 32], xmm6 // stash carry c2
	573
	574	// Calculate Y = W M. Only the low 128 bits are kept, so each successive row computes one fewer piece.
	575	mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
	576
	577	mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
	578	accum xmm5, xmm6, xmm7, nil
	579
	580	mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
	581	accum xmm6, xmm7, nil, nil
	582
	583	mulcore [edi + 12], esi, xmm0, nil, nil, nil
	584	accum xmm7, nil, nil, nil
	585
	586	// That's lots of pieces. Now we have to assemble the answer.
	587	squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
	588
	589	// Expand it.
	590	pxor xmm2, xmm2 // zero register required by `expand'
	591	expand xmm4, xmm1, nil, nil, xmm2
	592	movdqa [edx + 0], xmm4 // Y expanded low
	593	movdqa [edx + 16], xmm1 // Y expanded high
	594
	595	// Initialize the carry from the value for W we calculated earlier.
	596	movd xmm4, [edi + 0] // c0 = (w_0, 0)
	597	movd xmm5, [edi + 4] // c1 = (w_1, 0)
	598	movd xmm6, [edi + 8] // c2 = (w_2, 0)
	599	movd xmm7, [edi + 12] // c3 = (w_3, 0)
	600
	601	// Finish the calculation by adding the Montgomery product.
	602	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	603	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	604
	605	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	606	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	607
	608	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	609	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	610
	611	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	612	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are back in XMM4..XMM6
	613
	614	// And add on the carry we calculated earlier.
	615	paddq xmm4, [esp + 0]
	616	paddq xmm5, [esp + 16]
	617	paddq xmm6, [esp + 32]
	618
	619	// And, with that, we're done.
	620	stfree 48
	621	ret
	622
	623	ENDFUNC
	624
	625	INTFUNC(mont4)
	626	// On entry, EDI points to the destination buffer holding a packed
	627	// value W; EBX points to a packed operand N; ESI points to an
	628	// expanded operand M; and EDX points to a place to store an expanded
	629	// result Y (32 bytes, at a 16-byte boundary).
	630	//
	631	// On exit, we write Y = W M mod B^4 to [EDX], and the low 128 bits
	632	// of the sum W + N Y to [EDI], leaving the remaining carry in
	633	// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
	634	// XMM7 are clobbered; the general-purpose registers are preserved.
	635	endprologue
	636
	637	// Calculate Y = W M. Only the low 128 bits are kept, so each successive row computes one fewer piece.
	638	mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
	639
	640	mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
	641	accum xmm5, xmm6, xmm7, nil
	642
	643	mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
	644	accum xmm6, xmm7, nil, nil
	645
	646	mulcore [edi + 12], esi, xmm0, nil, nil, nil
	647	accum xmm7, nil, nil, nil
	648
	649	// That's lots of pieces. Now we have to assemble the answer.
	650	squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
	651
	652	// Expand it.
	653	pxor xmm2, xmm2 // zero register required by `expand'
	654	expand xmm4, xmm1, nil, nil, xmm2
	655	movdqa [edx + 0], xmm4 // Y expanded low
	656	movdqa [edx + 16], xmm1 // Y expanded high
	657
	658	// Initialize the carry from W.
	659	movd xmm4, [edi + 0] // c0 = (w_0, 0)
	660	movd xmm5, [edi + 4] // c1 = (w_1, 0)
	661	movd xmm6, [edi + 8] // c2 = (w_2, 0)
	662	movd xmm7, [edi + 12] // c3 = (w_3, 0)
	663
	664	// Finish the calculation by adding the Montgomery product.
	665	mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
	666	propout [edi + 0], xmm4, xmm5 // word 0 out; XMM5 is the next c0
	667
	668	mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	669	propout [edi + 4], xmm5, xmm6 // word 1 out; XMM6 is the next c0
	670
	671	mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	672	propout [edi + 8], xmm6, xmm7 // word 2 out; XMM7 is the next c0
	673
	674	mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	675	propout [edi + 12], xmm7, xmm4 // word 3 out; carries are left in XMM4..XMM6
	676
	677	// And, with that, we're done.
	678	ret
	679
	680	ENDFUNC
	681
	682	///--------------------------------------------------------------------------
	683	/// Bulk multipliers.
	684
	685	FUNC(mpx_umul4_x86_sse2)
	686	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	687	// const mpw *bv, const mpw *bvl);
	688
	689	// Build a stack frame. Arguments will be relative to EBP, as
	690	// follows.
	691	//
	692	// ebp + 20 dv
	693	// ebp + 24 av
	694	// ebp + 28 avl
	695	// ebp + 32 bv
	696	// ebp + 36 bvl
	697	//
	698	// Locals are relative to ESP, as follows.
	699	//
	700	// esp + 0 expanded Y (32 bytes)
	701	// esp + 32 (top of locals)
	702	pushreg ebp
	703	pushreg ebx
	704	pushreg esi
	705	pushreg edi
	706	setfp ebp
	707	and esp, ~15 // the expanded Y is accessed with `movdqa', so align it
	708	sub esp, 32 // room for the expanded Y
	709	endprologue
	710
	711	// Prepare for the first iteration.
	712	mov esi, [ebp + 32] // -> bv[0]
	713	pxor xmm7, xmm7 // zero register required by `expand'
	714	movdqu xmm0, [esi] // bv[0]
	715	mov edi, [ebp + 20] // -> dv[0]
	716	mov ecx, edi // outer loop dv cursor
	717	expand xmm0, xmm1, nil, nil, xmm7
	718	mov ebx, [ebp + 24] // -> av[0]
	719	mov eax, [ebp + 28] // -> av[m] = av limit
	720	mov edx, esp // -> expanded Y = bv[0]
	721	movdqa [esp + 0], xmm0 // bv[0] expanded low
	722	movdqa [esp + 16], xmm1 // bv[0] expanded high
	723	call mul4zc // first block: no carry-in, no addend
	724	add ebx, 16 // step the av cursor
	725	add edi, 16 // step the inner dv cursor
	726	add ecx, 16 // step the outer dv cursor
	727	add esi, 16 // step the bv cursor
	728	cmp ebx, eax // all done?
	729	jae 8f
	730
	731	.p2align 4
	732	// Continue with the first iteration.
	733	0: call mul4
	734	add ebx, 16
	735	add edi, 16
	736	cmp ebx, eax // all done?
	737	jb 0b
	738
	739	// Write out the leftover carry. There can be no tail here.
	740	8: call carryprop
	741	cmp esi, [ebp + 36] // more passes to do?
	742	jae 9f
	743
	744	.p2align 4
	745	// Set up for the next pass.
	746	1: movdqu xmm0, [esi] // bv[i]
	747	mov edi, ecx // -> dv[i]
	748	pxor xmm7, xmm7 // zero register required by `expand'
	749	expand xmm0, xmm1, nil, nil, xmm7
	750	mov ebx, [ebp + 24] // -> av[0]
	751	movdqa [esp + 0], xmm0 // bv[i] expanded low
	752	movdqa [esp + 16], xmm1 // bv[i] expanded high
	753	call mla4zc // first block of the pass: addend from dv, no carry-in
	754	add edi, 16
	755	add ebx, 16
	756	add ecx, 16
	757	add esi, 16
	758	cmp ebx, eax // done yet?
	759	jae 8f
	760
	761	.p2align 4
	762	// Continue...
	763	0: call mla4
	764	add ebx, 16
	765	add edi, 16
	766	cmp ebx, eax // reached the av limit?
	767	jb 0b
	768
	769	// Finish off this pass. There was no tail on the previous pass, and
	770	// there can be none on this pass.
	771	8: call carryprop
	772	cmp esi, [ebp + 36] // more passes to do?
	773	jb 1b
	774
	775	// All over. Use `popreg' to mirror the `pushreg's in the prologue.
	776	9: dropfp
	777	popreg edi
	778	popreg esi
	779	popreg ebx
	780	popreg ebp
	781	ret
	782
	783	ENDFUNC
	784
	785	FUNC(mpxmont_mul4_x86_sse2)
	786	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	787	// const mpw *nv, size_t n, const mpw *mi);
	788
	789	// Build a stack frame. Arguments will be relative to EBP, as
	790	// follows.
	791	//
	792	// ebp + 20 dv
	793	// ebp + 24 av
	794	// ebp + 28 bv
	795	// ebp + 32 nv
	796	// ebp + 36 n (nonzero multiple of 4)
	797	// ebp + 40 mi
	798	//
	799	// Locals are relative to ESP, which is 4 mod 16, as follows.
	800	//
	801	// esp + 0 outer loop dv
	802	// esp + 4 outer loop bv
	803	// esp + 8 av limit (mostly in ESI)
	804	// esp + 12 expanded V (32 bytes)
	805	// esp + 44 expanded M (32 bytes)
	806	// esp + 76 expanded Y (32 bytes)
	807	// esp + 108 bv limit
	808	// esp + 112 (gap)
	809	// esp + 124 (top of locals)
	810	pushreg ebp
	811	pushreg ebx
	812	pushreg esi
	813	pushreg edi
	814	setfp ebp
	815	and esp, ~15
	816	sub esp, 124 // leaves ESP at 4 mod 16, so ESP + 12 is 16-byte aligned
	817	endprologue
	818
	819	// Establish the expanded operands.
	820	pxor xmm7, xmm7 // zero register required by `expand'
	821	mov ecx, [ebp + 28] // -> bv
	822	mov edx, [ebp + 40] // -> mi
	823	movdqu xmm0, [ecx] // bv[0]
	824	movdqu xmm2, [edx] // mi
	825	expand xmm0, xmm1, xmm2, xmm3, xmm7
	826	movdqa [esp + 12], xmm0 // bv[0] expanded low
	827	movdqa [esp + 28], xmm1 // bv[0] expanded high
	828	movdqa [esp + 44], xmm2 // mi expanded low
	829	movdqa [esp + 60], xmm3 // mi expanded high
	830
	831	// Set up the outer loop state and prepare for the first iteration.
	832	mov edx, [ebp + 36] // n
	833	mov eax, [ebp + 24] // -> U = av[0]
	834	mov ebx, [ebp + 32] // -> X = nv[0]
	835	mov edi, [ebp + 20] // -> Z = dv[0]
	836	mov [esp + 4], ecx // save outer loop bv
	837	lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
	838	lea edx, [eax + 4*edx] // -> av[n/4] = av limit
	839	mov [esp + 0], edi // save outer loop dv
	840	mov [esp + 108], ecx // save bv limit
	841	mov [esp + 8], edx // save av limit
	842	lea ecx, [esp + 12] // -> expanded V = bv[0]
	843	lea esi, [esp + 44] // -> expanded M = mi
	844	lea edx, [esp + 76] // -> space for Y
	845	call mmul4
	846	mov esi, [esp + 8] // recover av limit
	847	add edi, 16
	848	add eax, 16
	849	add ebx, 16
	850	cmp eax, esi // done already?
	851	jae 8f
	852	mov [esp + 0], edi // outer loop dv moves on one block
	853
	854	.p2align 4
	855	// Complete the first inner loop.
	856	0: call dmul4
	857	add edi, 16
	858	add eax, 16
	859	add ebx, 16
	860	cmp eax, esi // done yet?
	861	jb 0b
	862
	863	// Still have carries left to propagate.
	864	call carryprop
	865	movd [edi + 16], xmm4 // leftover carry bits go just above the block
	866
	867	.p2align 4
	868	// Embark on the next iteration. (There must be one. If n = 1, then
	869	// we would have bailed above, to label 8. Similarly, the subsequent
	870	// iterations can fall into the inner loop immediately.)
	871	1: mov eax, [esp + 4] // -> bv[i - 1]
	872	mov edi, [esp + 0] // -> Z = dv[i]
	873	add eax, 16 // -> bv[i]
	874	pxor xmm7, xmm7 // zero register required by `expand'
	875	movdqu xmm0, [eax] // bv[i]
	876	mov [esp + 4], eax // save outer loop bv
	877	cmp eax, [esp + 108] // done yet?
	878	jae 9f
	879	mov ebx, [ebp + 32] // -> X = nv[0]
	880	lea esi, [esp + 44] // -> expanded M = mi
	881	mov eax, [ebp + 24] // -> U = av[0]
	882	expand xmm0, xmm1, nil, nil, xmm7
	883	movdqa [esp + 12], xmm0 // bv[i] expanded low
	884	movdqa [esp + 28], xmm1 // bv[i] expanded high
	885	call mmla4
	886	mov esi, [esp + 8] // recover av limit
	887	add edi, 16
	888	add eax, 16
	889	add ebx, 16
	890	mov [esp + 0], edi // outer loop dv moves on one block
	891
	892	.p2align 4
	893	// Complete the next inner loop.
	894	0: call dmla4
	895	add edi, 16
	896	add eax, 16
	897	add ebx, 16
	898	cmp eax, esi // reached the av limit?
	899	jb 0b
	900
	901	// Still have carries left to propagate, and they overlap the
	902	// previous iteration's final tail, so read that in and add it.
	903	movd xmm0, [edi] // previous tail word
	904	paddq xmm4, xmm0 // fold it into c0
	905	call carryprop
	906	movd [edi + 16], xmm4 // leftover carry bits go just above the block
	907
	908	// Back again.
	909	jmp 1b
	910
	911	// First iteration was short. Write out the carries and we're done.
	912	// (This could be folded into the main loop structure, but that would
	913	// penalize small numbers more.)
	914	8: call carryprop
	915	movd [edi + 16], xmm4 // leftover carry bits go just above the block
	916
	917	// All done.
	918	9: dropfp
	919	popreg edi
	920	popreg esi
	921	popreg ebx
	922	popreg ebp
	923	ret
	924
	925	ENDFUNC
	926
	927	FUNC(mpxmont_redc4_x86_sse2)
	928	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	929	// size_t n, const mpw *mi);
	930	// Works on dv in place, one 16-byte block per outer pass: mont4, then mla4 along nv, then a carry ripple up to dvl.
	931	// Build a stack frame. Arguments will be relative to EBP, as
	932	// follows.
	933	//
	934	// ebp + 20 dv
	935	// ebp + 24 dvl
	936	// ebp + 28 nv
	937	// ebp + 32 n (nonzero multiple of 4)
	938	// ebp + 36 mi
	939	//
	940	// Locals are relative to ESP, as follows.
	941	//
	942	// esp + 0 outer loop dv
	943	// esp + 4 outer dv limit
	944	// esp + 8 blocks-of-4 dv limit
	945	// esp + 12 expanded M (32 bytes)
	946	// esp + 44 expanded Y (32 bytes)
	947	// esp + 76 (top of locals)
	948	pushreg ebp
	949	pushreg ebx
	950	pushreg esi
	951	pushreg edi
	952	setfp ebp
	953	and esp, ~15
	954	sub esp, 76 // leaves esp + 12, + 28, + 44 16-byte aligned for movdqa
	955	endprologue
	956
	957	// Establish the expanded operands and the blocks-of-4 dv limit.
	958	mov edi, [ebp + 20] // -> Z = dv[0]
	959	pxor xmm7, xmm7
	960	mov eax, [ebp + 24] // -> dvl = dv limit
	961	sub eax, edi // length of dv in bytes
	962	mov edx, [ebp + 36] // -> mi
	963	movdqu xmm0, [edx] // mi
	964	and eax, ~15 // mask off the tail end
	965	expand xmm0, xmm1, nil, nil, xmm7
	966	add eax, edi // find limit
	967	movdqa [esp + 12], xmm0 // mi expanded low
	968	movdqa [esp + 28], xmm1 // mi expanded high
	969	mov [esp + 8], eax
	970
	971	// Set up the outer loop state and prepare for the first iteration.
	972	mov ecx, [ebp + 32] // n
	973	mov ebx, [ebp + 28] // -> X = nv[0]
	974	lea edx, [edi + 4*ecx] // -> dv + 4 n bytes = outer dv limit
	975	lea ecx, [ebx + 4*ecx] // -> nv + 4 n bytes = nv limit
	976	mov [esp + 0], edi
	977	mov [esp + 4], edx
	978	lea esi, [esp + 12] // -> expanded M = mi
	979	lea edx, [esp + 44] // -> space for Y
	980	call mont4
	981	add edi, 16
	982	add ebx, 16
	983	cmp ebx, ecx // done already?
	984	jae 8f
	985
	986	.p2align 4
	987	// Complete the first inner loop.
	988	5: call mla4
	989	add ebx, 16
	990	add edi, 16
	991	cmp ebx, ecx // done yet?
	992	jb 5b
	993
	994	// Still have carries left to propagate.
	995	8: carryadd
	996	mov esi, [esp + 8] // -> dv blocks limit
	997	mov edx, [ebp + 24] // dv limit
	998	psllq xmm7, 16
	999	pslldq xmm7, 8
	1000	paddq xmm6, xmm7
	1001	call carryprop
	1002	movd eax, xmm4
	1003	add edi, 16
	1004	cmp edi, esi
	1005	jae 7f
	1006
	1007	.p2align 4
	1008	// Continue carry propagation until the end of the buffer.
	1009	0: add [edi], eax
	1010	mov eax, 0 // preserves flags
	1011	adcd [edi + 4], 0
	1012	adcd [edi + 8], 0
	1013	adcd [edi + 12], 0
	1014	adc eax, 0
	1015	add edi, 16
	1016	cmp edi, esi
	1017	jb 0b
	1018
	1019	// Deal with the tail end. (Stores at least once: presumably dvl is never on a block boundary -- TODO confirm.)
	1020	7: add [edi], eax
	1021	mov eax, 0 // preserves flags
	1022	adc eax, 0 // capture the carry out of the word add first...
	1023	add edi, 4 // ...because advancing the pointer overwrites CF
	1024	cmp edi, edx
	1025	jb 7b
	1026
	1027	// All done for this iteration. Start the next. (This must have at
	1028	// least one follow-on iteration, or we'd not have started this outer
	1029	// loop.)
	1030	8: mov edi, [esp + 0] // -> dv[i - 1]
	1031	mov ebx, [ebp + 28] // -> X = nv[0]
	1032	lea edx, [esp + 44] // -> space for Y
	1033	lea esi, [esp + 12] // -> expanded M = mi
	1034	add edi, 16 // -> Z = dv[i]
	1035	cmp edi, [esp + 4] // all done yet?
	1036	jae 9f
	1037	mov [esp + 0], edi
	1038	call mont4
	1039	add edi, 16
	1040	add ebx, 16
	1041	jmp 5b
	1042
	1043	// All over.
	1044	9: dropfp
	1045	popreg edi
	1046	popreg esi
	1047	popreg ebx
	1048	popreg ebp
	1049	ret
	1050
	1051	ENDFUNC
	1052
	1053	///--------------------------------------------------------------------------
	1054	/// Testing and performance measurement.
	1055
	1056	#ifdef TEST_MUL4
	1057
	1058	.macro cysetup c // record the starting cycle count as 8 bytes at [c]
	1059	rdtsc // time-stamp counter -> edx:eax (overwrites both)
	1060	mov [\c], eax // low half
	1061	mov [\c + 4], edx // high half
	1062	.endm
	1063
	1064	.macro cystore c, v, n // log the cycles elapsed since cysetup c; decrement n, leaving ZF for the caller
	1065	rdtsc
	1066	sub eax, [\c] // edx:eax = now - start, as a 64-bit subtraction
	1067	sbb edx, [\c + 4]
	1068	mov ebx, [\v] // base of the array of 8-byte results
	1069	mov ecx, [\n] // remaining count
	1070	dec ecx // ZF set once the count reaches zero
	1071	mov [\n], ecx // the movs from here on leave the flags from dec intact
	1072	mov [ebx + ecx*8], eax // slot index is the already-decremented count
	1073	mov [ebx + ecx*8 + 4], edx
	1074	.endm
	1075
	1076	.macro testprologue // shared frame setup for the test_* entry points
	1077	pushreg ebp
	1078	pushreg ebx
	1079	pushreg esi
	1080	pushreg edi
	1081	setfp ebp
	1082	and esp, ~15 // align first, so the subtraction below fixes the layout
	1083	sub esp, 3*32 + 12 // 12 bytes of scalars, then three 32-byte areas (each 16-byte aligned)
	1084	endprologue
	1085	// vars:
	1086	// esp + 0 = cycles
	1087	// esp + 12 = v expanded
	1088	// esp + 44 = y expanded
	1089	// esp + 76 = mont-mode output area (what testtop passes in edx)
	1090	.endm
	1091
	1092	.macro testepilogue // undo testprologue and return
	1093	dropfp
	1094	popreg edi // pops mirror the pushes in testprologue, in reverse
	1095	popreg esi
	1096	popreg ebx
	1097	popreg ebp
	1098	ret
	1099	.endm
	1100
	1101	.macro testldcarry c // load xmm4, xmm5, xmm6 from the 48 bytes addressed by c (uses ecx)
	1102	mov ecx, \c // -> c
	1103	movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
	1104	movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
	1105	movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
	1106	.endm
	1107
	1108	.macro testexpand v, y // expand operand v and/or y into the frame; pass nil to skip either
	1109	pxor xmm7, xmm7 // zero register handed to expand
	1110	.ifnes "\v", "nil"
	1111	mov ecx, \v
	1112	movdqu xmm0, [ecx] // unaligned 16-byte load of v
	1113	expand xmm0, xmm1, nil, nil, xmm7
	1114	movdqa [esp + 12], xmm0 // v expanded low
	1115	movdqa [esp + 28], xmm1 // v expanded high
	1116	.endif
	1117	.ifnes "\y", "nil"
	1118	mov edx, \y
	1119	movdqu xmm2, [edx] // unaligned 16-byte load of y
	1120	expand xmm2, xmm3, nil, nil, xmm7
	1121	movdqa [esp + 44], xmm2 // y expanded low
	1122	movdqa [esp + 60], xmm3 // y expanded high
	1123	.endif
	1124	.endm
	1125
	1126	.macro testtop u, x, mode // head of the timed loop: set up pointer registers and start the cycle count
	1127	.p2align 4
	1128	0: // loop head; testtail branches back here
	1129	.ifnes "\u", "nil"
	1130	lea ecx, [esp + 12] // -> v expanded
	1131	.endif
	1132	mov ebx, \x
	1133	.ifeqs "\mode", "mont"
	1134	lea esi, [esp + 44] // -> y expanded
	1135	.endif
	1136	cysetup esp + 0 // rdtsc overwrites eax and edx, so those are loaded afterwards
	1137	.ifnes "\u", "nil"
	1138	mov eax, \u
	1139	.endif
	1140	.ifeqs "\mode", "mont"
	1141	lea edx, [esp + 76] // -> 32-byte area the mont tests copy out after the loop
	1142	.else
	1143	lea edx, [esp + 44] // -> y expanded
	1144	.endif
	1145	.endm
	1146
	1147	.macro testtail cyv, n // foot of the timed loop: log the cycle count, repeat until n reaches zero
	1148	cystore esp + 0, \cyv, \n // ends with ZF reflecting the decremented count
	1149	jnz 0b // back to the 0: label placed by testtop
	1150	.endm
	1151
	1152	.macro testcarryout c // store xmm4, xmm5, xmm6 to the 48 bytes addressed by c (uses ecx)
	1153	mov ecx, \c
	1154	movdqu [ecx + 0], xmm4 // same layout that testldcarry reads
	1155	movdqu [ecx + 16], xmm5
	1156	movdqu [ecx + 32], xmm6
	1157	.endm
	1158
	1159	FUNC(test_dmul4) // timing harness: calls dmul4 repeatedly, logging a cycle count per call
	1160	testprologue
	1161	testldcarry [ebp + 24] // carry block in
	1162	testexpand [ebp + 36], [ebp + 40] // v and y, expanded into the frame
	1163	mov edi, [ebp + 20] // first argument, handed to dmul4 in edi
	1164	testtop [ebp + 28], [ebp + 32] // u -> eax, x -> ebx
	1165	call dmul4
	1166	testtail [ebp + 48], [ebp + 44] // cycle log, repeat count
	1167	testcarryout [ebp + 24] // carry block out, same buffer as the input
	1168	testepilogue
	1169	ENDFUNC
	1170
	1171	FUNC(test_dmla4) // timing harness: calls dmla4 repeatedly, logging a cycle count per call
	1172	testprologue
	1173	testldcarry [ebp + 24] // carry block in
	1174	testexpand [ebp + 36], [ebp + 40] // v and y, expanded into the frame
	1175	mov edi, [ebp + 20] // first argument, handed to dmla4 in edi
	1176	testtop [ebp + 28], [ebp + 32] // u -> eax, x -> ebx
	1177	call dmla4
	1178	testtail [ebp + 48], [ebp + 44] // cycle log, repeat count
	1179	testcarryout [ebp + 24] // carry block out, same buffer as the input
	1180	testepilogue
	1181	ENDFUNC
	1182
	1183	FUNC(test_mul4) // timing harness: calls mul4 repeatedly, logging a cycle count per call
	1184	testprologue
	1185	testldcarry [ebp + 24] // carry block in
	1186	testexpand nil, [ebp + 32] // only y is expanded
	1187	mov edi, [ebp + 20] // first argument, handed to mul4 in edi
	1188	testtop nil, [ebp + 28] // x -> ebx; eax and ecx are not set up
	1189	call mul4
	1190	testtail [ebp + 40], [ebp + 36] // cycle log, repeat count
	1191	testcarryout [ebp + 24] // carry block out, same buffer as the input
	1192	testepilogue
	1193	ENDFUNC
	1194
	1195	FUNC(test_mla4) // timing harness: calls mla4 repeatedly, logging a cycle count per call
	1196	testprologue
	1197	testldcarry [ebp + 24] // carry block in
	1198	testexpand nil, [ebp + 32] // only y is expanded
	1199	mov edi, [ebp + 20] // first argument, handed to mla4 in edi
	1200	testtop nil, [ebp + 28] // x -> ebx; eax and ecx are not set up
	1201	call mla4
	1202	testtail [ebp + 40], [ebp + 36] // cycle log, repeat count
	1203	testcarryout [ebp + 24] // carry block out, same buffer as the input
	1204	testepilogue
	1205	ENDFUNC
	1206
	1207	FUNC(test_mmul4) // timing harness: calls mmul4 repeatedly; no carry block is loaded beforehand
	1208	testprologue
	1209	testexpand [ebp + 40], [ebp + 44] // v and y, expanded into the frame
	1210	mov edi, [ebp + 20] // first argument, handed to mmul4 in edi
	1211	testtop [ebp + 32], [ebp + 36], mont // u -> eax, x -> ebx; mont mode also sets esi and edx
	1212	call mmul4
	1213	testtail [ebp + 52], [ebp + 48] // cycle log, repeat count
	1214	mov edi, [ebp + 28] // -> caller's 32-byte result buffer
	1215	movdqa xmm0, [esp + 76] // copy out the area edx pointed at during the calls
	1216	movdqa xmm1, [esp + 92]
	1217	movdqu [edi], xmm0
	1218	movdqu [edi + 16], xmm1
	1219	testcarryout [ebp + 24] // carry block out
	1220	testepilogue
	1221	ENDFUNC
	1222
	1223	FUNC(test_mmla4) // timing harness: calls mmla4 repeatedly; no carry block is loaded beforehand
	1224	testprologue
	1225	testexpand [ebp + 40], [ebp + 44] // v and y, expanded into the frame
	1226	mov edi, [ebp + 20] // first argument, handed to mmla4 in edi
	1227	testtop [ebp + 32], [ebp + 36], mont // u -> eax, x -> ebx; mont mode also sets esi and edx
	1228	call mmla4
	1229	testtail [ebp + 52], [ebp + 48] // cycle log, repeat count
	1230	mov edi, [ebp + 28] // -> caller's 32-byte result buffer
	1231	movdqa xmm0, [esp + 76] // copy out the area edx pointed at during the calls
	1232	movdqa xmm1, [esp + 92]
	1233	movdqu [edi], xmm0
	1234	movdqu [edi + 16], xmm1
	1235	testcarryout [ebp + 24] // carry block out
	1236	testepilogue
	1237	ENDFUNC
	1238
	1239	FUNC(test_mont4) // timing harness: calls mont4 repeatedly; no carry block is loaded beforehand
	1240	testprologue
	1241	testexpand nil, [ebp + 36] // only y is expanded
	1242	mov edi, [ebp + 20] // first argument, handed to mont4 in edi
	1243	testtop nil, [ebp + 32], mont // x -> ebx; mont mode also sets esi and edx
	1244	call mont4
	1245	testtail [ebp + 44], [ebp + 40] // cycle log, repeat count
	1246	mov edi, [ebp + 28] // -> caller's 32-byte result buffer
	1247	movdqa xmm0, [esp + 76] // copy out the area edx pointed at during the calls
	1248	movdqa xmm1, [esp + 92]
	1249	movdqu [edi], xmm0
	1250	movdqu [edi + 16], xmm1
	1251	testcarryout [ebp + 24] // carry block out
	1252	testepilogue
	1253	ENDFUNC
	1254
	1255	#endif
	1256
	1257	///----- That's all, folks --------------------------------------------------