chiark - git - mdw - catacomb/blame - math/mpx-mul4-arm-neon.S

Commit	Line	Data
ea1b3cec MW	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Large SIMD-based multiplications
	4	///
	5	/// (c) 2019 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software: you can redistribute it and/or modify it
	13	/// under the terms of the GNU Library General Public License as published
	14	/// by the Free Software Foundation; either version 2 of the License, or
	15	/// (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful, but
	18	/// WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	/// Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb. If not, write to the Free Software
	24	/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	/// USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.arch armv7-a
	34	.fpu neon
	35
	36	.text
	37
	38	///--------------------------------------------------------------------------
	39	/// Theory.
	40	///
	41	/// We define a number of primitive fixed-size multipliers from which we can
	42	/// construct more general variable-length multipliers.
	43	///
	44	/// The basic trick is the same throughout. In an operand-scanning
	45	/// multiplication, the inner multiplication loop multiplies a multiple-
	46	/// precision operand by a single precision factor, and adds the result,
	47	/// appropriately shifted, to the result. A `finely integrated operand
	48	/// scanning' implementation of Montgomery multiplication also adds the
	49	/// product of a single-precision `Montgomery factor' and the modulus,
	50	/// calculated in the same pass. The more common `coarsely integrated
	51	/// operand scanning' alternates main multiplication and Montgomery passes,
	52	/// which requires additional carry propagation.
	53	///
	54	/// Throughout both plain-multiplication and Montgomery stages, then, one of
	55	/// the factors remains constant throughout the operation, so we can afford
	56	/// to take a little time to preprocess it. The transformation we perform is
	57	/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
	58	/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
	59	/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
	60	/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
	61	/// operands, as follows.
	62	///
981a9e5d MW	63	/// Offset 12 8 4 0
	64	/// 0 v''_1 v'_1 v''_0 v'_0
	65	/// 16 v''_3 v'_3 v''_2 v'_2
ea1b3cec MW	66	///
	67	/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
	68	/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
	69	/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
	70	/// results in 64-bit fields. The sixteen bits of headroom allows us to add
	71	/// many products together before we must deal with carrying; it also allows
	72	/// for some calculations to be performed on the above expanded form.
	73	///
	74	/// We maintain three `carry' registers, q13--q15, accumulating intermediate
	75	/// results; we name them `c0', `c1', and `c2'. Each carry register holds
	76	/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
	77	/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
	78	/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
	79	/// The `vmull' or `vmlal' instruction acting on a scalar operand and an
	80	/// operand in the expanded form above produces a result which can be added
	81	/// directly to the appropriate carry register.
	82	///
	83	/// Multiplication is performed in product-scanning order, since ARM
	84	/// processors commonly implement result forwarding for consecutive multiply-
	85	/// and-accumulate instructions specifying the same destination.
	86	/// Experimentally, this runs faster than operand-scanning in an attempt to
	87	/// hide instruction latencies.
	88	///
	89	/// On 32-bit ARM, we have a reasonable number of registers: the expanded
	90	/// operands are kept in registers. The packed operands are read from memory
	91	/// into working registers q0 and q1. The following conventional argument
	92	/// names and locations are used throughout.
	93	///
	94	/// Arg Format Location Notes
	95	///
	96	/// U packed [r1]
	97	/// X packed [r2] In Montgomery multiplication, X = N
	98	/// V expanded q2/q3
	99	/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
	100	/// M expanded q4/q5 -N^{-1} (mod B^4)
	101	/// N Modulus, for Montgomery multiplication
	102	/// A packed [r0] Destination/accumulator
	103	/// C carry q13--q15
	104	///
	105	/// The calculation is some variant of
	106	///
	107	/// A' + C' B^4 <- U V + X Y + A + C
	108	///
	109	/// The low-level functions fit into a fairly traditional (finely-integrated)
	110	/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
	111	/// (indexed by i).
	112	///
	113	/// The variants are as follows.
	114	///
	115	/// Function Variant Use i j
	116	///
	117	/// mmul4 A = C = 0 Montgomery 0 0
	118	/// dmul4 A = 0 Montgomery 0 +
	119	/// mmla4 C = 0 Montgomery + 0
	120	/// dmla4 exactly as shown Montgomery + +
	121	///
	122	/// mul4zc U = V = A = C = 0 Plain 0 0
	123	/// mul4 U = V = A = 0 Plain 0 +
	124	/// mla4zc U = V = C = 0 Plain + 0
	125	/// mla4 U = V = 0 Plain + +
	126	///
	127	/// The `mmul4' and `mmla4' functions are also responsible for calculating
	128	/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
	129	/// inner loop.
130
131	///--------------------------------------------------------------------------
132	/// Macro definitions.
133
134	.macro mulacc z, u, v, x=nil, y=nil
135	// Set Z = Z + U V + X Y. X may be `nil' to omit the second
136	// product, in which case Y is not examined. Z should be a 128-bit
137	// `qN' register; V and Y should be 64-bit `dN' registers; and U and
138	// X should be 32-bit `dN[I]' scalars; the multiplications produce
139	// two 64-bit elementwise products, which are added elementwise to Z.
140
141	vmlal.u32 \z, \v, \u // Z += V * U (vector V by scalar U)
142	.ifnes "\x", "nil"
143	vmlal.u32 \z, \y, \x // Z += Y * X, only if X was supplied
144	.endif
145	.endm
146
147	.macro mulinit z, zinitp, u, v, x, y
148	// If ZINITP is `t' then set Z = Z + U V + X Y, as for `mulacc';
149	// otherwise, set Z = U V + X Y, discarding the old contents of Z.
150	// Operand requirements and detailed operation are as for `mulacc'.
	// Unlike `mulacc', X and Y have no defaults here: pass `nil' for X
	// explicitly to omit the second product.
151
152	.ifeqs "\zinitp", "t"
153	mulacc \z, \u, \v, \x, \y
154	.else
155	vmull.u32 \z, \v, \u // Z = V * U: overwrite rather than accumulate
156	.ifnes "\x", "nil"
157	vmlal.u32 \z, \y, \x
158	.endif
159	.endif
160	.endm
161
162	// `MULI': accumulate the B^I and b B^I terms of the polynomial product sum U
163	// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
164	// X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
165	// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
166	// added into the low and high halves of the 128-bit register Z (if ZINITP is
167	// not `t' then simply set Z, as if it were initially zero).
	//
	// U and X name the `qN' registers holding the packed operands; V0/V1 and
	// Y0/Y1 name the `qN' register pairs holding the expanded operands. The
	// callers in this file pass `nil' for X, Y0 and Y1 to leave out the X Y
	// product. Each MULI pairs word u_j with the half-register holding
	// v_{I-j}, for every j such that both j and I - j lie in 0..3.
168	#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
169	mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)
170	#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
171	mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
172	mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)
173	#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
174	mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
175	mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
176	mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)
177	#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
178	mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
179	mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
180	mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
181	mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)
182	#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
183	mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
184	mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
185	mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)
186	#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
187	mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
188	mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)
189	#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
190	mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
191
192	// Steps in the process of propagating carry bits from ZLO to ZHI (both
193	// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
194	// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
195	// coefficient z_i.
196	//
197	// In detail, what happens is as follows. Suppose initially that ZLO =
198	// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
199	// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
200	// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
201	// a circuit depth of 4; I don't know how to do better.
202	.macro _carry0 zout, zlo0, zlo1, t0, t1
203	// ZLO0 and ZLO1 are the low and high halves of a carry register.
204	// Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
205	// so as to continue processing using `_carry1'. All operands are
206	// 64-bit `dN' registers. If ZOUT is `nil' then no output is
207	// produced; if T1 is `nil' then no further processing will be
208	// possible.
	//
	// T0 is scratch, and is written only when ZOUT is not `nil'. ZOUT
	// may be the same register as ZLO0: T1 has already read ZLO0 by the
	// time ZOUT is written.
209	.ifnes "\zout", "nil"
210	vshl.i64 \t0, \zlo1, #16 // t0 = b z''_i
211	.endif
212	.ifnes "\t1", "nil"
213	vshr.u64 \t1, \zlo0, #16 // t1 = floor(z'_i/b)
214	.endif
215	.ifnes "\zout", "nil"
216	vadd.i64 \zout, \zlo0, \t0 // zout = t; its low 32 bits are z_i
217	.endif
218	.ifnes "\t1", "nil"
219	vadd.i64 \t1, \t1, \zlo1 // t1 = floor(t/b)
220	.endif
221	.endm
222	.macro _carry1 u, zhi0, t1
223	// ZHI0 is the low half of a carry register, and T1 is the result of
224	// applying `_carry0' to the previous carry register. Set U to the
225	// result of propagating the carry into ZHI0.
	// T1 is overwritten in the process. U may be the same register as
	// ZHI0, which updates the carry register in place.
226	vshr.u64 \t1, \t1, #16 // t1 = floor(t/B), the carry out
227	vadd.i64 \u, \zhi0, \t1 // u = z'_{i+1} + carry
228	.endm
229
230	// More convenient wrappers for `_carry0' and `_carry1'.
231	//
232	// CARRY0(ZOUT, ZLO, T)
233	// Store a 32-bit output in ZOUT from carry ZLO, using T as a
234	// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
235	// `qN' registers.
	// ZOUT may be `nil' to discard the output word. The high half
	// of T is left holding the state which `CARRY1' consumes, so T
	// must not be disturbed between the two.
236	//
237	// CARRY1(ZHI, T)
238	// Propagate carry from T into ZHI. Both are 128-bit `qN' registers;
239	// ZHI is updated.
	// Only the low half of ZHI is written.
240	#define CARRY0(zout, zlo, t) \
241	CASIDE0(zout, D0(zlo), zlo, t)
242	#define CARRY1(zhi, t) \
243	CASIDE1(D0(zhi), zhi, t)
244
245	// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
	// They let a carry chain be evaluated `on the side', in U and T, while the
	// carry registers themselves keep their unpropagated values.
246	//
247	// CASIDE0(ZOUT, U, ZLO, T)
248	// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
249	// `dN' register).
	// Only the high half of ZLO itself is read.
250	//
251	// CASIDE0E(ZOUT, U, ZLO, T)
252	// As for `CASIDE0', but only calculate the output word, and no
253	// carry-out.
	// Use this for the last word of a chain: the high half of T is
	// left alone.
254	//
255	// CASIDE1(U, ZHI, T)
256	// As for `CARRY1', but write the updated low half of ZHI to U.
257	#define CASIDE0(zout, u, zlo, t) \
258	_carry0 zout, u, D1(zlo), D0(t), D1(t)
259	#define CASIDE0E(zout, u, zlo, t) \
260	_carry0 zout, u, D1(zlo), D0(t), nil
261	#define CASIDE1(u, zhi, t) \
262	_carry1 u, D0(zhi), D1(t)
263
264	// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
265	// carry format.
	//
	// SPREADACC0 only clears A1, A2 and A3; it does not touch A0. SPREADACC1
	// then moves word I of A0 into the low 32 bits of the low half of AI,
	// for I = 0, ..., 3; everything else in the four registers ends up zero.
	// SPREADACC1 relies on SPREADACC0 having been applied to the same
	// registers first.
266	#define SPREADACC0(a0, a1, a2, a3) \
267	vmov.i32 a1, #0; \
268	vmov.i32 a2, #0; \
269	vmov.i32 a3, #0
270	#define SPREADACC1(a0, a1, a2, a3) \
271	vswp D1(a0), D0(a2); \
272	vtrn.32 D0(a0), D0(a1); \
273	vtrn.32 D0(a2), D0(a3)
274
275	// Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
276	// C2 (leaving A3 where it is).
	// All of these are 128-bit `qN' registers; the additions are done on each
	// 64-bit half independently. A3 is accepted but not used by the expansion.
277	#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
278	vadd.i64 c0, c0, a0; \
279	vadd.i64 c1, c1, a1; \
280	vadd.i64 c2, c2, a2
281
282	///--------------------------------------------------------------------------
283	/// Primitive multipliers and related utilities.
284
285	INTFUNC(carryprop)
286	// On entry, r0 points to a destination, and q13--q15 hold incoming
287	// carries c0--c2. On exit, the low 128 bits of the carry value are
288	// stored at [r0]; the remaining 16 bits of carry are left in d30; r0
289	// is advanced by 16; and q10--q14 are clobbered.
	//
	// Only NEON instructions and the return are used here, so the ARM
	// condition flags pass through unchanged; callers in this file rely
	// on that to branch on a comparison made before the call.
290	endprologue
291
292	CARRY0(D0(q10), q13, q12) // output word 0, from c0
293	CARRY1(q14, q12)
294	CARRY0(D0(q11), q14, q12) // output word 1, from c1
295	CARRY1(q15, q12)
296	CARRY0(D1(q10), q15, q12) // output word 2, from c2
297	vshr.u64 D1(q11), D1(q12), #16 // output word 3: the carry out of c2
298	vshr.u64 D0(q15), D1(q12), #48 // whatever is left over stays in q15
299	vtrn.32 q10, q11 // gather words 0, 1, 2, 3 into q10
300	vst1.32 {q10}, [r0]!
301	bx r14
302	ENDFUNC
303
304	INTFUNC(dmul4)
305	// On entry, r0 points to the destination; r1 and r2 point to packed
306	// operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
307	// and q13--q15 hold incoming carries c0--c2. On exit, the
308	// destination and carries are updated; r0, r1, r2 are each advanced
309	// by 16; q2--q5 are preserved; and the other NEON registers are
310	// clobbered.
	//
	// The four output words are collected in q8 and q9 (words 0 and 2
	// in q8, words 1 and 3 in q9) and interleaved just before the store.
	// Carry propagation for each column is interleaved with the
	// multiplications for the next one.
311	endprologue
312
313	// Start by loading the operand words from memory.
314	vld1.32 {q0}, [r1]!
315	vld1.32 {q1}, [r2]!
316
317	// Do the multiplication.
318	MUL0(q13, t, q0, q2, q3, q1, q4, q5)
319	MUL1(q14, t, q0, q2, q3, q1, q4, q5)
320	CARRY0(D0(q8), q13, q6) // output word 0
321	MUL2(q15, t, q0, q2, q3, q1, q4, q5)
322	CARRY1(q14, q6)
323	CARRY0(D0(q9), q14, q6) // output word 1
324	MUL3(q12, nil, q0, q2, q3, q1, q4, q5) // no incoming carry: start q12 afresh
325	CARRY1(q15, q6)
326	CARRY0(D1(q8), q15, q6) // output word 2
327	MUL4(q13, nil, q0, q2, q3, q1, q4, q5) // q13--q15 now become the outgoing carries
328	CARRY1(q12, q6)
329	CARRY0(D1(q9), q12, q6) // output word 3
330	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
331	CARRY1(q13, q6)
332	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
333
334	// Finish up and store the result.
335	vtrn.32 q8, q9
336	vst1.32 {q8}, [r0]!
337
338	// All done.
339	bx r14
340	ENDFUNC
341
342	INTFUNC(dmla4)
343	// On entry, r0 points to the destination/accumulator; r1 and r2
344	// point to packed operands U and X; q2/q3 and q4/q5 hold expanded
345	// operands V and Y; and q13--q15 hold incoming carries c0--c2. On
346	// exit, the accumulator and carries are updated; r0, r1, r2 are each
347	// advanced by 16; q2--q5 are preserved; and the other NEON registers
348	// are clobbered.
	//
	// This differs from `dmul4' only in that the four words at [r0] are
	// first added in: words 0--2 are added to the incoming carries, and
	// word 3 waits in q12 for MUL3 to accumulate onto it.
349	endprologue
350
351	// Start by loading the operand words from memory.
352	vld1.32 {q9}, [r0] // no writeback: r0 is advanced by the final store
353	SPREADACC0(q9, q10, q11, q12)
354	vld1.32 {q0}, [r1]!
355	vld1.32 {q1}, [r2]!
356
357	// Add the accumulator input to the incoming carries. Split the
358	// accumulator into four pieces and add the carries onto them.
359	SPREADACC1(q9, q10, q11, q12)
360	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
361
362	// Do the multiplication.
363	MUL0(q13, t, q0, q2, q3, q1, q4, q5)
364	MUL1(q14, t, q0, q2, q3, q1, q4, q5)
365	CARRY0(D0(q8), q13, q6) // output word 0
366	MUL2(q15, t, q0, q2, q3, q1, q4, q5)
367	CARRY1(q14, q6)
368	CARRY0(D0(q9), q14, q6) // output word 1; q9 is free again by now
369	MUL3(q12, t, q0, q2, q3, q1, q4, q5) // accumulate onto accumulator word 3
370	CARRY1(q15, q6)
371	CARRY0(D1(q8), q15, q6) // output word 2
372	MUL4(q13, nil, q0, q2, q3, q1, q4, q5) // q13--q15 now become the outgoing carries
373	CARRY1(q12, q6)
374	CARRY0(D1(q9), q12, q6) // output word 3
375	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
376	CARRY1(q13, q6)
377	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
378
379	// Finish up and store the result.
380	vtrn.32 q8, q9
381	vst1.32 {q8}, [r0]!
382
383	// All done.
384	bx r14
385	ENDFUNC
386
387	INTFUNC(mul4)
388	// On entry, r0 points to the destination; r2 points to a packed
389	// operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
390	// incoming carries c0--c2. On exit, the destination and carries are
391	// updated; r0 and r2 are each advanced by 16; q4 and q5 are
392	// preserved; and the other NEON registers are clobbered.
	//
	// This is the single-product form of `dmul4': only X Y is computed,
	// so r1 and q2/q3 are not used.
393	endprologue
394
395	// Start by loading the operand words from memory.
396	vld1.32 {q1}, [r2]!
397
398	// Do the multiplication.
399	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
400	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
401	CARRY0(D0(q8), q13, q6) // output word 0
402	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
403	CARRY1(q14, q6)
404	CARRY0(D0(q9), q14, q6) // output word 1
405	MUL3(q12, nil, q1, q4, q5, nil, nil, nil) // no incoming carry: start q12 afresh
406	CARRY1(q15, q6)
407	CARRY0(D1(q8), q15, q6) // output word 2
408	MUL4(q13, nil, q1, q4, q5, nil, nil, nil) // q13--q15 now become the outgoing carries
409	CARRY1(q12, q6)
410	CARRY0(D1(q9), q12, q6) // output word 3
411	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
412	CARRY1(q13, q6)
413	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
414
415	// Finish up and store the result.
416	vtrn.32 q8, q9
417	vst1.32 {q8}, [r0]!
418
419	// All done.
420	bx r14
421	ENDFUNC
422
423	INTFUNC(mul4zc)
424	// On entry, r0 points to the destination; r2 points to a packed
425	// operand X; and q4/q5 holds an expanded operand Y. On exit, the
426	// destination is updated; q13--q15 hold outgoing carries c0--c2; r0
427	// and r2 are each advanced by 16; q4 and q5 are preserved; and the
428	// other NEON registers are clobbered.
	//
	// This is `mul4' with zero incoming carries: every column is
	// initialized by its first multiplication rather than accumulated
	// onto, so q13--q15 need not hold anything meaningful on entry.
429	endprologue
430
431	// Start by loading the operand words from memory.
432	vld1.32 {q1}, [r2]!
433
434	// Do the multiplication.
435	MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
436	MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
437	CARRY0(D0(q8), q13, q6) // output word 0
438	MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
439	CARRY1(q14, q6)
440	CARRY0(D0(q9), q14, q6) // output word 1
441	MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
442	CARRY1(q15, q6)
443	CARRY0(D1(q8), q15, q6) // output word 2
444	MUL4(q13, nil, q1, q4, q5, nil, nil, nil) // q13--q15 now become the outgoing carries
445	CARRY1(q12, q6)
446	CARRY0(D1(q9), q12, q6) // output word 3
447	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
448	CARRY1(q13, q6)
449	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
450
451	// Finish up and store the result.
452	vtrn.32 q8, q9
453	vst1.32 {q8}, [r0]!
454
455	// All done.
456	bx r14
457	ENDFUNC
458
459	INTFUNC(mla4)
460	// On entry, r0 points to the destination/accumulator; r2 points to a
461	// packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
462	// hold incoming carries c0--c2. On exit, the accumulator and
463	// carries are updated; r0 and r2 are each advanced by 16; q4 and q5
464	// are preserved; and the other NEON registers are clobbered.
	//
	// The body from `mla4_common' onwards is shared with `mla4zc',
	// which branches there and so returns through the `bx r14' below.
465	endprologue
466
467	// Start by loading the operand words from memory.
468	vld1.32 {q9}, [r0] // no writeback: r0 is advanced by the final store
469	SPREADACC0(q9, q10, q11, q12)
470	vld1.32 {q1}, [r2]!
471
472	// Add the accumulator input to the incoming carries. Split the
473	// accumulator into four pieces and add the carries onto them.
474	SPREADACC1(q9, q10, q11, q12)
475	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
476
477	// Do the multiplication.
	// At `mla4_common', q13--q15 hold accumulator words 0--2 plus any
	// incoming carries, q12 holds accumulator word 3, and q1 holds X.
478	mla4_common:
479	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
480	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
481	CARRY0(D0(q8), q13, q6) // output word 0
482	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
483	CARRY1(q14, q6)
484	CARRY0(D0(q9), q14, q6) // output word 1
485	MUL3(q12, t, q1, q4, q5, nil, nil, nil) // accumulate onto accumulator word 3
486	CARRY1(q15, q6)
487	CARRY0(D1(q8), q15, q6) // output word 2
488	MUL4(q13, nil, q1, q4, q5, nil, nil, nil) // q13--q15 now become the outgoing carries
489	CARRY1(q12, q6)
490	CARRY0(D1(q9), q12, q6) // output word 3
491	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
492	CARRY1(q13, q6)
493	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
494
495	// Finish up and store the result.
496	vtrn.32 q8, q9
497	vst1.32 {q8}, [r0]!
498
499	// All done.
500	bx r14
501	ENDFUNC
502
503	INTFUNC(mla4zc)
504	// On entry, r0 points to the destination/accumulator; r2 points to a
505	// packed operand X; and q4/q5 holds an expanded operand Y. On exit,
506	// the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
507	// r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
508	// the other NEON registers are clobbered.
	//
	// With no incoming carries, the accumulator is spread directly into
	// the carry registers instead of being added to them. The rest of
	// the work, and the return, happen in `mla4' from `mla4_common'.
509	endprologue
510
511	// Start by loading the operand words from memory.
512	vld1.32 {q13}, [r0] // no writeback: r0 is advanced by the final store
513	SPREADACC0(q13, q14, q15, q12)
514	vld1.32 {q1}, [r2]!
515
516	// Move the accumulator input to the incoming carry slots. Split the
517	// accumulator into four pieces.
518	SPREADACC1(q13, q14, q15, q12)
519
520	b mla4_common
521	ENDFUNC
522
523	INTFUNC(mmul4)
524	// On entry, r0 points to the destination; r1 points to a packed
525	// operand U; r2 points to a packed operand X (the modulus); q2/q3
526	// holds an expanded operand V; and q4/q5 holds an expanded operand M
527	// (the Montgomery factor -N^{-1} (mod B^4)). On exit, the destination
528	// is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
529	// B^4); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
530	// advanced by 16; q2 and q3 are preserved; and the other NEON
531	// registers are clobbered.
	//
	// Only the first half of the work is done here: the code branches
	// into `mmla4' at `mmla4_common', which finishes the job and
	// returns to our caller.
	endprologue
532
533	// Start by loading the operand words from memory.
534	vld1.32 {q0}, [r1]!
535
536	// Calculate the low half of W = U V (there is no accumulator input
537	// here, so A = 0), being careful to leave the carries in place.
538	MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
539	MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
540	CARRY0(D0(q6), q13, q8) // w_0; q13 itself is only read
541	MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
542	CASIDE1(D0(q9), q14, q8) // propagate into a copy, not into q14
543	CASIDE0(D0(q7), D0(q9), q14, q8) // w_1
544	MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
545	b mmla4_common
546	ENDFUNC
547
548	INTFUNC(mmla4)
549	// On entry, r0 points to the destination/accumulator A; r1 points to
550	// a packed operand U; r2 points to a packed operand X (the modulus);
551	// q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
552	// operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the
553	// accumulator is updated (to zero); q4/q5 hold an expanded factor Y
554	// = (A + U V) M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0,
555	// r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
556	// the other NEON registers are clobbered.
	//
	// The body from `mmla4_common' onwards is shared with `mmul4', which
	// branches there and so returns through the `bx r14' below.
557	endprologue
558
559	// Start by loading the operand words from memory.
560	vld1.32 {q13}, [r0] // no writeback: r0 is advanced by the final store
561	SPREADACC0(q13, q14, q15, q12)
562	vld1.32 {q0}, [r1]!
563
564	// Move the accumulator input to the incoming carry slots. Split the
565	// accumulator into four pieces.
566	SPREADACC1(q13, q14, q15, q12)
567
568	// Calculate the low half of W = A + U V, being careful to leave the
569	// carries in place.
570	MUL0(q13, t, q0, q2, q3, nil, nil, nil)
571	MUL1(q14, t, q0, q2, q3, nil, nil, nil)
572	CARRY0(D0(q6), q13, q8) // w_0; q13 itself is only read
573	MUL2(q15, t, q0, q2, q3, nil, nil, nil)
574	CASIDE1(D0(q9), q14, q8) // propagate into a copy, not into q14
575	CASIDE0(D0(q7), D0(q9), q14, q8) // w_1
576	MUL3(q12, t, q0, q2, q3, nil, nil, nil)
	// At `mmla4_common', q13, q14, q15 and q12 hold the unpropagated
	// columns 0--3 of W; w_0 and w_1 are in the low halves of q6 and q7;
	// q8 holds the side-chain carry state; and q0 holds U.
577	mmla4_common:
578	CASIDE1(D0(q9), q15, q8)
579	CASIDE0(D1(q6), D0(q9), q15, q8) // w_2
580	CASIDE1(D0(q9), q12, q8)
581	CASIDE0E(D1(q7), D0(q9), q12, q8) // w_3; no carry out is needed
582	vtrn.32 q6, q7 // q6 = packed low half of W
583
584	// Calculate the low half of the Montgomery factor Y = W M. At this
585	// point, registers are a little tight.
586	MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
587	MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
588	CARRY0(D0(q8), q8, q1) // q1 is free as a temporary until X is loaded
589	MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
590	CARRY1(q9, q1)
591	CARRY0(D0(q9), q9, q1)
592	MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
593	CARRY1(q10, q1)
594	CARRY0(D1(q8), q10, q1)
595	vmov.i32 q5, #0 // M is finished with: q5 becomes the zero half for expanding Y
596	CARRY1(q11, q1)
597	CARRY0(D1(q9), q11, q1)
598	vld1.32 {q1}, [r2]! // = X
599	vtrn.32 q8, q9 // q8 = packed Y
600
601	// Expand Y. We'll put it in its proper place a bit later.
602	vzip.16 q8, q5
603
604	// Build up the product X Y in the carry slots.
	// The output words are not kept (CARRY0 is given `nil'): what
	// gets stored at [r0] below is the zero block in q6.
605	MUL0(q13, t, q1, q8, q5, nil, nil, nil)
606	MUL1(q14, t, q1, q8, q5, nil, nil, nil)
607	CARRY0(nil, q13, q9)
608	MUL2(q15, t, q1, q8, q5, nil, nil, nil)
609	CARRY1(q14, q9)
610	vmov q4, q8 // low half of expanded Y, now in its proper place
611	CARRY0(nil, q14, q9)
612	MUL3(q12, t, q1, q8, q5, nil, nil, nil)
613	CARRY1(q15, q9)
614	CARRY0(nil, q15, q9)
615	vmov.u32 q6, #0
616
617	// And complete the calculation.
	// From here on both products are needed: U V from q0 and q2/q3, and
	// X Y from q1 and q4/q5.
618	MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
619	CARRY1(q12, q9)
620	CARRY0(nil, q12, q9)
621	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
622	CARRY1(q13, q9)
623	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
624
625	// Finish up and store the result.
626	vst1.32 {q6}, [r0]!
627
628	// All done.
629	bx r14
630	ENDFUNC
631
632	INTFUNC(mont4)
633	// On entry, r0 points to the destination/accumulator A; r2 points to
634	// a packed operand X (the modulus); and q2/q3 holds an expanded
635	// operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the
636	// accumulator is updated (to zero); q4/q5 hold an expanded factor Y
637	// = A M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0 and r2
638	// are each advanced by 16; q2 and q3 are preserved; and the other
639	// NEON registers are clobbered.
640	endprologue
641
642	// Start by loading the operand words from memory.
643	vld1.32 {q0}, [r0] // = A; no writeback: r0 is advanced by the final store
644	vld1.32 {q1}, [r2]! // = X
645
646	// Calculate Y = A M (mod B).
	// Spreading a copy of A into the carry registers q13--q15 and q12
	// is interleaved with this, ready for the second stage.
647	MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
648	MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
649	CARRY0(D0(q4), q8, q6) // Y word 0
650	MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
651	CARRY1(q9, q6)
652	vmov q13, q0 // copy of A, to be spread below
653	CARRY0(D0(q7), q9, q6) // Y word 1
654	MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
655	CARRY1(q10, q6)
656	CARRY0(D1(q4), q10, q6) // Y word 2
657	SPREADACC0(q13, q14, q15, q12)
658	CARRY1(q11, q6)
659	CARRY0(D1(q7), q11, q6) // Y word 3
660	SPREADACC1(q13, q14, q15, q12)
661	vmov.i32 q5, #0
662	vtrn.32 q4, q7 // q4 = packed Y
663	vzip.16 q4, q5 // expand Y into q4/q5
664
665	// Calculate the actual result. Well, the carries, at least.
	// The output words are not kept (CARRY0 is given `nil'): what
	// gets stored at [r0] below is the zero block in q8.
666	vmov.i32 q8, #0
667	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
668	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
669	CARRY0(nil, q13, q6)
670	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
671	CARRY1(q14, q6)
672	CARRY0(nil, q14, q6)
673	MUL3(q12, t, q1, q4, q5, nil, nil, nil)
674	CARRY1(q15, q6)
675	CARRY0(nil, q15, q6)
676	MUL4(q13, nil, q1, q4, q5, nil, nil, nil) // q13--q15 now become the outgoing carries
677	CARRY1(q12, q6)
678	CARRY0(nil, q12, q6)
679	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
680	CARRY1(q13, q6)
681	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
682
683	// Finish up and store the result.
684	vst1.32 {q8}, [r0]!
685
686	// All done.
687	bx r14
688	ENDFUNC
689
690	///--------------------------------------------------------------------------
691	/// Bulk multipliers.
692
693	FUNC(mpx_umul4_arm_neon)
694	// void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
695	// const mpw *bv, const mpw *bvl);
	//
	// Both operands are consumed in blocks of four words (16 bytes).
	// NOTE(review): presumably the caller guarantees that both lengths
	// are nonzero multiples of four words -- confirm against the C side.
696
697	// Establish the arguments and do initial setup.
698	//
699	// inner loop dv r0
700	// inner loop av r2
701	// outer loop dv r5
702	// outer loop bv r3
703	// av base r1
704	// av limit r12
705	// bv limit r4
706	pushreg r4, r5, r14
707	pushvfp QQ(q4, q7)
708	endprologue
709
710	// Prepare for the first iteration.
711	vld1.32 {q4}, [r3]! // = Y = bv[0]
712	vmov.i32 q5, #0
713	// r0 // = dv for inner loop
714	// r1 // = av base
715	// r3 // = bv for outer loop
	// The fifth argument is on the stack, above the three core
	// registers (12 bytes) and q4--q7 (64 bytes) saved above.
716	ldr r4, [sp, #76] // = bv limit
717	mov r12, r2 // = av limit
718	mov r2, r1 // = av for inner loop
719	add r5, r0, #16 // = dv for outer loop
720	vzip.16 q4, q5 // expand Y
721	bl mul4zc
722	cmp r2, r12 // all done?
723	bhs 8f
724
725	// Continue with the first iteration.
726	0: bl mul4
727	cmp r2, r12 // all done?
728	blo 0b
729
730	// Write out the leftover carry. There can be no tail here.
731	8: bl carryprop
732	cmp r3, r4 // more passes to do?
733	bhs 9f
734
735	// Set up for the next pass.
736	1: vld1.32 {q4}, [r3]! // = Y = bv[i]
737	vmov.i32 q5, #0
738	mov r0, r5 // -> dv[i]
739	mov r2, r1 // -> av[0]
740	add r5, r5, #16
741	vzip.16 q4, q5 // expand Y
742	bl mla4zc
743	cmp r2, r12 // done yet?
744	bhs 8f
745
746	// Continue...
747	0: bl mla4
748	cmp r2, r12
749	blo 0b
750
751	// Finish off this pass. There was no tail on the previous pass, and
752	// there can be none on this pass.
753	8: bl carryprop
754	cmp r3, r4
755	blo 1b
756
757	// All over.
758	9: popvfp QQ(q4, q7)
759	popreg r4, r5, r14
760	bx r14
761	ENDFUNC
762
763	FUNC(mpxmont_mul4_arm_neon)
764	// void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
765	// const mpw *nv, size_t n, const mpw *mi);
	//
	// Here n is a count of words: the av and bv limits are formed as
	// base + 4 n bytes. The operands are consumed in blocks of four
	// words, and four words of mi are read.
766
767	// Establish the arguments and do initial setup.
768	//
769	// inner loop dv r0
770	// inner loop av r1
771	// inner loop nv r2
772	// mi r5
773	// outer loop dv r6
774	// outer loop bv r7
775	// av base r8
776	// av limit r9
777	// bv limit r4
778	// nv base r3
779	// n r4
780	// c r10
781	// 0 r12
782
783	pushreg r4-r10, r14
784	pushvfp QQ(q4, q7)
785	endprologue
786
787	// Establish the expanded operands.
	// The fifth and sixth arguments are on the stack, above the eight
	// core registers (32 bytes) and q4--q7 (64 bytes) saved above.
788	ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
789	vld1.32 {q2}, [r2] // = V = bv[0]
790	vmov.i32 q3, #0
791	vmov.i32 q5, #0
792	vld1.32 {q4}, [r5] // = M
793
794	// Set up the outer loop state and prepare for the first iteration.
795	// r0 // -> dv for inner loop
796	// r1 // -> av for inner loop
797	add r7, r2, #16 // -> bv
798	// r3 // -> nv
799	add r6, r0, #16 // -> dv
800	mov r8, r1 // -> av
801	add r9, r1, r4, lsl #2 // -> av limit
802	add r4, r2, r4, lsl #2 // -> bv limit
803	mov r2, r3 // -> nv for inner loop
804	mov r12, #0 // = 0
805
806	vzip.16 q2, q3 // expand V
807	vzip.16 q4, q5 // expand M
808	bl mmul4
809	cmp r1, r9 // done already?
810	bhs 8f
811
812	// Complete the first inner loop.
813	0: bl dmul4
814	cmp r1, r9 // done yet?
815	blo 0b
816
817	// Still have carries left to propagate. Rather than store the tail
818	// end in memory, keep it in a general-purpose register for later.
819	bl carryprop
820	vmov.32 r10, QW(q15, 0)
821
822	// Embark on the next iteration. (There must be one. If n = 1 then
823	// we would have bailed above, to label 8. Similarly, the subsequent
824	// iterations can fall into the inner loop immediately.)
	// M has to be reloaded and expanded afresh on every pass, because
	// `mmul4' and `mmla4' replace q4/q5 with the factor Y.
825	1: vld1.32 {q2}, [r7]! // = V = bv[i]
826	vld1.32 {q4}, [r5] // = M
827	vmov.i32 q3, #0
828	vmov.i32 q5, #0
829	mov r0, r6 // -> dv[i]
830	add r6, r6, #16
831	mov r1, r8 // -> av[0]
832	mov r2, r3 // -> nv[0]
833	vzip.16 q2, q3 // expand V
834	vzip.16 q4, q5 // expand M
835	bl mmla4
836
837	// Complete the next inner loop.
838	0: bl dmla4
839	cmp r1, r9 // done yet?
840	blo 0b
841
842	// Still have carries left to propagate, and they overlap the
843	// previous iteration's final tail, so read that and add it.
	// The comparison is made early: its flags are consumed by the
	// `blo 1b' below, and nothing in between (NEON arithmetic, the call
	// to `carryprop', the transfer to r10) changes the condition flags.
844	cmp r7, r4
845	vmov.32 D0(q12), r10, r12
846	vadd.i64 D0(q13), D0(q13), D0(q12)
847	bl carryprop
848	vmov.32 r10, QW(q15, 0)
849
850	// Back again, maybe.
851	blo 1b
852
853	// All done, almost.
854	str r10, [r0], #4
855	popvfp QQ(q4, q7)
856	popreg r4-r10, r14
857	bx r14
858
859	// First iteration was short. Write out the carries and we're done.
860	// (This could be folded into the main loop structure, but that would
861	// penalize small numbers more.)
862	8: bl carryprop
863	vst1.32 {QW(q15, 0)}, [r0]!
864	popvfp QQ(q4, q7)
865	popreg r4-r10, r14
866	bx r14
867	ENDFUNC
868
869	FUNC(mpxmont_redc4_arm_neon)
870	// void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
871	// size_t n, const mpw *mi);
	//
	// Here n is a count of words: the nv limit and the outer-loop dv
	// limit are formed as base + 4 n bytes. The region [dv, dvl) is
	// updated in place.
872
873	// Establish the arguments and do initial setup.
874	//
875	// inner loop dv r0
876	// dv limit r1
877	// inner loop nv r2
878	// blocks-of-4 dv limit r3
879	// mi (r14)
880	// outer loop dv r4
881	// outer loop dv limit r5
882	// nv base r6
883	// nv limit r7
884	// n r3
885	// c (r14)
886	// t0, t1, t2, t3 r2, r8, r9, r10
887	// 0 r12
888
889	pushreg r4-r10, r14
890	pushvfp QQ(q4, q7)
891	endprologue
892
893	// Set up the outer loop state and prepare for the first iteration.
	// The fifth argument is on the stack, above the eight core registers
	// (32 bytes) and q4--q7 (64 bytes) saved above. r14 is free as a
	// scratch register between calls because the return address was
	// saved by the prologue.
894	ldr r14, [sp, #96] // -> mi
895	vmov.i32 q3, #0
896	sub r12, r1, r0 // total dv bytes
897	// r0 // -> dv for inner loop
898	// r1 // -> overall dv limit
899	// r2 // -> nv for inner loop
900	// r3 // = n (for now)
901	add r4, r0, #16 // -> dv for outer loop
902	add r5, r0, r3, lsl #2 // -> dv limit
903	bic r12, r12, #15 // dv blocks of 4
904	vld1.32 {q2}, [r14] // = M
905	mov r6, r2 // -> nv
906	add r7, r2, r3, lsl #2 // -> nv limit
907	add r3, r0, r12 // -> dv blocks-of-4 limit
908	vzip.16 q2, q3 // expand M
909	mov r12, #0 // = 0
910	bl mont4
911	cmp r2, r7 // done already?
912	bhs 8f
913
914	5: bl mla4
915	cmp r2, r7 // done yet?
916	blo 5b
917
918	// Still have carries left to propagate. Adding the accumulator
919	// block into the carries is a little different this time, because
920	// all four accumulator limbs have to be squished into the three
921	// carry registers for `carryprop' to do its thing.
	// The fourth limb is shifted left by 16 and added to the high half
	// of c2, which carries weight b B^2 = B^3/b.
922	8: vld1.32 {q9}, [r0]
923	SPREADACC0(q9, q10, q11, q12)
924	SPREADACC1(q9, q10, q11, q12)
925	vshl.u64 D0(q12), D0(q12), #16
926	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
927	vadd.u64 D1(q15), D1(q15), D0(q12)
928
929	bl carryprop
930	vmov.32 r14, QW(q15, 0) // = c, the leftover carry
931	cmp r0, r3
932	bhs 7f
933
934	// Propagate the first group of carries.
	// From here on the carry lives in the ARM carry flag across the
	// loop tests: `teq' with a plain register operand leaves C alone.
935	ldmia r0, {r2, r8-r10}
936	adds r2, r2, r14
937	adcs r8, r8, #0
938	adcs r9, r9, #0
939	adcs r10, r10, #0
940	stmia r0!, {r2, r8-r10}
941	teq r0, r3
942	beq 6f
943
944	// Continue carry propagation until the end of the buffer.
945	0: ldmia r0, {r2, r8-r10}
946	adcs r2, r2, #0
947	adcs r8, r8, #0
948	adcs r9, r9, #0
949	adcs r10, r10, #0
950	stmia r0!, {r2, r8-r10}
951	teq r0, r3
952	bne 0b
953
954	// Deal with the tail end. Note that the actual destination length
955	// won't be an exact number of blocks of four, so it's safe to just
956	// drop through here.
957	6: adc r14, r12, #0 // = carry out of the blocks, as 0 or 1
958	7: ldr r2, [r0]
959	adds r2, r2, r14
960	str r2, [r0], #4
961	teq r0, r1
962	beq 8f
963	0: ldr r2, [r0]
964	adcs r2, r2, #0
965	str r2, [r0], #4
966	teq r0, r1
967	bne 0b
968
969	// All done for this iteration. Start the next.
970	8: cmp r4, r5
971	bhs 9f
972	mov r0, r4
973	add r4, r4, #16
974	mov r2, r6
975	bl mont4
976	b 5b
977
978	// All over.
979	9: popvfp QQ(q4, q7)
980	popreg r4-r10, r14
981	bx r14
982	ENDFUNC
983
984	///--------------------------------------------------------------------------
985	/// Testing and performance measurement.
986
987	#ifdef TEST_MUL4
988
989	//              dmul   smul   mmul   mont
990	// z     r0     r0     r0     r0     r0
991	// c     r4     r1     r1     r1     r1
992	// y     r3     --     --     r2     r2
993	// u     r1     r2     --     r3     --
994	// x     r2     r3     r2     stk0   r3
995	// vv    q2/q3  stk0   --     stk1   stk0
996	// yy    q4/q5  stk1   r3     stk2   --
997	// n     r5     stk2   stk0   stk3   stk1
998	// cyv   r6     stk3   stk1   stk4   stk2
999
1000	#define STKARG(i) sp, #80 + 4*(i)
	// STKARG(i) expands to the base-plus-offset operand for the i-th
	// word-sized stack argument.  The #80 presumably covers what
	// testprologue pushes (r4-r6, r14 = 16 bytes; q4-q7 = 64 bytes), so
	// it is only meaningful after that prologue has run.
1001
1002	.macro testprologue mode
	// Common entry sequence for the test drivers.  Saves r4-r6, r14 and
	// q4-q7, then, depending on MODE (dmul, smul, mmul or mont), moves
	// the incoming arguments into fixed places: c -> r4, u -> r1,
	// x -> r2, y -> r3, n -> r5, cyv -> r6, with vv loaded into q2/q3
	// and yy into q4/q5, each interleaved with zero by vzip.16.
1003	pushreg r4-r6, r14
1004	pushvfp QQ(q4, q7)
1005	endprologue
1006
1007	.ifeqs "\mode", "dmul"
1008	mov r4, r1 // -> c
1009	mov r1, r2 // -> u
1010	mov r2, r3 // -> x
1011
1012	ldr r14, [STKARG(0)] // -> vv
1013	vld1.32 {q2}, [r14]
1014	vmov.i32 q3, #0
981a9e5d	1015	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec MW	1016
	1017	ldr r14, [STKARG(1)] // -> yy
	1018	vld1.32 {q4}, [r14]
	1019	vmov.i32 q5, #0
981a9e5d	1020	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec MW	1021
	1022	ldr r5, [STKARG(2)] // = n
	1023	ldr r6, [STKARG(3)] // -> cyv
	1024	.endif
	1025
	1026	.ifeqs "\mode", "smul"
	1027	mov r4, r1 // -> c
	1028	// r2 // -> x
	1029
		// In this mode yy arrives in r3 rather than on the stack.
	1030	vld1.32 {q4}, [r3]
	1031	vmov.i32 q5, #0
981a9e5d	1032	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec MW	1033
	1034	ldr r5, [STKARG(0)] // = n
	1035	ldr r6, [STKARG(1)] // -> cyv
	1036	.endif
	1037
	1038	.ifeqs "\mode", "mmul"
	1039	mov r4, r1 // -> c
		// Order matters in the next two moves: r3 (u) is copied out
		// before it is overwritten with y.
	1040	mov r1, r3 // -> u
	1041	mov r3, r2 // -> y
	1042	ldr r2, [STKARG(0)] // -> x
	1043
	1044	ldr r14, [STKARG(1)] // -> vv
	1045	vld1.32 {q2}, [r14]
	1046	vmov.i32 q3, #0
981a9e5d	1047	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec MW	1048
	1049	ldr r14, [STKARG(2)] // -> yy
	1050	vld1.32 {q4}, [r14]
	1051	vmov.i32 q5, #0
981a9e5d	1052	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ea1b3cec MW	1053
	1054	ldr r5, [STKARG(3)] // = n
	1055	ldr r6, [STKARG(4)] // -> cyv
	1056	.endif
	1057
	1058	.ifeqs "\mode", "mont"
	1059	mov r4, r1 // -> c
		// NOTE(review): the argument table lists no u for mont mode,
		// yet r1 is set to the incoming r3 (x) here -- confirm that
		// mont4 does not read r1.
	1060	mov r1, r3 // -> u
		// Swap r2 and r3 via r14, so that x ends up in r2 and y in r3.
	1061	mov r14, r2
	1062	mov r2, r3 // -> x
	1063	mov r3, r14 // -> y
	1064
	1065	ldr r14, [STKARG(0)] // -> vv
	1066	vld1.32 {q2}, [r14]
	1067	vmov.i32 q3, #0
981a9e5d	1068	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ea1b3cec MW	1069
	1070	ldr r5, [STKARG(1)] // = n
	1071	ldr r6, [STKARG(2)] // -> cyv
	1072	.endif
	1073	.endm
	1074
	1075	.macro testldcarry
		// Load the carry registers from the buffer that r4 points to.
	1076	vldmia r4, {QQ(q13, q15)} // c0, c1, c2
	1077	.endm
	1078
	1079	.macro testtop
		// Head of the repeat loop: define label 0 and count r5 down by
		// one.  The flags set by `subs' are what `bne 0b' in testtail
		// tests, so the code placed in between must leave them alone.
	1080	0: subs r5, r5, #1
	1081	.endm
	1082
	1083	.macro testtail
		// Foot of the repeat loop: branch back to the nearest preceding
		// label 0 unless the Z flag is set.  The flags are expected to
		// be those left by the `subs' in testtop.
	1084	bne 0b
	1085	.endm
	1086
	1087	.macro testcarryout
		// Store the carry registers to the buffer that r4 points to
		// (the same register list and base that testldcarry loads).
	1088	vstmia r4, {QQ(q13, q15)}
	1089	.endm
	1090
	1091	.macro testepilogue
		// Undo testprologue's saves in reverse order (q4-q7, then
		// r4-r6 and r14) and return through r14.
	1092	popvfp QQ(q4, q7)
	1093	popreg r4-r6, r14
	1094	bx r14
	1095	.endm
	1096
	1097	FUNC(test_dmul4)
		// Test driver for dmul4: set up arguments in dmul mode, load the
		// carry registers via r4, repeat `bl dmul4' until r5 (n) has
		// counted down to zero, then store the carry registers back.
	1098	testprologue dmul
	1099	testldcarry
	1100	testtop
	1101	bl dmul4
	1102	testtail
	1103	testcarryout
	1104	testepilogue
	1105	ENDFUNC
	1106
	1107	FUNC(test_dmla4)
		// Test driver for dmla4: same shape as the dmul4 driver (dmul
		// mode setup, carries loaded before and stored after the loop),
		// but the loop body calls dmla4.
	1108	testprologue dmul
	1109	testldcarry
	1110	testtop
	1111	bl dmla4
	1112	testtail
	1113	testcarryout
	1114	testepilogue
	1115	ENDFUNC
	1116
	1117	FUNC(test_mul4)
		// Test driver for mul4: set up arguments in smul mode, load the
		// carry registers via r4, repeat `bl mul4' until r5 (n) has
		// counted down to zero, then store the carry registers back.
	1118	testprologue smul
	1119	testldcarry
	1120	testtop
	1121	bl mul4
	1122	testtail
	1123	testcarryout
	1124	testepilogue
	1125	ENDFUNC
	1126
	1127	FUNC(test_mul4zc)
		// Test driver for mul4zc: smul mode setup, carries loaded before
		// and stored after a loop that calls mul4zc until r5 (n) has
		// counted down to zero.
	1128	testprologue smul
	1129	testldcarry
	1130	testtop
	1131	bl mul4zc
	1132	testtail
1133	testcarryout
1134	testepilogue
1135	ENDFUNC
1136
1137	FUNC(test_mla4)
	// Test driver for mla4: smul mode setup, carries loaded before and
	// stored after a loop that calls mla4 until r5 (n) has counted down
	// to zero.
1138	testprologue smul
1139	testldcarry
1140	testtop
1141	bl mla4
1142	testtail
1143	testcarryout
1144	testepilogue
1145	ENDFUNC
1146
1147	FUNC(test_mla4zc)
	// Test driver for mla4zc: smul mode setup, carries loaded before and
	// stored after a loop that calls mla4zc until r5 (n) has counted
	// down to zero.
1148	testprologue smul
1149	testldcarry
1150	testtop
1151	bl mla4zc
1152	testtail
1153	testcarryout
1154	testepilogue
1155	ENDFUNC
1156
1157	FUNC(test_mmul4)
	// Test driver for mmul4: mmul mode setup, then repeat `bl mmul4'
	// until r5 (n) has counted down to zero.  No carries are loaded
	// beforehand; afterwards q4/q5 are written out through r3 (set to y
	// by the prologue) and the carry registers are stored through r4.
1158	testprologue mmul
1159	testtop
1160	bl mmul4
1161	testtail
1162	vst1.32 {q4, q5}, [r3] // dump q4/q5 into the y buffer
1163	testcarryout
1164	testepilogue
1165	ENDFUNC
1166
1167	FUNC(test_mmla4)
	// Test driver for mmla4: mmul mode setup, then repeat `bl mmla4'
	// until r5 (n) has counted down to zero.  No carries are loaded
	// beforehand; afterwards q4/q5 are written out through r3 (set to y
	// by the prologue) and the carry registers are stored through r4.
1168	testprologue mmul
1169	testtop
1170	bl mmla4
1171	testtail
1172	vst1.32 {q4, q5}, [r3] // dump q4/q5 into the y buffer
1173	testcarryout
1174	testepilogue
1175	ENDFUNC
1176
1177	FUNC(test_mont4)
	// Test driver for mont4: mont mode setup, then repeat `bl mont4'
	// until r5 (n) has counted down to zero.  The mont prologue loads
	// neither yy nor the carries, so the q4/q5 and carry values stored
	// afterwards are whatever mont4 left in those registers.
1178	testprologue mont
1179	testtop
1180	bl mont4
1181	testtail
1182	vst1.32 {q4, q5}, [r3] // dump q4/q5 into the y buffer
1183	testcarryout
1184	testepilogue
1185	ENDFUNC
1186
1187	#endif
1188
1189	///----- That's all, folks --------------------------------------------------