From 71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1 Mon Sep 17 00:00:00 2001
Message-Id: <71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1.1716365649.git.mdw@distorted.org.uk>
From: Mark Wooding <mdw@chiark.greenend.org.uk>
Date: Thu, 29 Dec 2016 15:24:26 +0000
Subject: [PATCH] math/mpx-mul4-x86-sse2.S: Use default arguments for macros.
Organization: Straylight/Edgeware

From: Mark Wooding <mdw@distorted.org.uk>

I'd muddled up my macro languages and forgotten that GNU as handles
omitted macro arguments sensibly.  So use default argument values
throughout.  The arguments of `expand' and `squash' have been reordered
so that the optional ones come last and can be defaulted.  No functional
change.
---
 math/mpx-mul4-x86-sse2.S | 95 +++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 46 deletions(-)

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 0b57dbfd..5d5714da 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -93,7 +93,7 @@
 ///--------------------------------------------------------------------------
 /// Macro definitions.
 
-.macro	mulcore	r, s, d0, d1, d2, d3
+.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
 	// Load a word r_i from R, multiply by the expanded operand [S], and
 	// leave the pieces of the product in registers D0, D1, D2, D3.
 	movd	\d0, \r			// (r_i, 0, 0, 0)
@@ -133,7 +133,10 @@
 	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
 .endm
 
-.macro	accum	c0, c1, c2, c3
+.macro	accum	c0, c1=nil, c2=nil, c3=nil
+	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
+	// updating that register.
 	paddq	\c0, xmm0
   .ifnes "\c1", "nil"
 	paddq	\c1, xmm1
@@ -146,7 +149,7 @@
   .endif
 .endm
 
-.macro	mulacc	r, s, c0, c1, c2, c3, z3p
+.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
 	// Load a word r_i from R, multiply by the expanded operand [S],
 	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 	// then C3 notionally contains zero, but needs clearing; in practice,
@@ -155,14 +158,14 @@
 	// is not `t'.
   .ifeqs "\z3p", "t"
 	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
-	accum		\c0,  \c1,  \c2,  nil
+	accum		\c0,  \c1,  \c2
   .else
 	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
 	accum		\c0,  \c1,  \c2,  \c3
   .endif
 .endm
 
-.macro	propout	d, c, cc
+.macro	propout	d, c, cc=nil
 	// Calculate an output word from C, and store it in D; propagate
 	// carries out from C to CC in preparation for a rotation of the
 	// carry registers.  On completion, XMM3 is clobbered.  If CC is
@@ -192,7 +195,7 @@
 	psrldq	\t, 4			// floor((c' + c'' b)/B)
 .endm
 
-.macro	expand	a, b, c, d, z
+.macro	expand	z, a, b, c=nil, d=nil
 	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
 	// exit, A:B and C:D together hold the same values in expanded
 	// form.  If C is `nil', then only expand A to A:B.
@@ -214,7 +217,7 @@
   .endif
 .endm
 
-.macro	squash	lo, hi, c0, c1, c2, c3, t, u
+.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
 	// On entry, C0, C1, C2, C3 are carry registers representing a value
 	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
 	// C3, T, and U are clobbered; and the high bits of Y are stored in
@@ -331,19 +334,19 @@ INTFUNC(dmul4)
   endprologue
 
 	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
-	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
+	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 	propout	[edi +  4],	 xmm5, xmm6
 
 	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
-	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
+	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 	propout	[edi +  8],	 xmm6, xmm7
 
 	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 	propout	[edi + 12],	 xmm7, xmm4
 
 	ret
@@ -366,20 +369,20 @@ INTFUNC(dmla4)
 
 	carryadd
 
-	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
-	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
+	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 	propout	[edi +  4],	 xmm5, xmm6
 
 	mulacc	[eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
-	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
+	mulacc	[ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 	propout	[edi +  8],	 xmm6, xmm7
 
 	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 	propout	[edi + 12],	 xmm7, xmm4
 
 	ret
@@ -456,7 +459,7 @@ INTFUNC(mla4zc)
 	movd	xmm6, [edi +  8]
 	movd	xmm7, [edi + 12]
 
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -487,7 +490,7 @@ INTFUNC(mla4)
 
 	carryadd
 
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -547,7 +550,7 @@ INTFUNC(mmla4)
 	movd	xmm5, [edi +  4]
 	movd	xmm6, [edi +  8]
 	movd	xmm7, [edi + 12]
-	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 5:	mulacc	[eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
@@ -566,21 +569,21 @@ INTFUNC(mmla4)
 	// Calculate Y = W M.
 	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 
-	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2, nil
-	accum			 xmm5, xmm6, xmm7, nil
+	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
+	accum			 xmm5, xmm6, xmm7
 
-	mulcore	[edi +  8], esi, xmm0, xmm1, nil,  nil
-	accum			 xmm6, xmm7, nil,  nil
+	mulcore	[edi +  8], esi, xmm0, xmm1
+	accum			 xmm6, xmm7
 
-	mulcore	[edi + 12], esi, xmm0, nil,  nil,  nil
-	accum			 xmm7, nil,  nil,  nil
+	mulcore	[edi + 12], esi, xmm0
+	accum			 xmm7
 
 	// That's lots of pieces.  Now we have to assemble the answer.
-	squash	xmm4, nil,  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1
+	squash	xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 
 	// Expand it.
 	pxor	xmm2, xmm2
-	expand	xmm4, xmm1, nil, nil, xmm2
+	expand	xmm2, xmm4, xmm1
 	movdqa	[edx +  0], xmm4
 	movdqa	[edx + 16], xmm1
 
@@ -591,7 +594,7 @@ INTFUNC(mmla4)
 	movd	xmm7, [edi + 12]
 
 	// Finish the calculation by adding the Montgomery product.
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -629,21 +632,21 @@ INTFUNC(mont4)
 	// Calculate Y = W M.
 	mulcore	[edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 
-	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2, nil
-	accum			 xmm5, xmm6, xmm7, nil
+	mulcore	[edi +  4], esi, xmm0, xmm1, xmm2
+	accum			 xmm5, xmm6, xmm7
 
-	mulcore	[edi +  8], esi, xmm0, xmm1, nil,  nil
-	accum			 xmm6, xmm7, nil,  nil
+	mulcore	[edi +  8], esi, xmm0, xmm1
+	accum			 xmm6, xmm7
 
-	mulcore	[edi + 12], esi, xmm0, nil,  nil,  nil
-	accum			 xmm7, nil,  nil,  nil
+	mulcore	[edi + 12], esi, xmm0
+	accum			 xmm7
 
 	// That's lots of pieces.  Now we have to assemble the answer.
-	squash	xmm4, nil,  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1
+	squash	xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 
 	// Expand it.
 	pxor	xmm2, xmm2
-	expand	xmm4, xmm1, nil, nil, xmm2
+	expand	xmm2, xmm4, xmm1
 	movdqa	[edx +  0], xmm4
 	movdqa	[edx + 16], xmm1
 
@@ -654,7 +657,7 @@ INTFUNC(mont4)
 	movd	xmm7, [edi + 12]
 
 	// Finish the calculation by adding the Montgomery product.
-	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+	mulacc	[ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 	propout	[edi +  0],	 xmm4, xmm5
 
 	mulacc	[ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -706,7 +709,7 @@ FUNC(mpx_umul4_x86_sse2)
 	movdqu	xmm0, [esi]		// bv[0]
 	mov	edi, [ebp + 20]		// -> dv[0]
 	mov	ecx, edi		// outer loop dv cursor
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
 	mov	ebx, [ebp + 24]		// -> av[0]
 	mov	eax, [ebp + 28]		// -> av[m] = av limit
 	mov	edx, esp		// -> expanded Y = bv[0]
@@ -738,7 +741,7 @@ FUNC(mpx_umul4_x86_sse2)
 1:	movdqu	xmm0, [esi]		// bv[i]
 	mov	edi, ecx		// -> dv[i]
 	pxor	xmm7, xmm7
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
 	mov	ebx, [ebp + 24]		// -> av[0]
 	movdqa	[esp + 0], xmm0		// bv[i] expanded low
 	movdqa	[esp + 16], xmm1	// bv[i] expanded high
@@ -814,7 +817,7 @@ FUNC(mpxmont_mul4_x86_sse2)
 	mov	edx, [ebp + 40]		// -> mi
 	movdqu	xmm0, [ecx]		// bv[0]
 	movdqu	xmm2, [edx]		// mi
-	expand	xmm0, xmm1, xmm2, xmm3, xmm7
+	expand	xmm7, xmm0, xmm1, xmm2, xmm3
 	movdqa	[esp + 12], xmm0	// bv[0] expanded low
 	movdqa	[esp + 28], xmm1	// bv[0] expanded high
 	movdqa	[esp + 44], xmm2	// mi expanded low
@@ -871,7 +874,7 @@ FUNC(mpxmont_mul4_x86_sse2)
 	mov	ebx, [ebp + 32]		// -> X = nv[0]
 	lea	esi, [esp + 44]		// -> expanded M = mi
 	mov	eax, [ebp + 24]		// -> U = av[0]
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
 	movdqa	[esp + 12], xmm0	// bv[i] expanded low
 	movdqa	[esp + 28], xmm1	// bv[i] expanded high
 	call	mmla4
@@ -954,7 +957,7 @@ FUNC(mpxmont_redc4_x86_sse2)
 	mov	edx, [ebp + 36]		// -> mi
 	movdqu	xmm0, [edx]		// mi
 	and	eax, ~15		// mask off the tail end
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
 	add	eax, edi		// find limit
 	movdqa	[esp + 12], xmm0	// mi expanded low
 	movdqa	[esp + 28], xmm1	// mi expanded high
@@ -1097,25 +1100,25 @@ ENDFUNC
 	movdqu	xmm6, [ecx + 32]	// (c'_2, c''_2)
 .endm
 
-.macro	testexpand v, y
+.macro	testexpand v=nil, y=nil
 	pxor	xmm7, xmm7
   .ifnes "\v", "nil"
 	mov	ecx, \v
 	movdqu	xmm0, [ecx]
-	expand	xmm0, xmm1, nil, nil, xmm7
+	expand	xmm7, xmm0, xmm1
 	movdqa	[esp + 12], xmm0
 	movdqa	[esp + 28], xmm1
   .endif
   .ifnes "\y", "nil"
 	mov	edx, \y
 	movdqu	xmm2, [edx]
-	expand	xmm2, xmm3, nil, nil, xmm7
+	expand	xmm7, xmm2, xmm3
 	movdqa	[esp + 44], xmm2
 	movdqa	[esp + 60], xmm3
   .endif
 .endm
 
-.macro	testtop	u, x, mode
+.macro	testtop	u=nil, x=nil, mode=nil
 	.p2align 4
 0:
   .ifnes "\u", "nil"
-- 
[mdw]