--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2016 Straylight/Edgeware
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Prologue.
+
+ .arch pentium4
+ .text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout. In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, to the result. A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass. The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
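+///
+/// As a scalar model (C for exposition only, not the code below), one
+/// such inner pass, with single-precision factor w = b_j, looks like
+/// this:
+///
+///	uint64_t c = 0;
+///	for (i = 0; i < n; i++) {
+///		c += (uint64_t)a[i]*w + d[i + j];
+///		d[i + j] = (uint32_t)c;
+///		c >>= 32;
+///	}
+///	d[n + j] = (uint32_t)c;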
+///
+/// In both the plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it. The transformation we perform is
+/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
+/// operands, as follows.
+///
+/// Offset 0 4 8 12
+/// 0 v'_0 v'_1 v''_0 v''_1
+/// 16 v'_2 v'_3 v''_2 v''_3
+///
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
+/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
+/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
+/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
+/// results in 64-bit fields. The sixteen bits of headroom allow us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
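+///
+/// For example (a C model of the expansion only, not the code below):
+/// given the four 32-bit words v[0], ..., v[3], the two SSE operands
+/// tabulated above could be built as
+///
+///	uint32_t x[2][4];
+///	for (i = 0; i < 4; i++) {
+///		x[i/2][i%2 + 0] = v[i] & 0xffff;	/* v'_i */
+///		x[i/2][i%2 + 2] = v[i] >> 16;		/* v''_i */
+///	}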
+///
+/// On 32-bit x86, we are register-starved: the expanded operands are kept in
+/// memory, typically in warm L1 cache.
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name them
+/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
+/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
+/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
+/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
+/// lanes of its vector) and an operand in the expanded form above produces a
+/// result which can be added directly to the appropriate carry register.
+/// Following a pass of four multiplications, we perform some limited carry
+/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
+/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
+/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
+/// zeroed and becomes c3.
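+///
+/// As a scalar model of one such output step (again C for exposition
+/// only), with cl[i] and ch[i] holding c'_i and c''_i:
+///
+///	uint32_t t = (uint32_t)ch[0];			/* c''_0 mod B */
+///	uint64_t d = cl[0] + ((uint64_t)t << 16);	/* c'_0 + t b */
+///	z = (uint32_t)d;				/* output word */
+///	cl[1] += d >> 32;				/* floor(d/B) */
+///	ch[1] += ch[0] >> 32;				/* floor(c''_0/B) */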
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro mulcore r, s, d0, d1, d2, d3
+ // Load a word r_i from R, multiply by the expanded operand [S], and
+ // leave the pieces of the product in registers D0, D1, D2, D3.
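+	//
+	// For example, `mulcore [eax + 0], ecx, xmm0, xmm1, xmm2, xmm3'
+	// multiplies the word at [eax] by the expanded operand at [ecx],
+	// leaving (r_i s'_0, r_i s''_0) in XMM0, (r_i s'_1, r_i s''_1) in
+	// XMM1, and so on. Passing `nil' for D1, D2, or D3 suppresses the
+	// corresponding pieces.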
+ movd \d0, \r // (r_i, 0, 0, 0)
+ .ifnes "\d1", "nil"
+ movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
+ .endif
+ .ifnes "\d3", "nil"
+ movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
+ .endif
+ pshufd \d0, \d0, 0b11001100 // (r_i, ?, r_i, ?)
+ .ifnes "\d1", "nil"
+ psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
+ .endif
+ .ifnes "\d2", "nil"
+ .ifnes "\d3", "nil"
+ movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
+ .else
+ movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
+ .endif
+ .endif
+ .ifnes "\d3", "nil"
+ psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
+ .endif
+ .ifnes "\d1", "nil"
+	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
+	.endif
+	.ifnes "\d3", "nil"
+	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
+	.endif
+	.ifnes "\d2", "nil"
+	.ifnes "\d3", "nil"
+	pmuludq	\d2, \d0		// (r_i s'_2, r_i s''_2)
+	.else
+	pmuludq	\d2, [\s + 16]
+	.endif
+	.endif
+	pmuludq	\d0, [\s]		// (r_i s'_0, r_i s''_0)
+.endm
+
+.macro accum c0, c1, c2, c3
+ paddq \c0, xmm0
+ .ifnes "\c1", "nil"
+ paddq \c1, xmm1
+ .endif
+ .ifnes "\c2", "nil"
+ paddq \c2, xmm2
+ .endif
+ .ifnes "\c3", "nil"
+ paddq \c3, xmm3
+ .endif
+.endm
+
+.macro mulacc r, s, c0, c1, c2, c3, z3p
+ // Load a word r_i from R, multiply by the expanded operand [S],
+ // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
+ // then C3 notionally contains zero, but needs clearing; in practice,
+ // we store the product directly rather than attempting to add. On
+ // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
+ // is not `t'.
+ .ifeqs "\z3p", "t"
+ mulcore \r, \s, xmm0, xmm1, xmm2, \c3
+ accum \c0, \c1, \c2, nil
+ .else
+ mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
+ accum \c0, \c1, \c2, \c3
+ .endif
+.endm
+
+.macro propout d, c, cc
+ // Calculate an output word from C, and store it in D; propagate
+ // carries out from C to CC in preparation for a rotation of the
+ // carry registers. On completion, XMM3 is clobbered. If CC is
+ // `nil', then the contribution which would have been added to it is
+ // left in C.
+ pshufd xmm3, \c, 0b10111111 // (?, ?, ?, t = c'' mod B)
+ psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
+ pslldq xmm3, 2 // (t b, 0)
+ paddq \c, xmm3 // (c' + t b, c'')
+ movd \d, \c
+ psrlq \c, 32 // floor(c/B)
+ .ifnes "\cc", "nil"
+ paddq \cc, \c // propagate up
+ .endif
+.endm
+
+.macro endprop d, c, t
+ // On entry, C contains a carry register. On exit, the low 32 bits
+ // of the value represented in C are written to D, and the remaining
+ // bits are left at the bottom of T.
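+	//
+	// (As a scalar model, with C = (c', c''): v = c' + (c'' << 16);
+	// D = v mod B; T = floor(v/B).)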
+ movdqa \t, \c
+ psllq \t, 16 // (?, c'' b)
+ pslldq \c, 8 // (0, c')
+ paddq \t, \c // (?, c' + c'' b)
+ psrldq \t, 8 // c' + c'' b
+ movd \d, \t
+ psrldq \t, 4 // floor((c' + c'' b)/B)
+.endm
+
+.macro expand a, b, c, d, z
+ // On entry, A and C hold packed 128-bit values, and Z is zero. On
+ // exit, A:B and C:D together hold the same values in expanded
+ // form. If C is `nil', then only expand A to A:B.
+ movdqa \b, \a // (a_0, a_1, a_2, a_3)
+ .ifnes "\c", "nil"
+ movdqa \d, \c // (c_0, c_1, c_2, c_3)
+ .endif
+ punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
+ punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
+ .ifnes "\c", "nil"
+ punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
+ punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
+ .endif
+ pshufd \a, \a, 0b11011000 // (a'_0, a'_1, a''_0, a''_1)
+ pshufd \b, \b, 0b11011000 // (a'_2, a'_3, a''_2, a''_3)
+ .ifnes "\c", "nil"
+ pshufd \c, \c, 0b11011000 // (c'_0, c'_1, c''_0, c''_1)
+ pshufd \d, \d, 0b11011000 // (c'_2, c'_3, c''_2, c''_3)
+ .endif
+.endm
+
+.macro squash c0, c1, c2, c3, h, t, u
+ // On entry, C0, C1, C2, C3 are carry registers representing a value
+ // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+ // C3, T, and U are clobbered; and the high bits of Y are stored in
+ // H, if this is not `nil'.
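+	//
+	// (A scalar C model of the whole job, with yl[i] and yh[i] holding
+	// y'_i and y''_i:
+	//
+	//	uint64_t y[5] = { 0 }, c = 0;
+	//	for (i = 0; i < 4; i++) {
+	//		y[i] += yl[i] + ((yh[i] & 0xffff) << 16);
+	//		y[i + 1] += yh[i] >> 16;
+	//	}
+	//	for (i = 0; i < 4; i++)
+	//		{ c += y[i]; z[i] = (uint32_t)c; c >>= 32; }
+	//	c += y[4];		/* high bits, for H */
+	//
+	// The code below does the same job without leaving the registers.)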
+
+ // The first step is to eliminate the `double-prime' pieces -- i.e.,
+ // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
+ // them into the 32-bit-aligned pieces above and below. But before
+ // we can do that, we must gather them together.
+ movdqa \t, \c0
+ movdqa \u, \c1
+ punpcklqdq \t, \c2 // (y'_0, y'_2)
+ punpckhqdq \c0, \c2 // (y''_0, y''_2)
+ punpcklqdq \u, \c3 // (y'_1, y'_3)
+ punpckhqdq \c1, \c3 // (y''_1, y''_3)
+
+ // Now split the double-prime pieces. The high (up to) 48 bits will
+ // go up; the low 16 bits go down.
+ movdqa \c2, \c0
+ movdqa \c3, \c1
+ psllq \c2, 48
+ psllq \c3, 48
+ psrlq \c0, 16 // high parts of (y''_0, y''_2)
+ psrlq \c1, 16 // high parts of (y''_1, y''_3)
+ psrlq \c2, 32 // low parts of (y''_0, y''_2)
+ psrlq \c3, 32 // low parts of (y''_1, y''_3)
+ .ifnes "\h", "nil"
+ movdqa \h, \c1
+ .endif
+ pslldq \c1, 8 // high part of (0, y''_1)
+
+ paddq \t, \c2 // propagate down
+ paddq \u, \c3
+ paddq \t, \c1 // and up: (y_0, y_2)
+ paddq \u, \c0 // (y_1, y_3)
+ .ifnes "\h", "nil"
+ psrldq \h, 8 // high part of (y''_3, 0)
+ .endif
+
+ // Finally extract the answer. This complicated dance is better than
+ // storing to memory and loading, because the piecemeal stores
+ // inhibit store forwarding.
+	movdqa \c3, \t			// (y_0, y_2)
+ movdqa \c0, \t // (y^*_0, ?, ?, ?)
+ psrldq \t, 8 // (y_2, 0)
+ psrlq \c3, 32 // (floor(y_0/B), ?)
+ paddq \c3, \u // (y_1 + floor(y_0/B), ?)
+ pslldq \c0, 12 // (0, 0, 0, y^*_0)
+ movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
+ psrldq \u, 8 // (y_3, 0)
+	psrlq \c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
+	paddq \c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
+ pslldq \c1, 12 // (0, 0, 0, y^*_1)
+ psrldq \c0, 12 // (y^*_0, 0, 0, 0)
+ movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+	psrlq \c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
+	paddq \c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
+ pslldq \c2, 12 // (0, 0, 0, y^*_2)
+ psrldq \c1, 8 // (0, y^*_1, 0, 0)
+ psrldq \c2, 4 // (0, 0, y^*_2, 0)
+ .ifnes "\h", "nil"
+ movdqu \t, \c3
+ pxor \u, \u
+ .endif
+ pslldq \c3, 12 // (0, 0, 0, y^*_3)
+ por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
+ por \c2, \c3 // (0, 0, y^*_2, y^*_3)
+ por \c0, \c2 // y mod B^4
+ .ifnes "\h", "nil"
+ psrlq \t, 32 // very high bits of y
+ paddq \h, \t
+ punpcklqdq \h, \u // carry up
+ .endif
+.endm
+
+.macro carryadd
+ // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
+ // hold the incoming carry registers c0, c1, and c2 representing a
+ // carry-in C.
+ //
+ // On exit, the carry registers, including XMM7, are updated to hold
+ // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
+ // registers are preserved.
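+	//
+	// Note that XMM7 is simply loaded with (a_3, 0): it serves as the
+	// incoming carry register c3, which is why the `mla' functions
+	// below pass `nil' rather than `t' to their first `mulacc'.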
+ movd xmm0, [edi + 0] // (a_0, 0)
+ movd xmm1, [edi + 4] // (a_1, 0)
+ movd xmm2, [edi + 8] // (a_2, 0)
+ movd xmm7, [edi + 12] // (a_3, 0)
+ paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
+ paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
+	paddq xmm6, xmm2		// (c'_2 + a_2, c''_2)
+.endm
+
+///--------------------------------------------------------------------------
+/// Primitive multipliers and related utilities.
+
+ .p2align 4
+carryprop:
+ // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
+ // form. Store the low 128 bits of the represented carry to [EDI] as
+ // a packed 128-bit value, and leave the remaining 16 bits in the low
+	// 32 bits of XMM4. On exit, XMM3, XMM5, and XMM6 are clobbered.
+ propout [edi + 0], xmm4, xmm5
+ propout [edi + 4], xmm5, xmm6
+ propout [edi + 8], xmm6, nil
+ endprop [edi + 12], xmm6, xmm4
+ ret
+
+ .p2align 4
+dmul4:
+ // On entry, EDI points to the destination buffer; EAX and EBX point
+ // to the packed operands U and X; ECX and EDX point to the expanded
+ // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
+ // registers c0, c1, and c2; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum C + U V + X Y to
+ // [EDI], and update the carry registers with the carry out. The
+ // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+dmla4:
+ // On entry, EDI points to the destination buffer, which also
+ // contains an addend A to accumulate; EAX and EBX point to the
+ // packed operands U and X; ECX and EDX point to the expanded
+ // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
+ // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
+ // to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
+ // [EDI], and update the carry registers with the carry out. The
+ // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ carryadd
+
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+mul4zc:
+ // On entry, EDI points to the destination buffer; EBX points to a
+ // packed operand X; and EDX points to an expanded operand Y.
+ //
+ // On exit, we write the low 128 bits of the product X Y to [EDI],
+ // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
+ // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+mul4:
+ // On entry, EDI points to the destination buffer; EBX points to a
+ // packed operand X; EDX points to an expanded operand Y; and XMM4,
+ // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
+ // representing a carry-in C; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
+ // and update the carry registers with the carry out. The registers
+ // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+mla4zc:
+ // On entry, EDI points to the destination buffer, which also
+ // contains an addend A to accumulate; EBX points to a packed operand
+ // X; and EDX points to an expanded operand Y.
+ //
+ // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
+ // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
+ // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ movd xmm4, [edi + 0]
+ movd xmm5, [edi + 4]
+ movd xmm6, [edi + 8]
+ movd xmm7, [edi + 12]
+
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+mla4:
+ // On entry, EDI points to the destination buffer, which also
+ // contains an addend A to accumulate; EBX points to a packed operand
+ // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
+ // the incoming carry registers c0, c1, and c2, representing a
+ // carry-in C; c3 is assumed to be zero.
+ //
+ // On exit, we write the low 128 bits of the sum A + C + X Y to
+ // [EDI], and update the carry registers with the carry out. The
+ // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
+ // general-purpose registers are preserved.
+ carryadd
+
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ ret
+
+ .p2align 4
+mmul4:
+ // On entry, EDI points to the destination buffer; EAX and EBX point
+ // to the packed operands U and N; ECX and ESI point to the expanded
+ // operands V and M; and EDX points to a place to store an expanded
+ // result Y (32 bytes, at a 16-byte boundary). The stack pointer
+ // must be 16-byte aligned. (This is not the usual convention, which
+ // requires alignment before the call.)
+ //
+ // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
+ // of the sum U V + N Y to [EDI], leaving the remaining carry in
+ // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
+ // XMM7 are clobbered; the general-purpose registers are preserved.
+ sub esp, 64 // space for the carries
+
+ // Calculate W = U V, and leave it in the destination. Stash the
+ // carry pieces for later.
+ mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ propout [edi + 0], xmm4, xmm5
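+	// The rest of the work is shared with `mmla4' below.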
+ jmp 5f
+
+ .p2align 4
+mmla4:
+ // On entry, EDI points to the destination buffer, which also
+ // contains an addend A to accumulate; EAX and EBX point
+ // to the packed operands U and N; ECX and ESI point to the expanded
+ // operands V and M; and EDX points to a place to store an expanded
+ // result Y (32 bytes, at a 16-byte boundary). The stack pointer
+ // must be 16-byte aligned. (This is not the usual convention, which
+ // requires alignment before the call.)
+ //
+ // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
+ // bits of the sum A + U V + N Y to [EDI], leaving the remaining
+ // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
+ // XMM3, and XMM7 are clobbered; the general-purpose registers are
+ // preserved.
+ sub esp, 64 // space for the carries
+ movd xmm4, [edi + 0]
+ movd xmm5, [edi + 4]
+ movd xmm6, [edi + 8]
+ movd xmm7, [edi + 12]
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ movdqa [esp + 0], xmm4
+ movdqa [esp + 16], xmm5
+ movdqa [esp + 32], xmm6
+
+ // Calculate Y = W M.
+ mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
+
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
+ accum xmm5, xmm6, xmm7, nil
+
+ mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
+ accum xmm6, xmm7, nil, nil
+
+ mulcore [edi + 12], esi, xmm0, nil, nil, nil
+ accum xmm7, nil, nil, nil
+
+ // That's lots of pieces. Now we have to assemble the answer.
+ squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+
+ // Expand it.
+ pxor xmm2, xmm2
+ expand xmm4, xmm1, nil, nil, xmm2
+ movdqa [edx + 0], xmm4
+ movdqa [edx + 16], xmm1
+
+ // Initialize the carry from the value for W we calculated earlier.
+ movd xmm4, [edi + 0]
+ movd xmm5, [edi + 4]
+ movd xmm6, [edi + 8]
+ movd xmm7, [edi + 12]
+
+ // Finish the calculation by adding the Montgomery product.
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+	// Add on the carry we calculated earlier.
+ paddq xmm4, [esp + 0]
+ paddq xmm5, [esp + 16]
+ paddq xmm6, [esp + 32]
+
+ // And, with that, we're done.
+ add esp, 64
+ ret
+
+ .p2align 4
+mont4:
+ // On entry, EDI points to the destination buffer holding a packed
+	// value W; EBX points to a packed operand N; ESI points to an
+ // expanded operand M; and EDX points to a place to store an expanded
+ // result Y (32 bytes, at a 16-byte boundary).
+ //
+ // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
+ // of the sum W + N Y to [EDI], leaving the remaining carry in
+ // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
+ // XMM7 are clobbered; the general-purpose registers are preserved.
+
+ // Calculate Y = W M.
+ mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
+
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil
+ accum xmm5, xmm6, xmm7, nil
+
+ mulcore [edi + 8], esi, xmm0, xmm1, nil, nil
+ accum xmm6, xmm7, nil, nil
+
+ mulcore [edi + 12], esi, xmm0, nil, nil, nil
+ accum xmm7, nil, nil, nil
+
+ // That's lots of pieces. Now we have to assemble the answer.
+ squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+
+ // Expand it.
+ pxor xmm2, xmm2
+ expand xmm4, xmm1, nil, nil, xmm2
+ movdqa [edx + 0], xmm4
+ movdqa [edx + 16], xmm1
+
+ // Initialize the carry from W.
+ movd xmm4, [edi + 0]
+ movd xmm5, [edi + 4]
+ movd xmm6, [edi + 8]
+ movd xmm7, [edi + 12]
+
+ // Finish the calculation by adding the Montgomery product.
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
+ propout [edi + 0], xmm4, xmm5
+
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
+ propout [edi + 4], xmm5, xmm6
+
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
+ propout [edi + 8], xmm6, xmm7
+
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
+ propout [edi + 12], xmm7, xmm4
+
+ // And, with that, we're done.
+ ret
+
+///--------------------------------------------------------------------------
+/// Bulk multipliers.
+
+FUNC(mpx_umul4_x86_sse2)
+ // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
+ // const mpw *bv, const mpw *bvl);
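+	//
+	// As a scalar model of the computation (C for exposition only,
+	// with na = avl - av and nb = bvl - bv):
+	//
+	//	for (j = 0; j < nb; j++) {
+	//		uint64_t c = 0;
+	//		for (i = 0; i < na; i++) {
+	//			c += (uint64_t)av[i]*bv[j] +
+	//				(j ? dv[i + j] : 0);
+	//			dv[i + j] = (uint32_t)c; c >>= 32;
+	//		}
+	//		dv[na + j] = (uint32_t)c;
+	//	}
+	//
+	// The code below does the same job four words of BV at a time.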
+
+ // Build a stack frame. Arguments will be relative to EBP, as
+ // follows.
+ //
+ // ebp + 20 dv
+ // ebp + 24 av
+ // ebp + 28 avl
+ // ebp + 32 bv
+ // ebp + 36 bvl
+ //
+ // Locals are relative to ESP, as follows.
+ //
+ // esp + 0 expanded Y (32 bytes)
+ // esp + 32 (top of locals)
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp
+ and esp, ~15
+ sub esp, 32
+
+	// Prepare for the first pass.
+ mov esi, [ebp + 32] // -> bv[0]
+ pxor xmm7, xmm7
+ movdqu xmm0, [esi] // bv[0]
+ mov edi, [ebp + 20] // -> dv[0]
+ mov ecx, edi // outer loop dv cursor
+ expand xmm0, xmm1, nil, nil, xmm7
+ mov ebx, [ebp + 24] // -> av[0]
+ mov eax, [ebp + 28] // -> av[m] = av limit
+ mov edx, esp // -> expanded Y = bv[0]
+ movdqa [esp + 0], xmm0 // bv[0] expanded low
+ movdqa [esp + 16], xmm1 // bv[0] expanded high
+ call mul4zc
+ add ebx, 16
+ add edi, 16
+ add ecx, 16
+ add esi, 16
+ cmp ebx, eax // all done?
+ jae 8f
+
+ .p2align 4
+	// Continue with the first pass.
+0: call mul4
+ add ebx, 16
+ add edi, 16
+ cmp ebx, eax // all done?
+ jb 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: call carryprop
+ cmp esi, [ebp + 36] // more passes to do?
+ jae 9f
+
+ .p2align 4
+ // Set up for the next pass.
+1: movdqu xmm0, [esi] // bv[i]
+ mov edi, ecx // -> dv[i]
+ pxor xmm7, xmm7
+ expand xmm0, xmm1, nil, nil, xmm7
+ mov ebx, [ebp + 24] // -> av[0]
+ movdqa [esp + 0], xmm0 // bv[i] expanded low
+ movdqa [esp + 16], xmm1 // bv[i] expanded high
+ call mla4zc
+ add edi, 16
+ add ebx, 16
+ add ecx, 16
+ add esi, 16
+ cmp ebx, eax // done yet?
+ jae 8f
+
+ .p2align 4
+ // Continue...
+0: call mla4
+ add ebx, 16
+ add edi, 16
+ cmp ebx, eax
+ jb 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+ // there can be none on this pass.
+8: call carryprop
+ cmp esi, [ebp + 36]
+ jb 1b
+
+ // All over.
+9: mov esp, ebp
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+ENDFUNC
+
+FUNC(mpxmont_mul4_x86_sse2)
+ // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n, const mpw *mi);
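+	//
+	// In outline (for exposition only), each outer pass computes, for
+	// the next four-word block of BV,
+	//
+	//	y = (dv[j .. j+3] + av[0 .. 3] bv[j .. j+3]) mi mod B^4
+	//	dv[j ...] += av bv[j .. j+3] + nv y
+	//
+	// so that AV and NV are scanned together: mmul4/mmla4 handle the
+	// first four-word block and compute Y; dmul4/dmla4 do the rest.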
+
+ // Build a stack frame. Arguments will be relative to EBP, as
+ // follows.
+ //
+ // ebp + 20 dv
+ // ebp + 24 av
+ // ebp + 28 bv
+ // ebp + 32 nv
+ // ebp + 36 n (nonzero multiple of 4)
+ // ebp + 40 mi
+ //
+ // Locals are relative to ESP, which is 4 mod 16, as follows.
+ //
+ // esp + 0 outer loop dv
+ // esp + 4 outer loop bv
+ // esp + 8 av limit (mostly in ESI)
+ // esp + 12 expanded V (32 bytes)
+ // esp + 44 expanded M (32 bytes)
+ // esp + 76 expanded Y (32 bytes)
+ // esp + 108 bv limit
+ // esp + 112 (gap)
+ // esp + 124 (top of locals)
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp
+ and esp, ~15
+ sub esp, 124
+
+ // Establish the expanded operands.
+ pxor xmm7, xmm7
+ mov ecx, [ebp + 28] // -> bv
+ mov edx, [ebp + 40] // -> mi
+ movdqu xmm0, [ecx] // bv[0]
+ movdqu xmm2, [edx] // mi
+ expand xmm0, xmm1, xmm2, xmm3, xmm7
+ movdqa [esp + 12], xmm0 // bv[0] expanded low
+ movdqa [esp + 28], xmm1 // bv[0] expanded high
+ movdqa [esp + 44], xmm2 // mi expanded low
+ movdqa [esp + 60], xmm3 // mi expanded high
+
+ // Set up the outer loop state and prepare for the first iteration.
+ mov edx, [ebp + 36] // n
+ mov eax, [ebp + 24] // -> U = av[0]
+ mov ebx, [ebp + 32] // -> X = nv[0]
+ mov edi, [ebp + 20] // -> Z = dv[0]
+ mov [esp + 4], ecx
+ lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
+ lea edx, [eax + 4*edx] // -> av[n/4] = av limit
+ mov [esp + 0], edi
+ mov [esp + 108], ecx
+ mov [esp + 8], edx
+ lea ecx, [esp + 12] // -> expanded V = bv[0]
+ lea esi, [esp + 44] // -> expanded M = mi
+ lea edx, [esp + 76] // -> space for Y
+ call mmul4
+ mov esi, [esp + 8] // recover av limit
+ add edi, 16
+ add eax, 16
+ add ebx, 16
+ cmp eax, esi // done already?
+ jae 8f
+ mov [esp + 0], edi
+
+ .p2align 4
+ // Complete the first inner loop.
+0: call dmul4
+ add edi, 16
+ add eax, 16
+ add ebx, 16
+ cmp eax, esi // done yet?
+ jb 0b
+
+ // Still have carries left to propagate.
+ call carryprop
+ movd [edi + 16], xmm4
+
+ .p2align 4
+	// Embark on the next iteration. (There must be one. If n = 4, then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: mov eax, [esp + 4] // -> bv[i - 1]
+ mov edi, [esp + 0] // -> Z = dv[i]
+ add eax, 16 // -> bv[i]
+	pxor xmm7, xmm7
+	mov [esp + 4], eax
+	cmp eax, [esp + 108]		// done yet?
+	jae 9f
+	movdqu xmm0, [eax]		// bv[i]
+ mov ebx, [ebp + 32] // -> X = nv[0]
+ lea esi, [esp + 44] // -> expanded M = mi
+ mov eax, [ebp + 24] // -> U = av[0]
+ expand xmm0, xmm1, nil, nil, xmm7
+ movdqa [esp + 12], xmm0 // bv[i] expanded low
+ movdqa [esp + 28], xmm1 // bv[i] expanded high
+ call mmla4
+ mov esi, [esp + 8] // recover av limit
+ add edi, 16
+ add eax, 16
+ add ebx, 16
+ mov [esp + 0], edi
+
+ .p2align 4
+ // Complete the next inner loop.
+0: call dmla4
+ add edi, 16
+ add eax, 16
+ add ebx, 16
+ cmp eax, esi
+ jb 0b
+
+ // Still have carries left to propagate, and they overlap the
+ // previous iteration's final tail, so read that in and add it.
+ movd xmm0, [edi]
+ paddq xmm4, xmm0
+ call carryprop
+ movd [edi + 16], xmm4
+
+ // Back again.
+ jmp 1b
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: call carryprop
+ movd [edi + 16], xmm4
+
+ // All done.
+9: mov esp, ebp
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+ENDFUNC
+
+FUNC(mpxmont_redc4_x86_sse2)
+ // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
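+	//
+	// (For exposition: each outer pass computes y = dv[i .. i+3] mi
+	// mod B^4 and accumulates nv y at dv[i]; if MI is the usual
+	// Montgomery factor -1/nv mod B^4, this clears that block of DV,
+	// and the carries are then rippled out to the end of the buffer.)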
+
+ // Build a stack frame. Arguments will be relative to EBP, as
+ // follows.
+ //
+ // ebp + 20 dv
+ // ebp + 24 dvl
+ // ebp + 28 nv
+ // ebp + 32 n (nonzero multiple of 4)
+ // ebp + 36 mi
+ //
+ // Locals are relative to ESP, as follows.
+ //
+ // esp + 0 outer loop dv
+ // esp + 4 outer dv limit
+ // esp + 8 blocks-of-4 dv limit
+ // esp + 12 expanded M (32 bytes)
+ // esp + 44 expanded Y (32 bytes)
+ // esp + 76 (top of locals)
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp
+ and esp, ~15
+ sub esp, 76
+
+ // Establish the expanded operands and the blocks-of-4 dv limit.
+ mov edi, [ebp + 20] // -> Z = dv[0]
+ pxor xmm7, xmm7
+ mov eax, [ebp + 24] // -> dv[n] = dv limit
+ sub eax, edi // length of dv in bytes
+ mov edx, [ebp + 36] // -> mi
+ movdqu xmm0, [edx] // mi
+ and eax, ~15 // mask off the tail end
+ expand xmm0, xmm1, nil, nil, xmm7
+ add eax, edi // find limit
+ movdqa [esp + 12], xmm0 // mi expanded low
+ movdqa [esp + 28], xmm1 // mi expanded high
+ mov [esp + 8], eax
+
+ // Set up the outer loop state and prepare for the first iteration.
+ mov ecx, [ebp + 32] // n
+ mov ebx, [ebp + 28] // -> X = nv[0]
+ lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
+ lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
+ mov [esp + 0], edi
+ mov [esp + 4], edx
+ lea esi, [esp + 12] // -> expanded M = mi
+ lea edx, [esp + 44] // -> space for Y
+ call mont4
+ add edi, 16
+ add ebx, 16
+ cmp ebx, ecx // done already?
+ jae 8f
+
+ .p2align 4
+ // Complete the first inner loop.
+5: call mla4
+ add ebx, 16
+ add edi, 16
+ cmp ebx, ecx // done yet?
+ jb 5b
+
+ // Still have carries left to propagate.
+8: carryadd
+ mov esi, [esp + 8] // -> dv blocks limit
+ mov edx, [ebp + 24] // dv limit
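+	// Fold the high addend word a_3 (in XMM7) into c2: adding a_3 b
+	// to c''_2 adds a_3 B^3 to the total.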
+ psllq xmm7, 16
+ pslldq xmm7, 8
+ paddq xmm6, xmm7
+ call carryprop
+ movd eax, xmm4
+ add edi, 16
+ cmp edi, esi
+ jae 7f
+
+ .p2align 4
+ // Continue carry propagation until the end of the buffer.
+0: add [edi], eax
+ mov eax, 0 // preserves flags
+ adcd [edi + 4], 0
+ adcd [edi + 8], 0
+ adcd [edi + 12], 0
+ adc eax, 0
+ add edi, 16
+ cmp edi, esi
+ jb 0b
+
+ // Deal with the tail end.
+7: add [edi], eax
+ mov eax, 0 // preserves flags
+ add edi, 4
+ adc eax, 0
+ cmp edi, edx
+ jb 7b
+
+ // All done for this iteration. Start the next. (This must have at
+ // least one follow-on iteration, or we'd not have started this outer
+ // loop.)
+8: mov edi, [esp + 0] // -> dv[i - 1]
+ mov ebx, [ebp + 28] // -> X = nv[0]
+ lea edx, [esp + 44] // -> space for Y
+ lea esi, [esp + 12] // -> expanded M = mi
+ add edi, 16 // -> Z = dv[i]
+ cmp edi, [esp + 4] // all done yet?
+ jae 9f
+ mov [esp + 0], edi
+ call mont4
+ add edi, 16
+ add ebx, 16
+ jmp 5b
+
+ // All over.
+9: mov esp, ebp
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
+
+#ifdef TEST_MUL4
+
+.macro cysetup c
+ rdtsc
+ mov [\c], eax
+ mov [\c + 4], edx
+.endm
+
+.macro cystore c, v, n
+ rdtsc
+ sub eax, [\c]
+ sbb edx, [\c + 4]
+ mov ebx, [\v]
+ mov ecx, [\n]
+ dec ecx
+ mov [\n], ecx
+ mov [ebx + ecx*8], eax
+ mov [ebx + ecx*8 + 4], edx
+.endm
+
+.macro testprologue
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp
+ and esp, ~15
+ sub esp, 3*32 + 12
+ // vars:
+ // esp + 0 = cycles
+ // esp + 12 = v expanded
+ // esp + 44 = y expanded
+	// esp + 76 = ? expanded
+.endm
+
+.macro testepilogue
+ mov esp, ebp
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+.endm
+
+.macro testldcarry c
+ mov ecx, \c // -> c
+ movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
+ movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
+ movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
+.endm
+
+.macro testexpand v, y
+ pxor xmm7, xmm7
+ .ifnes "\v", "nil"
+ mov ecx, \v
+ movdqu xmm0, [ecx]
+ expand xmm0, xmm1, nil, nil, xmm7
+ movdqa [esp + 12], xmm0
+ movdqa [esp + 28], xmm1
+ .endif
+ .ifnes "\y", "nil"
+ mov edx, \y
+ movdqu xmm2, [edx]
+ expand xmm2, xmm3, nil, nil, xmm7
+ movdqa [esp + 44], xmm2
+ movdqa [esp + 60], xmm3
+ .endif
+.endm
+
+.macro testtop u, x, mode
+ .p2align 4
+0:
+ .ifnes "\u", "nil"
+ lea ecx, [esp + 12]
+ .endif
+ mov ebx, \x
+ .ifeqs "\mode", "mont"
+ lea esi, [esp + 44]
+ .endif
+ cysetup esp + 0
+ .ifnes "\u", "nil"
+ mov eax, \u
+ .endif
+ .ifeqs "\mode", "mont"
+ lea edx, [esp + 76]
+ .else
+ lea edx, [esp + 44]
+ .endif
+.endm
+
+.macro testtail cyv, n
+ cystore esp + 0, \cyv, \n
+ jnz 0b
+.endm
+
+.macro testcarryout c
+ mov ecx, \c
+ movdqu [ecx + 0], xmm4
+ movdqu [ecx + 16], xmm5
+ movdqu [ecx + 32], xmm6
+.endm
+
+ .globl test_dmul4
+test_dmul4:
+ testprologue
+ testldcarry [ebp + 24]
+ testexpand [ebp + 36], [ebp + 40]
+ mov edi, [ebp + 20]
+ testtop [ebp + 28], [ebp + 32]
+ call dmul4
+ testtail [ebp + 48], [ebp + 44]
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_dmla4
+test_dmla4:
+ testprologue
+ testldcarry [ebp + 24]
+ testexpand [ebp + 36], [ebp + 40]
+ mov edi, [ebp + 20]
+ testtop [ebp + 28], [ebp + 32]
+ call dmla4
+ testtail [ebp + 48], [ebp + 44]
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_mul4
+test_mul4:
+ testprologue
+ testldcarry [ebp + 24]
+ testexpand nil, [ebp + 32]
+ mov edi, [ebp + 20]
+ testtop nil, [ebp + 28]
+ call mul4
+ testtail [ebp + 40], [ebp + 36]
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_mla4
+test_mla4:
+ testprologue
+ testldcarry [ebp + 24]
+ testexpand nil, [ebp + 32]
+ mov edi, [ebp + 20]
+ testtop nil, [ebp + 28]
+ call mla4
+ testtail [ebp + 40], [ebp + 36]
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_mmul4
+test_mmul4:
+ testprologue
+ testexpand [ebp + 40], [ebp + 44]
+ mov edi, [ebp + 20]
+ testtop [ebp + 32], [ebp + 36], mont
+ call mmul4
+ testtail [ebp + 52], [ebp + 48]
+ mov edi, [ebp + 28]
+ movdqa xmm0, [esp + 76]
+ movdqa xmm1, [esp + 92]
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_mmla4
+test_mmla4:
+ testprologue
+ testexpand [ebp + 40], [ebp + 44]
+ mov edi, [ebp + 20]
+ testtop [ebp + 32], [ebp + 36], mont
+ call mmla4
+ testtail [ebp + 52], [ebp + 48]
+ mov edi, [ebp + 28]
+ movdqa xmm0, [esp + 76]
+ movdqa xmm1, [esp + 92]
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ testcarryout [ebp + 24]
+ testepilogue
+
+ .globl test_mont4
+test_mont4:
+ testprologue
+ testexpand nil, [ebp + 36]
+ mov edi, [ebp + 20]
+ testtop nil, [ebp + 32], mont
+ call mont4
+ testtail [ebp + 44], [ebp + 40]
+ mov edi, [ebp + 28]
+ movdqa xmm0, [esp + 76]
+ movdqa xmm1, [esp + 92]
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ testcarryout [ebp + 24]
+ testepilogue
+
+#endif
+
+///----- That's all, folks --------------------------------------------------