chiark - git - mdw - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
  67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuluqd' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  91 /// zeroed becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1=nil, d2=nil, d3=nil
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
  99         movd    \d0, \r                 // (r_i, 0; 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1; s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3; s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3; ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludq \d2, \d0                // (r_i s'_2; r_i s''_2)
 129     .else
 130         pmuludq \d2, [\s + 16]
 131     .endif
 132   .endif
 133         pmuludq \d0, [\s]               // (r_i s'_0; r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 137         // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
 138         // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
 139         // updating that register.
 140         paddq   \c0, xmm0
 141   .ifnes "\c1", "nil"
 142         paddq   \c1, xmm1
 143   .endif
 144   .ifnes "\c2", "nil"
 145         paddq   \c2, xmm2
 146   .endif
 147   .ifnes "\c3", "nil"
 148         paddq   \c3, xmm3
 149   .endif
 150 .endm
 151
 152 .macro  mulacc  r, s, c0, c1, c2, c3, z3p=nil
 153         // Load a word r_i from R, multiply by the expanded operand [S],
 154         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 155         // then C3 notionally contains zero, but needs clearing; in practice,
 156         // we store the product directly rather than attempting to add.  On
 157         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 158         // is not `t'.
 159   .ifeqs "\z3p", "t"
 160         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 161         accum           \c0,  \c1,  \c2
 162   .else
 163         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 164         accum           \c0,  \c1,  \c2,  \c3
 165   .endif
 166 .endm
 167
 168 .macro  propout d, c, cc=nil
 169         // Calculate an output word from C, and store it in D; propagate
 170         // carries out from C to CC in preparation for a rotation of the
 171         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 172         // `nil', then the contribution which would have been added to it is
 173         // left in C.
 174         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
 175         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t, 0)
 176         pslldq  xmm3, 2                 // (t b; 0)
 177         paddq   \c, xmm3                // (c' + t b; c'')
 178         movd    \d, \c
 179         psrlq   \c, 32                  // floor(c/B)
 180   .ifnes "\cc", "nil"
 181         paddq   \cc, \c                 // propagate up
 182   .endif
 183 .endm
 184
 185 .macro  endprop d, c, t
 186         // On entry, C contains a carry register.  On exit, the low 32 bits
 187         // of the value represented in C are written to D, and the remaining
 188         // bits are left at the bottom of T.
 189         movdqa  \t, \c
 190         psllq   \t, 16                  // (?; c'' b)
 191         pslldq  \c, 8                   // (0; c')
 192         paddq   \t, \c                  // (?; c' + c'' b)
 193         psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
 194         movd    \d, \t
 195         psrldq  \t, 4                   // (floor(c/B); 0)
 196 .endm
 197
 198 .macro  expand  z, a, b, c=nil, d=nil
 199         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 200         // exit, A:B and C:D together hold the same values in expanded
 201         // form.  If C is `nil', then only expand A to A:B.
 202         movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
 203   .ifnes "\c", "nil"
 204         movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
 205   .endif
 206         punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
 207         punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
 208   .ifnes "\c", "nil"
 209         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
 210         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
 211   .endif
 212         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
 213         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
 214   .ifnes "\c", "nil"
 215         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
 216         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
 217   .endif
 218 .endm
 219
 220 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 221         // On entry, C0, C1, C2, C3 are carry registers representing a value
 222         // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
 223         // C3, T, and U are clobbered; and the high bits of Y are stored in
 224         // HI, if this is not `nil'.
 225
 226         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 227         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 228         // them into the 32-bit-aligned pieces above and below.  But before
 229         // we can do that, we must gather them together.
 230         movdqa  \t, \c0
 231         movdqa  \u, \c1
 232         punpcklqdq \t, \c2              // (y'_0; y'_2)
 233         punpckhqdq \c0, \c2             // (y''_0; y''_2)
 234         punpcklqdq \u, \c3              // (y'_1; y'_3)
 235         punpckhqdq \c1, \c3             // (y''_1; y''_3)
 236
 237         // Now split the double-prime pieces.  The high (up to) 48 bits will
 238         // go up; the low 16 bits go down.
 239         movdqa  \c2, \c0
 240         movdqa  \c3, \c1
 241         psllq   \c2, 48
 242         psllq   \c3, 48
 243         psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
 244         psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
 245         psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
 246         psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
 247   .ifnes "\hi", "nil"
 248         movdqa  \hi, \c1
 249   .endif
 250         pslldq  \c1, 8                  // high part of (0; y''_1)
 251
 252         paddq   \t, \c2                 // propagate down
 253         paddq   \u, \c3
 254         paddq   \t, \c1                 // and up: (y_0; y_2)
 255         paddq   \u, \c0                 // (y_1; y_3)
 256   .ifnes "\hi", "nil"
 257         psrldq  \hi, 8                  // high part of (y''_3; 0)
 258   .endif
 259
 260         // Finally extract the answer.  This complicated dance is better than
 261         // storing to memory and loading, because the piecemeal stores
 262         // inhibit store forwarding.
 263         movdqa  \c3, \t                 // (y_0; ?)
 264         movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
 265         psrldq  \t, 8                   // (y_2; 0)
 266         psrlq   \c3, 32                 // (floor(y_0/B); ?)
 267         paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
 268         movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
 269         psrldq  \u, 8                   // (y_3; 0)
 270         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2; ?)
 271         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2; ?)
 272         punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 275   .ifnes "\hi", "nil"
 276         movdqa  \t, \c3
 277         pxor    \u, \u
 278   .endif
 279         punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
 280   .ifnes "\hi", "nil"
 281         psrlq   \t, 32                  // very high bits of y
 282         paddq   \hi, \t
 283         punpcklqdq \hi, \u              // carry up
 284   .endif
 285         punpckldq \lo, \c1              // y mod B^4
 286 .endm
 287
 288 .macro  carryadd
 289         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 290         // hold the incoming carry registers c0, c1, and c2 representing a
 291         // carry-in C.
 292         //
 293         // On exit, the carry registers, including XMM7, are updated to hold
 294         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 295         // registers are preserved.
 296         movd    xmm0, [edi +  0]        // (a_0; 0)
 297         movd    xmm1, [edi +  4]        // (a_1; 0)
 298         movd    xmm2, [edi +  8]        // (a_2; 0)
 299         movd    xmm7, [edi + 12]        // (a_3; 0)
 300
 301         paddq   xmm4, xmm0              // (c'_0 + a_0; c''_0)
 302         paddq   xmm5, xmm1              // (c'_1 + a_1; c''_1)
 303         paddq   xmm6, xmm2              // (c'_2 + a_2; c''_2 + a_3 b)
 304 .endm
 305
 306 ///--------------------------------------------------------------------------
 307 /// Primitive multipliers and related utilities.
 308
 309 INTFUNC(carryprop)
 310         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 311         // form.  Store the low 128 bits of the represented carry to [EDI] as
 312         // a packed 128-bit value, and leave the remaining 16 bits in the low
 313         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
 314   endprologue
 315
 316         propout [edi +  0], xmm4, xmm5
 317         propout [edi +  4], xmm5, xmm6
 318         propout [edi +  8], xmm6, nil
 319         endprop [edi + 12], xmm6, xmm4
 320         ret
 321
 322 ENDFUNC
 323
 324 INTFUNC(dmul4)
 325         // On entry, EDI points to the destination buffer; EAX and EBX point
 326         // to the packed operands U and X; ECX and EDX point to the expanded
 327         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 328         // registers c0, c1, and c2; c3 is assumed to be zero.
 329         //
 330         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 331         // [EDI], and update the carry registers with the carry out.  The
 332         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 333         // general-purpose registers are preserved.
 334   endprologue
 335
 336         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 337         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 338         propout [edi +  0],      xmm4, xmm5
 339
 340         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 341         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 342         propout [edi +  4],      xmm5, xmm6
 343
 344         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 345         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 346         propout [edi +  8],      xmm6, xmm7
 347
 348         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 349         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 350         propout [edi + 12],      xmm7, xmm4
 351
 352         ret
 353
 354 ENDFUNC
 355
 356 INTFUNC(dmla4)
 357         // On entry, EDI points to the destination buffer, which also
 358         // contains an addend A to accumulate; EAX and EBX point to the
 359         // packed operands U and X; ECX and EDX point to the expanded
 360         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 361         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 362         // to be zero.
 363         //
 364         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 365         // [EDI], and update the carry registers with the carry out.  The
 366         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 367         // general-purpose registers are preserved.
 368   endprologue
 369
 370         carryadd
 371
 372         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 373         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 374         propout [edi +  0],      xmm4, xmm5
 375
 376         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 377         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 378         propout [edi +  4],      xmm5, xmm6
 379
 380         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 381         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 382         propout [edi +  8],      xmm6, xmm7
 383
 384         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 385         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 386         propout [edi + 12],      xmm7, xmm4
 387
 388         ret
 389
 390 ENDFUNC
 391
 392 INTFUNC(mul4zc)
 393         // On entry, EDI points to the destination buffer; EBX points to a
 394         // packed operand X; and EDX points to an expanded operand Y.
 395         //
 396         // On exit, we write the low 128 bits of the product X Y to [EDI],
 397         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 398         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 399         // general-purpose registers are preserved.
 400   endprologue
 401
 402         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 403         propout [edi +  0],      xmm4, xmm5
 404
 405         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 406         propout [edi +  4],      xmm5, xmm6
 407
 408         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 409         propout [edi +  8],      xmm6, xmm7
 410
 411         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 412         propout [edi + 12],      xmm7, xmm4
 413
 414         ret
 415
 416 ENDFUNC
 417
 418 INTFUNC(mul4)
 419         // On entry, EDI points to the destination buffer; EBX points to a
 420         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 421         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 422         // representing a carry-in C; c3 is assumed to be zero.
 423         //
 424         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 425         // and update the carry registers with the carry out.  The registers
 426         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 427         // general-purpose registers are preserved.
 428   endprologue
 429
 430         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 431         propout [edi +  0],      xmm4, xmm5
 432
 433         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 434         propout [edi +  4],      xmm5, xmm6
 435
 436         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 437         propout [edi +  8],      xmm6, xmm7
 438
 439         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 440         propout [edi + 12],      xmm7, xmm4
 441
 442         ret
 443
 444 ENDFUNC
 445
 446 INTFUNC(mla4zc)
 447         // On entry, EDI points to the destination buffer, which also
 448         // contains an addend A to accumulate; EBX points to a packed operand
 449         // X; and EDX points to an expanded operand Y.
 450         //
 451         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 452         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 453         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 454         // general-purpose registers are preserved.
 455   endprologue
 456
 457         movd    xmm4, [edi +  0]
 458         movd    xmm5, [edi +  4]
 459         movd    xmm6, [edi +  8]
 460         movd    xmm7, [edi + 12]
 461
 462         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 463         propout [edi +  0],      xmm4, xmm5
 464
 465         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 466         propout [edi +  4],      xmm5, xmm6
 467
 468         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 469         propout [edi +  8],      xmm6, xmm7
 470
 471         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 472         propout [edi + 12],      xmm7, xmm4
 473
 474         ret
 475
 476 ENDFUNC
 477
 478 INTFUNC(mla4)
 479         // On entry, EDI points to the destination buffer, which also
 480         // contains an addend A to accumulate; EBX points to a packed operand
 481         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 482         // the incoming carry registers c0, c1, and c2, representing a
 483         // carry-in C; c3 is assumed to be zero.
 484         //
 485         // On exit, we write the low 128 bits of the sum A + C + X Y to
 486         // [EDI], and update the carry registers with the carry out.  The
 487         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 488         // general-purpose registers are preserved.
 489   endprologue
 490
 491         carryadd
 492
 493         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 494         propout [edi +  0],      xmm4, xmm5
 495
 496         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 497         propout [edi +  4],      xmm5, xmm6
 498
 499         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 500         propout [edi +  8],      xmm6, xmm7
 501
 502         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 503         propout [edi + 12],      xmm7, xmm4
 504
 505         ret
 506
 507 ENDFUNC
 508
 509 INTFUNC(mmul4)
 510         // On entry, EDI points to the destination buffer; EAX and EBX point
 511         // to the packed operands U and N; ECX and ESI point to the expanded
 512         // operands V and M; and EDX points to a place to store an expanded
 513         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 514         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 515         //
 516         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 517         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 518         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 519         // XMM7 are clobbered; the general-purpose registers are preserved.
 520         stalloc 48 + 12                 // space for the carries
 521   endprologue
 522
 523         // Calculate W = U V, and leave it in the destination.  Stash the
 524         // carry pieces for later.
 525         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 526         propout [edi +  0],      xmm4, xmm5
 527         jmp     5f
 528
 529 ENDFUNC
 530
 531 INTFUNC(mmla4)
 532         // On entry, EDI points to the destination buffer, which also
 533         // contains an addend A to accumulate; EAX and EBX point to the
 534         // packed operands U and N; ECX and ESI point to the expanded
 535         // operands V and M; and EDX points to a place to store an expanded
 536         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 537         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 538         //
 539         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 540         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 541         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 542         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 543         // preserved.
 544         stalloc 48 + 12                 // space for the carries
 545   endprologue
 546
 547         movd    xmm4, [edi +  0]
 548         movd    xmm5, [edi +  4]
 549         movd    xmm6, [edi +  8]
 550         movd    xmm7, [edi + 12]
 551
 552         // Calculate W = U V, and leave it in the destination.  Stash the
 553         // carry pieces for later.
 554         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 555         propout [edi +  0],      xmm4, xmm5
 556
 557 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 558         propout [edi +  4],      xmm5, xmm6
 559
 560         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 561         propout [edi +  8],      xmm6, xmm7
 562
 563         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 564         propout [edi + 12],      xmm7, xmm4
 565
 566         movdqa  [esp +  0], xmm4
 567         movdqa  [esp + 16], xmm5
 568         movdqa  [esp + 32], xmm6
 569
 570         // Calculate Y = W M.
 571         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 572
 573         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 574         accum                    xmm5, xmm6, xmm7
 575
 576         mulcore [edi +  8], esi, xmm0, xmm1
 577         accum                    xmm6, xmm7
 578
 579         mulcore [edi + 12], esi, xmm0
 580         accum                    xmm7
 581
 582         // That's lots of pieces.  Now we have to assemble the answer.
 583         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 584
 585         // Expand it.
 586         pxor    xmm2, xmm2
 587         expand  xmm2, xmm4, xmm1
 588         movdqa  [edx +  0], xmm4
 589         movdqa  [edx + 16], xmm1
 590
 591         // Initialize the carry from the value for W we calculated earlier.
 592         movd    xmm4, [edi +  0]
 593         movd    xmm5, [edi +  4]
 594         movd    xmm6, [edi +  8]
 595         movd    xmm7, [edi + 12]
 596
 597         // Finish the calculation by adding the Montgomery product.
 598         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 599         propout [edi +  0],      xmm4, xmm5
 600
 601         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 602         propout [edi +  4],      xmm5, xmm6
 603
 604         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 605         propout [edi +  8],      xmm6, xmm7
 606
 607         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 608         propout [edi + 12],      xmm7, xmm4
 609
 610         // Add add on the carry we calculated earlier.
 611         paddq   xmm4, [esp +  0]
 612         paddq   xmm5, [esp + 16]
 613         paddq   xmm6, [esp + 32]
 614
 615         // And, with that, we're done.
 616         stfree  48 + 12
 617         ret
 618
 619 ENDFUNC
 620
 621 INTFUNC(mont4)
 622         // On entry, EDI points to the destination buffer holding a packed
 623         // value W; EBX points to a packed operand N; ESI points to an
 624         // expanded operand M; and EDX points to a place to store an expanded
 625         // result Y (32 bytes, at a 16-byte boundary).
 626         //
 627         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 628         // of the sum W + N Y to [EDI], leaving the remaining carry in
 629         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 630         // XMM7 are clobbered; the general-purpose registers are preserved.
 631   endprologue
 632
 633         // Calculate Y = W M.
 634         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 635
 636         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 637         accum                    xmm5, xmm6, xmm7
 638
 639         mulcore [edi +  8], esi, xmm0, xmm1
 640         accum                    xmm6, xmm7
 641
 642         mulcore [edi + 12], esi, xmm0
 643         accum                    xmm7
 644
 645         // That's lots of pieces.  Now we have to assemble the answer.
 646         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 647
 648         // Expand it.
 649         pxor    xmm2, xmm2
 650         expand  xmm2, xmm4, xmm1
 651         movdqa  [edx +  0], xmm4
 652         movdqa  [edx + 16], xmm1
 653
 654         // Initialize the carry from W.
 655         movd    xmm4, [edi +  0]
 656         movd    xmm5, [edi +  4]
 657         movd    xmm6, [edi +  8]
 658         movd    xmm7, [edi + 12]
 659
 660         // Finish the calculation by adding the Montgomery product.
 661         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 662         propout [edi +  0],      xmm4, xmm5
 663
 664         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 665         propout [edi +  4],      xmm5, xmm6
 666
 667         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 668         propout [edi +  8],      xmm6, xmm7
 669
 670         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 671         propout [edi + 12],      xmm7, xmm4
 672
 673         // And, with that, we're done.
 674         ret
 675
 676 ENDFUNC
 677
 678 ///--------------------------------------------------------------------------
 679 /// Bulk multipliers.
 680
 681 FUNC(mpx_umul4_x86_avx)
 682         .arch   .avx
 683         vzeroupper
 684   endprologue
 685         // and drop through...
 686         .arch   pentium4
 687 ENDFUNC
 688
 689 FUNC(mpx_umul4_x86_sse2)
 690         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 691         //                         const mpw *bv, const mpw *bvl);
 692
 693         // Build a stack frame.  Arguments will be relative to EBP, as
 694         // follows.
 695         //
 696         //      ebp + 20        dv
 697         //      ebp + 24        av
 698         //      ebp + 28        avl
 699         //      ebp + 32        bv
 700         //      ebp + 36        bvl
 701         //
 702         // Locals are relative to ESP, as follows.
 703         //
 704         //      esp +  0        expanded Y (32 bytes)
 705         //      esp + 32        (top of locals)
 706         pushreg ebp
 707         pushreg ebx
 708         pushreg esi
 709         pushreg edi
 710         setfp   ebp
 711         and     esp, ~15
 712         sub     esp, 32
 713   endprologue
 714
 715         // Prepare for the first iteration.
 716         mov     esi, [ebp + 32]         // -> bv[0]
 717         pxor    xmm7, xmm7
 718         movdqu  xmm0, [esi]             // bv[0]
 719         mov     edi, [ebp + 20]         // -> dv[0]
 720         mov     ecx, edi                // outer loop dv cursor
 721         expand  xmm7, xmm0, xmm1
 722         mov     ebx, [ebp + 24]         // -> av[0]
 723         mov     eax, [ebp + 28]         // -> av[m] = av limit
 724         mov     edx, esp                // -> expanded Y = bv[0]
 725         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 726         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 727         call    mul4zc
 728         add     ebx, 16
 729         add     edi, 16
 730         add     ecx, 16
 731         add     esi, 16
 732         cmp     ebx, eax                // all done?
 733         jae     8f
 734
 735         .p2align 4
 736         // Continue with the first iteration.
 737 0:      call    mul4
 738         add     ebx, 16
 739         add     edi, 16
 740         cmp     ebx, eax                // all done?
 741         jb      0b
 742
 743         // Write out the leftover carry.  There can be no tail here.
 744 8:      call    carryprop
 745         cmp     esi, [ebp + 36]         // more passes to do?
 746         jae     9f
 747
 748         .p2align 4
 749         // Set up for the next pass.
 750 1:      movdqu  xmm0, [esi]             // bv[i]
 751         mov     edi, ecx                // -> dv[i]
 752         pxor    xmm7, xmm7
 753         expand  xmm7, xmm0, xmm1
 754         mov     ebx, [ebp + 24]         // -> av[0]
 755         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 756         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 757         call    mla4zc
 758         add     edi, 16
 759         add     ebx, 16
 760         add     ecx, 16
 761         add     esi, 16
 762         cmp     ebx, eax                // done yet?
 763         jae     8f
 764
 765         .p2align 4
 766         // Continue...
 767 0:      call    mla4
 768         add     ebx, 16
 769         add     edi, 16
 770         cmp     ebx, eax
 771         jb      0b
 772
 773         // Finish off this pass.  There was no tail on the previous pass, and
 774         // there can be none on this pass.
 775 8:      call    carryprop
 776         cmp     esi, [ebp + 36]
 777         jb      1b
 778
 779         // All over.
 780 9:      dropfp
 781         pop     edi
 782         pop     esi
 783         pop     ebx
 784         pop     ebp
 785         ret
 786
 787 ENDFUNC
 788
 789 FUNC(mpxmont_mul4_x86_avx)
 790         .arch   .avx
 791         vzeroupper
 792   endprologue
 793         // and drop through...
 794         .arch   pentium4
 795 ENDFUNC
 796
 797 FUNC(mpxmont_mul4_x86_sse2)
 798         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 799         //                           const mpw *nv, size_t n, const mpw *mi);
 800
 801         // Build a stack frame.  Arguments will be relative to EBP, as
 802         // follows.
 803         //
 804         //      ebp + 20        dv
 805         //      ebp + 24        av
 806         //      ebp + 28        bv
 807         //      ebp + 32        nv
 808         //      ebp + 36        n (nonzero multiple of 4)
 809         //      ebp + 40        mi
 810         //
 811         // Locals are relative to ESP, which 16-byte aligned, as follows.
 812         //
 813         //      esp +   0       expanded V (32 bytes)
 814         //      esp +  32       expanded M (32 bytes)
 815         //      esp +  64       expanded Y (32 bytes)
 816         //      esp +  96       outer loop dv
 817         //      esp + 100       outer loop bv
 818         //      esp + 104       av limit (mostly in ESI)
 819         //      esp + 108       bv limit
 820         //      esp + 112       (top of locals)
 821         pushreg ebp
 822         pushreg ebx
 823         pushreg esi
 824         pushreg edi
 825         setfp   ebp
 826         and     esp, ~15
 827         sub     esp, 112
 828   endprologue
 829
 830         // Establish the expanded operands.
 831         pxor    xmm7, xmm7
 832         mov     ecx, [ebp + 28]         // -> bv
 833         mov     edx, [ebp + 40]         // -> mi
 834         movdqu  xmm0, [ecx]             // bv[0]
 835         movdqu  xmm2, [edx]             // mi
 836         expand  xmm7, xmm0, xmm1, xmm2, xmm3
 837         movdqa  [esp +  0], xmm0        // bv[0] expanded low
 838         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 839         movdqa  [esp + 32], xmm2        // mi expanded low
 840         movdqa  [esp + 48], xmm3        // mi expanded high
 841
 842         // Set up the outer loop state and prepare for the first iteration.
 843         mov     edx, [ebp + 36]         // n
 844         mov     eax, [ebp + 24]         // -> U = av[0]
 845         mov     ebx, [ebp + 32]         // -> X = nv[0]
 846         mov     edi, [ebp + 20]         // -> Z = dv[0]
 847         mov     [esp + 100], ecx
 848         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 849         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 850         mov     [esp + 96], edi
 851         mov     [esp + 104], edx
 852         mov     [esp + 108], ecx
 853         lea     ecx, [esp + 0]          // -> expanded V = bv[0]
 854         lea     esi, [esp + 32]         // -> expanded M = mi
 855         lea     edx, [esp + 64]         // -> space for Y
 856         call    mmul4
 857         mov     esi, [esp + 104]        // recover av limit
 858         add     edi, 16
 859         add     eax, 16
 860         add     ebx, 16
 861         cmp     eax, esi                // done already?
 862         jae     8f
 863         mov     [esp + 96], edi
 864
 865         .p2align 4
 866         // Complete the first inner loop.
 867 0:      call    dmul4
 868         add     edi, 16
 869         add     eax, 16
 870         add     ebx, 16
 871         cmp     eax, esi                // done yet?
 872         jb      0b
 873
 874         // Still have carries left to propagate.
 875         call    carryprop
 876         movd    [edi + 16], xmm4
 877
 878         .p2align 4
 879         // Embark on the next iteration.  (There must be one.  If n = 1, then
 880         // we would have bailed above, to label 8.  Similarly, the subsequent
 881         // iterations can fall into the inner loop immediately.)
 882 1:      mov     eax, [esp + 100]        // -> bv[i - 1]
 883         mov     edi, [esp + 96]         // -> Z = dv[i]
 884         add     eax, 16                 // -> bv[i]
 885         pxor    xmm7, xmm7
 886         mov     [esp + 100], eax
 887         cmp     eax, [esp + 108]        // done yet?
 888         jae     9f
 889         movdqu  xmm0, [eax]             // bv[i]
 890         mov     ebx, [ebp + 32]         // -> X = nv[0]
 891         lea     esi, [esp + 32]         // -> expanded M = mi
 892         mov     eax, [ebp + 24]         // -> U = av[0]
 893         expand  xmm7, xmm0, xmm1
 894         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 895         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 896         call    mmla4
 897         mov     esi, [esp + 104]        // recover av limit
 898         add     edi, 16
 899         add     eax, 16
 900         add     ebx, 16
 901         mov     [esp + 96], edi
 902
 903         .p2align 4
 904         // Complete the next inner loop.
 905 0:      call    dmla4
 906         add     edi, 16
 907         add     eax, 16
 908         add     ebx, 16
 909         cmp     eax, esi
 910         jb      0b
 911
 912         // Still have carries left to propagate, and they overlap the
 913         // previous iteration's final tail, so read that in and add it.
 914         movd    xmm0, [edi]
 915         paddq   xmm4, xmm0
 916         call    carryprop
 917         movd    [edi + 16], xmm4
 918
 919         // Back again.
 920         jmp     1b
 921
 922         // First iteration was short.  Write out the carries and we're done.
 923         // (This could be folded into the main loop structure, but that would
 924         // penalize small numbers more.)
 925 8:      call    carryprop
 926         movd    [edi + 16], xmm4
 927
 928         // All done.
 929 9:      dropfp
 930         popreg  edi
 931         popreg  esi
 932         popreg  ebx
 933         popreg  ebp
 934         ret
 935
 936 ENDFUNC
 937
 938 FUNC(mpxmont_redc4_x86_avx)
 939         .arch   .avx
 940         vzeroupper
 941   endprologue
 942         // and drop through...
 943         .arch   pentium4
 944 ENDFUNC
 945
 946 FUNC(mpxmont_redc4_x86_sse2)
 947         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 948         //                             size_t n, const mpw *mi);
 949
 950         // Build a stack frame.  Arguments will be relative to EBP, as
 951         // follows.
 952         //
 953         //      ebp + 20        dv
 954         //      ebp + 24        dvl
 955         //      ebp + 28        nv
 956         //      ebp + 32        n (nonzero multiple of 4)
 957         //      ebp + 36        mi
 958         //
 959         // Locals are relative to ESP, as follows.
 960         //
 961         //      esp +  0        outer loop dv
 962         //      esp +  4        outer dv limit
 963         //      esp +  8        blocks-of-4 dv limit
 964         //      esp + 12        expanded M (32 bytes)
 965         //      esp + 44        expanded Y (32 bytes)
 966         //      esp + 76        (top of locals)
 967         pushreg ebp
 968         pushreg ebx
 969         pushreg esi
 970         pushreg edi
 971         setfp   ebp
 972         and     esp, ~15
 973         sub     esp, 76
 974   endprologue
 975
 976         // Establish the expanded operands and the blocks-of-4 dv limit.
 977         mov     edi, [ebp + 20]         // -> Z = dv[0]
 978         pxor    xmm7, xmm7
 979         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 980         sub     eax, edi                // length of dv in bytes
 981         mov     edx, [ebp + 36]         // -> mi
 982         movdqu  xmm0, [edx]             // mi
 983         and     eax, ~15                // mask off the tail end
 984         expand  xmm7, xmm0, xmm1
 985         add     eax, edi                // find limit
 986         movdqa  [esp + 12], xmm0        // mi expanded low
 987         movdqa  [esp + 28], xmm1        // mi expanded high
 988         mov     [esp + 8], eax
 989
 990         // Set up the outer loop state and prepare for the first iteration.
 991         mov     ecx, [ebp + 32]         // n
 992         mov     ebx, [ebp + 28]         // -> X = nv[0]
 993         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
 994         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
 995         mov     [esp + 0], edi
 996         mov     [esp + 4], edx
 997         lea     esi, [esp + 12]         // -> expanded M = mi
 998         lea     edx, [esp + 44]         // -> space for Y
 999         call    mont4
1000         add     ebx, 16
1001         add     edi, 16
1002         cmp     ebx, ecx                // done already?
1003         jae     8f
1004
1005         .p2align 4
1006         // Complete the first inner loop.
1007 5:      call    mla4
1008         add     ebx, 16
1009         add     edi, 16
1010         cmp     ebx, ecx                // done yet?
1011         jb      5b
1012
1013         // Still have carries left to propagate.
1014 8:      carryadd
1015         mov     esi, [esp + 8]          // -> dv blocks limit
1016         mov     edx, [ebp + 24]         // dv limit
1017         psllq   xmm7, 16
1018         pslldq  xmm7, 8
1019         paddq   xmm6, xmm7
1020         call    carryprop
1021         movd    eax, xmm4
1022         add     edi, 16
1023         cmp     edi, esi
1024         jae     7f
1025
1026         .p2align 4
1027         // Continue carry propagation until the end of the buffer.
1028 0:      add     [edi], eax
1029         mov     eax, 0                  // preserves flags
1030         adcd    [edi + 4], 0
1031         adcd    [edi + 8], 0
1032         adcd    [edi + 12], 0
1033         adc     eax, 0
1034         add     edi, 16
1035         cmp     edi, esi
1036         jb      0b
1037
1038         // Deal with the tail end.
1039 7:      add     [edi], eax
1040         mov     eax, 0                  // preserves flags
1041         add     edi, 4
1042         adc     eax, 0
1043         cmp     edi, edx
1044         jb      7b
1045
1046         // All done for this iteration.  Start the next.  (This must have at
1047         // least one follow-on iteration, or we'd not have started this outer
1048         // loop.)
1049 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1050         mov     ebx, [ebp + 28]         // -> X = nv[0]
1051         lea     edx, [esp + 44]         // -> space for Y
1052         lea     esi, [esp + 12]         // -> expanded M = mi
1053         add     edi, 16                 // -> Z = dv[i]
1054         cmp     edi, [esp + 4]          // all done yet?
1055         jae     9f
1056         mov     [esp + 0], edi
1057         call    mont4
1058         add     edi, 16
1059         add     ebx, 16
1060         jmp     5b
1061
1062         // All over.
1063 9:      dropfp
1064         popreg  edi
1065         popreg  esi
1066         popreg  ebx
1067         popreg  ebp
1068         ret
1069
1070 ENDFUNC
1071
1072 ///--------------------------------------------------------------------------
1073 /// Testing and performance measurement.
1074
1075 #ifdef TEST_MUL4
1076
1077 .macro  cysetup c
1078         rdtsc
1079         mov     [\c], eax
1080         mov     [\c + 4], edx
1081 .endm
1082
1083 .macro  cystore c, v, n
1084         rdtsc
1085         sub     eax, [\c]
1086         sbb     edx, [\c + 4]
1087         mov     ebx, [\v]
1088         mov     ecx, [\n]
1089         dec     ecx
1090         mov     [\n], ecx
1091         mov     [ebx + ecx*8], eax
1092         mov     [ebx + ecx*8 + 4], edx
1093 .endm
1094
1095 .macro  testprologue n
1096         pushreg ebp
1097         pushreg ebx
1098         pushreg esi
1099         pushreg edi
1100         setfp   ebp
1101         and     esp, ~15
1102         sub     esp, 3*32 + 4*4
1103   endprologue
1104         mov     eax, \n
1105         mov     [esp + 104], eax
1106         // vars:
1107         //      esp +   0 = v expanded
1108         //      esp +  32 = y expanded
1109         //      esp +  64 = ? expanded
1110         //      esp +  96 = cycles
1111         //      esp + 104 = count
1112 .endm
1113
1114 .macro  testepilogue
1115         dropfp
1116         popreg  edi
1117         popreg  esi
1118         popreg  ebx
1119         popreg  ebp
1120         ret
1121 .endm
1122
1123 .macro  testldcarry c
1124         mov     ecx, \c                 // -> c
1125         movdqu  xmm4, [ecx +  0]        // (c'_0; c''_0)
1126         movdqu  xmm5, [ecx + 16]        // (c'_1; c''_1)
1127         movdqu  xmm6, [ecx + 32]        // (c'_2; c''_2)
1128 .endm
1129
1130 .macro  testexpand v=nil, y=nil
1131         pxor    xmm7, xmm7
1132   .ifnes "\v", "nil"
1133         mov     ecx, \v
1134         movdqu  xmm0, [ecx]
1135         expand  xmm7, xmm0, xmm1
1136         movdqa  [esp +  0], xmm0
1137         movdqa  [esp + 16], xmm1
1138   .endif
1139   .ifnes "\y", "nil"
1140         mov     edx, \y
1141         movdqu  xmm2, [edx]
1142         expand  xmm7, xmm2, xmm3
1143         movdqa  [esp + 32], xmm2
1144         movdqa  [esp + 48], xmm3
1145   .endif
1146 .endm
1147
1148 .macro  testtop u=nil, x=nil, mode=nil
1149         .p2align 4
1150 0:
1151   .ifnes "\u", "nil"
1152         lea     ecx, [esp + 0]
1153   .endif
1154         mov     ebx, \x
1155   .ifeqs "\mode", "mont"
1156         lea     esi, [esp + 32]
1157   .endif
1158         cysetup esp + 96
1159   .ifnes "\u", "nil"
1160         mov     eax, \u
1161   .endif
1162   .ifeqs "\mode", "mont"
1163         lea     edx, [esp + 64]
1164   .else
1165         lea     edx, [esp + 32]
1166   .endif
1167 .endm
1168
1169 .macro  testtail cyv
1170         cystore esp + 96, \cyv, esp + 104
1171         jnz     0b
1172 .endm
1173
1174 .macro  testcarryout c
1175         mov     ecx, \c
1176         movdqu  [ecx +  0], xmm4
1177         movdqu  [ecx + 16], xmm5
1178         movdqu  [ecx + 32], xmm6
1179 .endm
1180
1181 FUNC(test_dmul4)
1182         testprologue [ebp + 44]
1183         testldcarry [ebp + 24]
1184         testexpand [ebp + 36], [ebp + 40]
1185         mov     edi, [ebp + 20]
1186         testtop [ebp + 28], [ebp + 32]
1187         call    dmul4
1188         testtail [ebp + 48]
1189         testcarryout [ebp + 24]
1190         testepilogue
1191 ENDFUNC
1192
1193 FUNC(test_dmla4)
1194         testprologue [ebp + 44]
1195         testldcarry [ebp + 24]
1196         testexpand [ebp + 36], [ebp + 40]
1197         mov     edi, [ebp + 20]
1198         testtop [ebp + 28], [ebp + 32]
1199         call    dmla4
1200         testtail [ebp + 48]
1201         testcarryout [ebp + 24]
1202         testepilogue
1203 ENDFUNC
1204
1205 FUNC(test_mul4)
1206         testprologue [ebp + 36]
1207         testldcarry [ebp + 24]
1208         testexpand nil, [ebp + 32]
1209         mov     edi, [ebp + 20]
1210         testtop nil, [ebp + 28]
1211         call    mul4
1212         testtail [ebp + 40]
1213         testcarryout [ebp + 24]
1214         testepilogue
1215 ENDFUNC
1216
1217 FUNC(test_mla4)
1218         testprologue [ebp + 36]
1219         testldcarry [ebp + 24]
1220         testexpand nil, [ebp + 32]
1221         mov     edi, [ebp + 20]
1222         testtop nil, [ebp + 28]
1223         call    mla4
1224         testtail [ebp + 40]
1225         testcarryout [ebp + 24]
1226         testepilogue
1227 ENDFUNC
1228
1229 FUNC(test_mmul4)
1230         testprologue [ebp + 48]
1231         testexpand [ebp + 40], [ebp + 44]
1232         mov     edi, [ebp + 20]
1233         testtop [ebp + 32], [ebp + 36], mont
1234         call    mmul4
1235         testtail [ebp + 52]
1236         mov     edi, [ebp + 28]
1237         movdqa  xmm0, [esp + 64]
1238         movdqa  xmm1, [esp + 80]
1239         movdqu  [edi], xmm0
1240         movdqu  [edi + 16], xmm1
1241         testcarryout [ebp + 24]
1242         testepilogue
1243 ENDFUNC
1244
1245 FUNC(test_mmla4)
1246         testprologue [ebp + 48]
1247         testexpand [ebp + 40], [ebp + 44]
1248         mov     edi, [ebp + 20]
1249         testtop [ebp + 32], [ebp + 36], mont
1250         call    mmla4
1251         testtail [ebp + 52]
1252         mov     edi, [ebp + 28]
1253         movdqa  xmm0, [esp + 64]
1254         movdqa  xmm1, [esp + 80]
1255         movdqu  [edi], xmm0
1256         movdqu  [edi + 16], xmm1
1257         testcarryout [ebp + 24]
1258         testepilogue
1259 ENDFUNC
1260
1261 FUNC(test_mont4)
1262         testprologue [ebp + 40]
1263         testexpand nil, [ebp + 36]
1264         mov     edi, [ebp + 20]
1265         testtop nil, [ebp + 32], mont
1266         call    mont4
1267         testtail [ebp + 44]
1268         mov     edi, [ebp + 28]
1269         movdqa  xmm0, [esp + 64]
1270         movdqa  xmm1, [esp + 80]
1271         movdqu  [edi], xmm0
1272         movdqu  [edi + 16], xmm1
1273         testcarryout [ebp + 24]
1274         testepilogue
1275 ENDFUNC
1276
1277 #endif
1278
1279 ///----- That's all, folks --------------------------------------------------