chiark - git - mdw - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
  67 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  91 /// zeroed becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1, d2, d3
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludqd \d2, [\s + 16]
 131     .endif
 132   .endif
 133         pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1, c2, c3
 137         paddq   \c0, xmm0
 138   .ifnes "\c1", "nil"
 139         paddq   \c1, xmm1
 140   .endif
 141   .ifnes "\c2", "nil"
 142         paddq   \c2, xmm2
 143   .endif
 144   .ifnes "\c3", "nil"
 145         paddq   \c3, xmm3
 146   .endif
 147 .endm
 148
 149 .macro  mulacc  r, s, c0, c1, c2, c3, z3p
 150         // Load a word r_i from R, multiply by the expanded operand [S],
 151         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 152         // then C3 notionally contains zero, but needs clearing; in practice,
 153         // we store the product directly rather than attempting to add.  On
 154         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 155         // is not `t'.
 156   .ifeqs "\z3p", "t"
 157         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 158         accum           \c0,  \c1,  \c2,  nil
 159   .else
 160         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 161         accum           \c0,  \c1,  \c2,  \c3
 162   .endif
 163 .endm
 164
 165 .macro  propout d, c, cc
 166         // Calculate an output word from C, and store it in D; propagate
 167         // carries out from C to CC in preparation for a rotation of the
 168         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 169         // `nil', then the contribution which would have been added to it is
 170         // left in C.
 171         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 172         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 173         pslldq  xmm3, 2                 // (t b, 0)
 174         paddq   \c, xmm3                // (c' + t b, c'')
 175         movd    \d, \c
 176         psrlq   \c, 32                  // floor(c/B)
 177   .ifnes "\cc", "nil"
 178         paddq   \cc, \c                 // propagate up
 179   .endif
 180 .endm
 181
 182 .macro  endprop d, c, t
 183         // On entry, C contains a carry register.  On exit, the low 32 bits
 184         // of the value represented in C are written to D, and the remaining
 185         // bits are left at the bottom of T.
 186         movdqa  \t, \c
 187         psllq   \t, 16                  // (?, c'' b)
 188         pslldq  \c, 8                   // (0, c')
 189         paddq   \t, \c                  // (?, c' + c'' b)
 190         psrldq  \t, 8                   // c' + c'' b
 191         movd    \d, \t
 192         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 193 .endm
 194
 195 .macro  expand  a, b, c, d, z
 196         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 197         // exit, A:B and C:D together hold the same values in expanded
 198         // form.  If C is `nil', then only expand A to A:B.
 199         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 200   .ifnes "\c", "nil"
 201         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 202   .endif
 203         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 204         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 205   .ifnes "\c", "nil"
 206         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 207         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 208   .endif
 209         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 210         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 211   .ifnes "\c", "nil"
 212         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 213         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 214   .endif
 215 .endm
 216
 217 .macro  squash  c0, c1, c2, c3, h, t, u
 218         // On entry, C0, C1, C2, C3 are carry registers representing a value
 219         // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
 220         // C3, T, and U are clobbered; and the high bits of Y are stored in
 221         // H, if this is not `nil'.
 222
 223         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 224         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 225         // them into the 32-bit-aligned pieces above and below.  But before
 226         // we can do that, we must gather them together.
 227         movdqa  \t, \c0
 228         movdqa  \u, \c1
 229         punpcklqdq \t, \c2              // (y'_0, y'_2)
 230         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 231         punpcklqdq \u, \c3              // (y'_1, y'_3)
 232         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 233
 234         // Now split the double-prime pieces.  The high (up to) 48 bits will
 235         // go up; the low 16 bits go down.
 236         movdqa  \c2, \c0
 237         movdqa  \c3, \c1
 238         psllq   \c2, 48
 239         psllq   \c3, 48
 240         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 241         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 242         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 243         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 244   .ifnes "\h", "nil"
 245         movdqa  \h, \c1
 246   .endif
 247         pslldq  \c1, 8                  // high part of (0, y''_1)
 248
 249         paddq   \t, \c2                 // propagate down
 250         paddq   \u, \c3
 251         paddq   \t, \c1                 // and up: (y_0, y_2)
 252         paddq   \u, \c0                 // (y_1, y_3)
 253   .ifnes "\h", "nil"
 254         psrldq  \h, 8                   // high part of (y''_3, 0)
 255   .endif
 256
 257         // Finally extract the answer.  This complicated dance is better than
 258         // storing to memory and loading, because the piecemeal stores
 259         // inhibit store forwarding.
 260         movdqa  \c3, \t                 // (y_0, y_1)
 261         movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
 262         psrldq  \t, 8                   // (y_2, 0)
 263         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 264         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 265         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 266         psrldq  \u, 8                   // (y_3, 0)
 267         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
 268         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
 269         punpckldq \c0, \c3              // (y^*_0, y^*_2, ?, ?)
 270         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 271         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 272   .ifnes "\h", "nil"
 273         movdqa  \t, \c3
 274         pxor    \u, \u
 275   .endif
 276         punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
 277   .ifnes "\h", "nil"
 278         psrlq   \t, 32                  // very high bits of y
 279         paddq   \h, \t
 280         punpcklqdq \h, \u               // carry up
 281   .endif
 282         punpckldq \c0, \c1              // y mod B^4
 283 .endm
 284
 285 .macro  carryadd
 286         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 287         // hold the incoming carry registers c0, c1, and c2 representing a
 288         // carry-in C.
 289         //
 290         // On exit, the carry registers, including XMM7, are updated to hold
 291         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 292         // registers are preserved.
 293         movd    xmm0, [edi +  0]        // (a_0, 0)
 294         movd    xmm1, [edi +  4]        // (a_1, 0)
 295         movd    xmm2, [edi +  8]        // (a_2, 0)
 296         movd    xmm7, [edi + 12]        // (a_3, 0)
 297
 298         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
 299         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
 300         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
 301 .endm
 302
 303 ///--------------------------------------------------------------------------
 304 /// Primitive multipliers and related utilities.
 305
 306 INTFUNC(carryprop)
 307         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 308         // form.  Store the low 128 bits of the represented carry to [EDI] as
 309         // a packed 128-bit value, and leave the remaining 16 bits in the low
 310         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
 311   endprologue
 312
 313         propout [edi +  0], xmm4, xmm5
 314         propout [edi +  4], xmm5, xmm6
 315         propout [edi +  8], xmm6, nil
 316         endprop [edi + 12], xmm6, xmm4
 317         ret
 318
 319 ENDFUNC
 320
 321 INTFUNC(dmul4)
 322         // On entry, EDI points to the destination buffer; EAX and EBX point
 323         // to the packed operands U and X; ECX and EDX point to the expanded
 324         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 325         // registers c0, c1, and c2; c3 is assumed to be zero.
 326         //
 327         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 328         // [EDI], and update the carry registers with the carry out.  The
 329         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 330         // general-purpose registers are preserved.
 331   endprologue
 332
 333         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 334         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 335         propout [edi +  0],      xmm4, xmm5
 336
 337         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 338         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 339         propout [edi +  4],      xmm5, xmm6
 340
 341         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 342         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 343         propout [edi +  8],      xmm6, xmm7
 344
 345         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 346         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 347         propout [edi + 12],      xmm7, xmm4
 348
 349         ret
 350
 351 ENDFUNC
 352
 353 INTFUNC(dmla4)
 354         // On entry, EDI points to the destination buffer, which also
 355         // contains an addend A to accumulate; EAX and EBX point to the
 356         // packed operands U and X; ECX and EDX point to the expanded
 357         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 358         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 359         // to be zero.
 360         //
 361         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 362         // [EDI], and update the carry registers with the carry out.  The
 363         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 364         // general-purpose registers are preserved.
 365   endprologue
 366
 367         carryadd
 368
 369         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 370         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 371         propout [edi +  0],      xmm4, xmm5
 372
 373         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 374         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 375         propout [edi +  4],      xmm5, xmm6
 376
 377         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 378         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 379         propout [edi +  8],      xmm6, xmm7
 380
 381         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 382         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 383         propout [edi + 12],      xmm7, xmm4
 384
 385         ret
 386
 387 ENDFUNC
 388
 389 INTFUNC(mul4zc)
 390         // On entry, EDI points to the destination buffer; EBX points to a
 391         // packed operand X; and EDX points to an expanded operand Y.
 392         //
 393         // On exit, we write the low 128 bits of the product X Y to [EDI],
 394         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 395         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 396         // general-purpose registers are preserved.
 397   endprologue
 398
 399         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 400         propout [edi +  0],      xmm4, xmm5
 401
 402         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 403         propout [edi +  4],      xmm5, xmm6
 404
 405         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 406         propout [edi +  8],      xmm6, xmm7
 407
 408         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 409         propout [edi + 12],      xmm7, xmm4
 410
 411         ret
 412
 413 ENDFUNC
 414
 415 INTFUNC(mul4)
 416         // On entry, EDI points to the destination buffer; EBX points to a
 417         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 418         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 419         // representing a carry-in C; c3 is assumed to be zero.
 420         //
 421         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 422         // and update the carry registers with the carry out.  The registers
 423         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 424         // general-purpose registers are preserved.
 425   endprologue
 426
 427         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 428         propout [edi +  0],      xmm4, xmm5
 429
 430         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 431         propout [edi +  4],      xmm5, xmm6
 432
 433         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 434         propout [edi +  8],      xmm6, xmm7
 435
 436         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 437         propout [edi + 12],      xmm7, xmm4
 438
 439         ret
 440
 441 ENDFUNC
 442
 443 INTFUNC(mla4zc)
 444         // On entry, EDI points to the destination buffer, which also
 445         // contains an addend A to accumulate; EBX points to a packed operand
 446         // X; and EDX points to an expanded operand Y.
 447         //
 448         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 449         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 450         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 451         // general-purpose registers are preserved.
 452   endprologue
 453
 454         movd    xmm4, [edi +  0]
 455         movd    xmm5, [edi +  4]
 456         movd    xmm6, [edi +  8]
 457         movd    xmm7, [edi + 12]
 458
 459         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 460         propout [edi +  0],      xmm4, xmm5
 461
 462         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 463         propout [edi +  4],      xmm5, xmm6
 464
 465         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 466         propout [edi +  8],      xmm6, xmm7
 467
 468         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 469         propout [edi + 12],      xmm7, xmm4
 470
 471         ret
 472
 473 ENDFUNC
 474
 475 INTFUNC(mla4)
 476         // On entry, EDI points to the destination buffer, which also
 477         // contains an addend A to accumulate; EBX points to a packed operand
 478         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 479         // the incoming carry registers c0, c1, and c2, representing a
 480         // carry-in C; c3 is assumed to be zero.
 481         //
 482         // On exit, we write the low 128 bits of the sum A + C + X Y to
 483         // [EDI], and update the carry registers with the carry out.  The
 484         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 485         // general-purpose registers are preserved.
 486   endprologue
 487
 488         carryadd
 489
 490         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 491         propout [edi +  0],      xmm4, xmm5
 492
 493         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 494         propout [edi +  4],      xmm5, xmm6
 495
 496         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 497         propout [edi +  8],      xmm6, xmm7
 498
 499         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 500         propout [edi + 12],      xmm7, xmm4
 501
 502         ret
 503
 504 ENDFUNC
 505
 506 INTFUNC(mmul4)
 507         // On entry, EDI points to the destination buffer; EAX and EBX point
 508         // to the packed operands U and N; ECX and ESI point to the expanded
 509         // operands V and M; and EDX points to a place to store an expanded
 510         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 511         // must be 16-byte aligned.  (This is not the usual convention, which
 512         // requires alignment before the call.)
 513         //
 514         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 515         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 516         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 517         // XMM7 are clobbered; the general-purpose registers are preserved.
 518         stalloc 48                      // space for the carries
 519   endprologue
 520
 521         // Calculate W = U V, and leave it in the destination.  Stash the
 522         // carry pieces for later.
 523         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 524         propout [edi +  0],      xmm4, xmm5
 525         jmp     5f
 526
 527 ENDFUNC
 528
 529 INTFUNC(mmla4)
 530         // On entry, EDI points to the destination buffer, which also
 531         // contains an addend A to accumulate; EAX and EBX point
 532         // to the packed operands U and N; ECX and ESI point to the expanded
 533         // operands V and M; and EDX points to a place to store an expanded
 534         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 535         // must be 16-byte aligned.  (This is not the usual convention, which
 536         // requires alignment before the call.)
 537         //
 538         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 539         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 540         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 541         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 542         // preserved.
 543         stalloc 48                      // space for the carries
 544   endprologue
 545
 546         movd    xmm4, [edi +  0]
 547         movd    xmm5, [edi +  4]
 548         movd    xmm6, [edi +  8]
 549         movd    xmm7, [edi + 12]
 550         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 551         propout [edi +  0],      xmm4, xmm5
 552
 553 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 554         propout [edi +  4],      xmm5, xmm6
 555
 556         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 557         propout [edi +  8],      xmm6, xmm7
 558
 559         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 560         propout [edi + 12],      xmm7, xmm4
 561
 562         movdqa  [esp +  0], xmm4
 563         movdqa  [esp + 16], xmm5
 564         movdqa  [esp + 32], xmm6
 565
 566         // Calculate Y = W M.
 567         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 568
 569         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 570         accum                    xmm5, xmm6, xmm7, nil
 571
 572         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 573         accum                    xmm6, xmm7, nil,  nil
 574
 575         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 576         accum                    xmm7, nil,  nil,  nil
 577
 578         // That's lots of pieces.  Now we have to assemble the answer.
 579         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 580
 581         // Expand it.
 582         pxor    xmm2, xmm2
 583         expand  xmm4, xmm1, nil, nil, xmm2
 584         movdqa  [edx +  0], xmm4
 585         movdqa  [edx + 16], xmm1
 586
 587         // Initialize the carry from the value for W we calculated earlier.
 588         movd    xmm4, [edi +  0]
 589         movd    xmm5, [edi +  4]
 590         movd    xmm6, [edi +  8]
 591         movd    xmm7, [edi + 12]
 592
 593         // Finish the calculation by adding the Montgomery product.
 594         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 595         propout [edi +  0],      xmm4, xmm5
 596
 597         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 598         propout [edi +  4],      xmm5, xmm6
 599
 600         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 601         propout [edi +  8],      xmm6, xmm7
 602
 603         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 604         propout [edi + 12],      xmm7, xmm4
 605
 606         // Add add on the carry we calculated earlier.
 607         paddq   xmm4, [esp +  0]
 608         paddq   xmm5, [esp + 16]
 609         paddq   xmm6, [esp + 32]
 610
 611         // And, with that, we're done.
 612         stfree  48
 613         ret
 614
 615 ENDFUNC
 616
 617 INTFUNC(mont4)
 618         // On entry, EDI points to the destination buffer holding a packed
 619         // value W; EBX points to a packed operand N; ESI points to an
 620         // expanded operand M; and EDX points to a place to store an expanded
 621         // result Y (32 bytes, at a 16-byte boundary).
 622         //
 623         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 624         // of the sum W + N Y to [EDI], leaving the remaining carry in
 625         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 626         // XMM7 are clobbered; the general-purpose registers are preserved.
 627   endprologue
 628
 629         // Calculate Y = W M.
 630         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 631
 632         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 633         accum                    xmm5, xmm6, xmm7, nil
 634
 635         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 636         accum                    xmm6, xmm7, nil,  nil
 637
 638         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 639         accum                    xmm7, nil,  nil,  nil
 640
 641         // That's lots of pieces.  Now we have to assemble the answer.
 642         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 643
 644         // Expand it.
 645         pxor    xmm2, xmm2
 646         expand  xmm4, xmm1, nil, nil, xmm2
 647         movdqa  [edx +  0], xmm4
 648         movdqa  [edx + 16], xmm1
 649
 650         // Initialize the carry from W.
 651         movd    xmm4, [edi +  0]
 652         movd    xmm5, [edi +  4]
 653         movd    xmm6, [edi +  8]
 654         movd    xmm7, [edi + 12]
 655
 656         // Finish the calculation by adding the Montgomery product.
 657         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 658         propout [edi +  0],      xmm4, xmm5
 659
 660         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 661         propout [edi +  4],      xmm5, xmm6
 662
 663         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 664         propout [edi +  8],      xmm6, xmm7
 665
 666         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 667         propout [edi + 12],      xmm7, xmm4
 668
 669         // And, with that, we're done.
 670         ret
 671
 672 ENDFUNC
 673
 674 ///--------------------------------------------------------------------------
 675 /// Bulk multipliers.
 676
 677 FUNC(mpx_umul4_x86_sse2)
 678         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 679         //                         const mpw *bv, const mpw *bvl);
 680
 681         // Build a stack frame.  Arguments will be relative to EBP, as
 682         // follows.
 683         //
 684         //      ebp + 20        dv
 685         //      ebp + 24        av
 686         //      ebp + 28        avl
 687         //      ebp + 32        bv
 688         //      ebp + 36        bvl
 689         //
 690         // Locals are relative to ESP, as follows.
 691         //
 692         //      esp +  0        expanded Y (32 bytes)
 693         //      esp + 32        (top of locals)
 694         pushreg ebp
 695         pushreg ebx
 696         pushreg esi
 697         pushreg edi
 698         setfp   ebp
 699         and     esp, ~15
 700         sub     esp, 32
 701   endprologue
 702
 703         // Prepare for the first iteration.
 704         mov     esi, [ebp + 32]         // -> bv[0]
 705         pxor    xmm7, xmm7
 706         movdqu  xmm0, [esi]             // bv[0]
 707         mov     edi, [ebp + 20]         // -> dv[0]
 708         mov     ecx, edi                // outer loop dv cursor
 709         expand  xmm0, xmm1, nil, nil, xmm7
 710         mov     ebx, [ebp + 24]         // -> av[0]
 711         mov     eax, [ebp + 28]         // -> av[m] = av limit
 712         mov     edx, esp                // -> expanded Y = bv[0]
 713         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 714         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 715         call    mul4zc
 716         add     ebx, 16
 717         add     edi, 16
 718         add     ecx, 16
 719         add     esi, 16
 720         cmp     ebx, eax                // all done?
 721         jae     8f
 722
 723         .p2align 4
 724         // Continue with the first iteration.
 725 0:      call    mul4
 726         add     ebx, 16
 727         add     edi, 16
 728         cmp     ebx, eax                // all done?
 729         jb      0b
 730
 731         // Write out the leftover carry.  There can be no tail here.
 732 8:      call    carryprop
 733         cmp     esi, [ebp + 36]         // more passes to do?
 734         jae     9f
 735
 736         .p2align 4
 737         // Set up for the next pass.
 738 1:      movdqu  xmm0, [esi]             // bv[i]
 739         mov     edi, ecx                // -> dv[i]
 740         pxor    xmm7, xmm7
 741         expand  xmm0, xmm1, nil, nil, xmm7
 742         mov     ebx, [ebp + 24]         // -> av[0]
 743         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 744         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 745         call    mla4zc
 746         add     edi, 16
 747         add     ebx, 16
 748         add     ecx, 16
 749         add     esi, 16
 750         cmp     ebx, eax                // done yet?
 751         jae     8f
 752
 753         .p2align 4
 754         // Continue...
 755 0:      call    mla4
 756         add     ebx, 16
 757         add     edi, 16
 758         cmp     ebx, eax
 759         jb      0b
 760
 761         // Finish off this pass.  There was no tail on the previous pass, and
 762         // there can be none on this pass.
 763 8:      call    carryprop
 764         cmp     esi, [ebp + 36]
 765         jb      1b
 766
 767         // All over.
 768 9:      dropfp
 769         pop     edi
 770         pop     esi
 771         pop     ebx
 772         pop     ebp
 773         ret
 774
 775 ENDFUNC
 776
 777 FUNC(mpxmont_mul4_x86_sse2)
 778         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 779         //                           const mpw *nv, size_t n, const mpw *mi);
 780
 781         // Build a stack frame.  Arguments will be relative to EBP, as
 782         // follows.
 783         //
 784         //      ebp + 20        dv
 785         //      ebp + 24        av
 786         //      ebp + 28        bv
 787         //      ebp + 32        nv
 788         //      ebp + 36        n (nonzero multiple of 4)
 789         //      ebp + 40        mi
 790         //
 791         // Locals are relative to ESP, which is 4 mod 16, as follows.
 792         //
 793         //      esp +   0       outer loop dv
 794         //      esp +   4       outer loop bv
 795         //      esp +   8       av limit (mostly in ESI)
 796         //      esp +  12       expanded V (32 bytes)
 797         //      esp +  44       expanded M (32 bytes)
 798         //      esp +  76       expanded Y (32 bytes)
 799         //      esp + 108       bv limit
 800         //      esp + 112       (gap)
 801         //      esp + 124       (top of locals)
 802         pushreg ebp
 803         pushreg ebx
 804         pushreg esi
 805         pushreg edi
 806         setfp   ebp
 807         and     esp, ~15
 808         sub     esp, 124
 809   endprologue
 810
 811         // Establish the expanded operands.
 812         pxor    xmm7, xmm7
 813         mov     ecx, [ebp + 28]         // -> bv
 814         mov     edx, [ebp + 40]         // -> mi
 815         movdqu  xmm0, [ecx]             // bv[0]
 816         movdqu  xmm2, [edx]             // mi
 817         expand  xmm0, xmm1, xmm2, xmm3, xmm7
 818         movdqa  [esp + 12], xmm0        // bv[0] expanded low
 819         movdqa  [esp + 28], xmm1        // bv[0] expanded high
 820         movdqa  [esp + 44], xmm2        // mi expanded low
 821         movdqa  [esp + 60], xmm3        // mi expanded high
 822
 823         // Set up the outer loop state and prepare for the first iteration.
 824         mov     edx, [ebp + 36]         // n
 825         mov     eax, [ebp + 24]         // -> U = av[0]
 826         mov     ebx, [ebp + 32]         // -> X = nv[0]
 827         mov     edi, [ebp + 20]         // -> Z = dv[0]
 828         mov     [esp + 4], ecx
 829         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 830         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 831         mov     [esp + 0], edi
 832         mov     [esp + 108], ecx
 833         mov     [esp + 8], edx
 834         lea     ecx, [esp + 12]         // -> expanded V = bv[0]
 835         lea     esi, [esp + 44]         // -> expanded M = mi
 836         lea     edx, [esp + 76]         // -> space for Y
 837         call    mmul4
 838         mov     esi, [esp + 8]          // recover av limit
 839         add     edi, 16
 840         add     eax, 16
 841         add     ebx, 16
 842         cmp     eax, esi                // done already?
 843         jae     8f
 844         mov     [esp + 0], edi
 845
 846         .p2align 4
 847         // Complete the first inner loop.
 848 0:      call    dmul4
 849         add     edi, 16
 850         add     eax, 16
 851         add     ebx, 16
 852         cmp     eax, esi                // done yet?
 853         jb      0b
 854
 855         // Still have carries left to propagate.
 856         call    carryprop
 857         movd    [edi + 16], xmm4
 858
 859         .p2align 4
 860         // Embark on the next iteration.  (There must be one.  If n = 1, then
 861         // we would have bailed above, to label 8.  Similarly, the subsequent
 862         // iterations can fall into the inner loop immediately.)
 863 1:      mov     eax, [esp + 4]          // -> bv[i - 1]
 864         mov     edi, [esp + 0]          // -> Z = dv[i]
 865         add     eax, 16                 // -> bv[i]
 866         pxor    xmm7, xmm7
 867         movdqu  xmm0, [eax]             // bv[i]
 868         mov     [esp + 4], eax
 869         cmp     eax, [esp + 108]        // done yet?
 870         jae     9f
 871         mov     ebx, [ebp + 32]         // -> X = nv[0]
 872         lea     esi, [esp + 44]         // -> expanded M = mi
 873         mov     eax, [ebp + 24]         // -> U = av[0]
 874         expand  xmm0, xmm1, nil, nil, xmm7
 875         movdqa  [esp + 12], xmm0        // bv[i] expanded low
 876         movdqa  [esp + 28], xmm1        // bv[i] expanded high
 877         call    mmla4
 878         mov     esi, [esp + 8]          // recover av limit
 879         add     edi, 16
 880         add     eax, 16
 881         add     ebx, 16
 882         mov     [esp + 0], edi
 883
 884         .p2align 4
 885         // Complete the next inner loop.
 886 0:      call    dmla4
 887         add     edi, 16
 888         add     eax, 16
 889         add     ebx, 16
 890         cmp     eax, esi
 891         jb      0b
 892
 893         // Still have carries left to propagate, and they overlap the
 894         // previous iteration's final tail, so read that in and add it.
 895         movd    xmm0, [edi]
 896         paddq   xmm4, xmm0
 897         call    carryprop
 898         movd    [edi + 16], xmm4
 899
 900         // Back again.
 901         jmp     1b
 902
 903         // First iteration was short.  Write out the carries and we're done.
 904         // (This could be folded into the main loop structure, but that would
 905         // penalize small numbers more.)
 906 8:      call    carryprop
 907         movd    [edi + 16], xmm4
 908
 909         // All done.
 910 9:      dropfp
 911         popreg  edi
 912         popreg  esi
 913         popreg  ebx
 914         popreg  ebp
 915         ret
 916
 917 ENDFUNC
 918
 919 FUNC(mpxmont_redc4_x86_sse2)
 920         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 921         //                             size_t n, const mpw *mi);
 922
 923         // Build a stack frame.  Arguments will be relative to EBP, as
 924         // follows.
 925         //
 926         //      ebp + 20        dv
 927         //      ebp + 24        dvl
 928         //      ebp + 28        nv
 929         //      ebp + 32        n (nonzero multiple of 4)
 930         //      ebp + 36        mi
 931         //
 932         // Locals are relative to ESP, as follows.
 933         //
 934         //      esp +  0        outer loop dv
 935         //      esp +  4        outer dv limit
 936         //      esp +  8        blocks-of-4 dv limit
 937         //      esp + 12        expanded M (32 bytes)
 938         //      esp + 44        expanded Y (32 bytes)
 939         //      esp + 76        (top of locals)
 940         pushreg ebp
 941         pushreg ebx
 942         pushreg esi
 943         pushreg edi
 944         setfp   ebp
 945         and     esp, ~15
 946         sub     esp, 76
 947   endprologue
 948
 949         // Establish the expanded operands and the blocks-of-4 dv limit.
 950         mov     edi, [ebp + 20]         // -> Z = dv[0]
 951         pxor    xmm7, xmm7
 952         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 953         sub     eax, edi                // length of dv in bytes
 954         mov     edx, [ebp + 36]         // -> mi
 955         movdqu  xmm0, [edx]             // mi
 956         and     eax, ~15                // mask off the tail end
 957         expand  xmm0, xmm1, nil, nil, xmm7
 958         add     eax, edi                // find limit
 959         movdqa  [esp + 12], xmm0        // mi expanded low
 960         movdqa  [esp + 28], xmm1        // mi expanded high
 961         mov     [esp + 8], eax
 962
 963         // Set up the outer loop state and prepare for the first iteration.
 964         mov     ecx, [ebp + 32]         // n
 965         mov     ebx, [ebp + 28]         // -> X = nv[0]
 966         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
 967         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
 968         mov     [esp + 0], edi
 969         mov     [esp + 4], edx
 970         lea     esi, [esp + 12]         // -> expanded M = mi
 971         lea     edx, [esp + 44]         // -> space for Y
 972         call    mont4
 973         add     edi, 16
 974         add     ebx, 16
 975         cmp     ebx, ecx                // done already?
 976         jae     8f
 977
 978         .p2align 4
 979         // Complete the first inner loop.
 980 5:      call    mla4
 981         add     ebx, 16
 982         add     edi, 16
 983         cmp     ebx, ecx                // done yet?
 984         jb      5b
 985
 986         // Still have carries left to propagate.
 987 8:      carryadd
 988         mov     esi, [esp + 8]          // -> dv blocks limit
 989         mov     edx, [ebp + 24]         // dv limit
 990         psllq   xmm7, 16
 991         pslldq  xmm7, 8
 992         paddq   xmm6, xmm7
 993         call    carryprop
 994         movd    eax, xmm4
 995         add     edi, 16
 996         cmp     edi, esi
 997         jae     7f
 998
 999         .p2align 4
1000         // Continue carry propagation until the end of the buffer.
1001 0:      add     [edi], eax
1002         mov     eax, 0                  // preserves flags
1003         adcd    [edi + 4], 0
1004         adcd    [edi + 8], 0
1005         adcd    [edi + 12], 0
1006         adc     eax, 0
1007         add     edi, 16
1008         cmp     edi, esi
1009         jb      0b
1010
1011         // Deal with the tail end.
1012 7:      add     [edi], eax
1013         mov     eax, 0                  // preserves flags
1014         add     edi, 4
1015         adc     eax, 0
1016         cmp     edi, edx
1017         jb      7b
1018
1019         // All done for this iteration.  Start the next.  (This must have at
1020         // least one follow-on iteration, or we'd not have started this outer
1021         // loop.)
1022 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1023         mov     ebx, [ebp + 28]         // -> X = nv[0]
1024         lea     edx, [esp + 44]         // -> space for Y
1025         lea     esi, [esp + 12]         // -> expanded M = mi
1026         add     edi, 16                 // -> Z = dv[i]
1027         cmp     edi, [esp + 4]          // all done yet?
1028         jae     9f
1029         mov     [esp + 0], edi
1030         call    mont4
1031         add     edi, 16
1032         add     ebx, 16
1033         jmp     5b
1034
1035         // All over.
1036 9:      dropfp
1037         popreg  edi
1038         popreg  esi
1039         popreg  ebx
1040         popreg  ebp
1041         ret
1042
1043 ENDFUNC
1044
1045 ///--------------------------------------------------------------------------
1046 /// Testing and performance measurement.
1047
1048 #ifdef TEST_MUL4
1049
1050 .macro  cysetup c
1051         rdtsc
1052         mov     [\c], eax
1053         mov     [\c + 4], edx
1054 .endm
1055
1056 .macro  cystore c, v, n
1057         rdtsc
1058         sub     eax, [\c]
1059         sbb     edx, [\c + 4]
1060         mov     ebx, [\v]
1061         mov     ecx, [\n]
1062         dec     ecx
1063         mov     [\n], ecx
1064         mov     [ebx + ecx*8], eax
1065         mov     [ebx + ecx*8 + 4], edx
1066 .endm
1067
1068 .macro  testprologue
1069         pushreg ebp
1070         pushreg ebx
1071         pushreg esi
1072         pushreg edi
1073         setfp   ebp
1074         and     esp, ~15
1075         sub     esp, 3*32 + 12
1076   endprologue
1077         // vars:
1078         //      esp +  0 = cycles
1079         //      esp + 12 = v expanded
1080         //      esp + 44 = y expanded
1081         //      esp + 72 = ? expanded
1082 .endm
1083
1084 .macro  testepilogue
1085         dropfp
1086         popreg  edi
1087         popreg  esi
1088         popreg  ebx
1089         popreg  ebp
1090         ret
1091 .endm
1092
1093 .macro  testldcarry c
1094         mov     ecx, \c                 // -> c
1095         movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
1096         movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
1097         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
1098 .endm
1099
1100 .macro  testexpand v, y
1101         pxor    xmm7, xmm7
1102   .ifnes "\v", "nil"
1103         mov     ecx, \v
1104         movdqu  xmm0, [ecx]
1105         expand  xmm0, xmm1, nil, nil, xmm7
1106         movdqa  [esp + 12], xmm0
1107         movdqa  [esp + 28], xmm1
1108   .endif
1109   .ifnes "\y", "nil"
1110         mov     edx, \y
1111         movdqu  xmm2, [edx]
1112         expand  xmm2, xmm3, nil, nil, xmm7
1113         movdqa  [esp + 44], xmm2
1114         movdqa  [esp + 60], xmm3
1115   .endif
1116 .endm
1117
1118 .macro  testtop u, x, mode
1119         .p2align 4
1120 0:
1121   .ifnes "\u", "nil"
1122         lea     ecx, [esp + 12]
1123   .endif
1124         mov     ebx, \x
1125   .ifeqs "\mode", "mont"
1126         lea     esi, [esp + 44]
1127   .endif
1128         cysetup esp + 0
1129   .ifnes "\u", "nil"
1130         mov     eax, \u
1131   .endif
1132   .ifeqs "\mode", "mont"
1133         lea     edx, [esp + 76]
1134   .else
1135         lea     edx, [esp + 44]
1136   .endif
1137 .endm
1138
1139 .macro  testtail cyv, n
1140         cystore esp + 0, \cyv, \n
1141         jnz     0b
1142 .endm
1143
1144 .macro  testcarryout c
1145         mov     ecx, \c
1146         movdqu  [ecx +  0], xmm4
1147         movdqu  [ecx + 16], xmm5
1148         movdqu  [ecx + 32], xmm6
1149 .endm
1150
1151 FUNC(test_dmul4)
1152         testprologue
1153         testldcarry [ebp + 24]
1154         testexpand [ebp + 36], [ebp + 40]
1155         mov     edi, [ebp + 20]
1156         testtop [ebp + 28], [ebp + 32]
1157         call    dmul4
1158         testtail [ebp + 48], [ebp + 44]
1159         testcarryout [ebp + 24]
1160         testepilogue
1161 ENDFUNC
1162
1163 FUNC(test_dmla4)
1164         testprologue
1165         testldcarry [ebp + 24]
1166         testexpand [ebp + 36], [ebp + 40]
1167         mov     edi, [ebp + 20]
1168         testtop [ebp + 28], [ebp + 32]
1169         call    dmla4
1170         testtail [ebp + 48], [ebp + 44]
1171         testcarryout [ebp + 24]
1172         testepilogue
1173 ENDFUNC
1174
1175 FUNC(test_mul4)
1176         testprologue
1177         testldcarry [ebp + 24]
1178         testexpand nil, [ebp + 32]
1179         mov     edi, [ebp + 20]
1180         testtop nil, [ebp + 28]
1181         call    mul4
1182         testtail [ebp + 40], [ebp + 36]
1183         testcarryout [ebp + 24]
1184         testepilogue
1185 ENDFUNC
1186
1187 FUNC(test_mla4)
1188         testprologue
1189         testldcarry [ebp + 24]
1190         testexpand nil, [ebp + 32]
1191         mov     edi, [ebp + 20]
1192         testtop nil, [ebp + 28]
1193         call    mla4
1194         testtail [ebp + 40], [ebp + 36]
1195         testcarryout [ebp + 24]
1196         testepilogue
1197 ENDFUNC
1198
1199 FUNC(test_mmul4)
1200         testprologue
1201         testexpand [ebp + 40], [ebp + 44]
1202         mov     edi, [ebp + 20]
1203         testtop [ebp + 32], [ebp + 36], mont
1204         call    mmul4
1205         testtail [ebp + 52], [ebp + 48]
1206         mov     edi, [ebp + 28]
1207         movdqa  xmm0, [esp + 76]
1208         movdqa  xmm1, [esp + 92]
1209         movdqu  [edi], xmm0
1210         movdqu  [edi + 16], xmm1
1211         testcarryout [ebp + 24]
1212         testepilogue
1213 ENDFUNC
1214
1215 FUNC(test_mmla4)
1216         testprologue
1217         testexpand [ebp + 40], [ebp + 44]
1218         mov     edi, [ebp + 20]
1219         testtop [ebp + 32], [ebp + 36], mont
1220         call    mmla4
1221         testtail [ebp + 52], [ebp + 48]
1222         mov     edi, [ebp + 28]
1223         movdqa  xmm0, [esp + 76]
1224         movdqa  xmm1, [esp + 92]
1225         movdqu  [edi], xmm0
1226         movdqu  [edi + 16], xmm1
1227         testcarryout [ebp + 24]
1228         testepilogue
1229 ENDFUNC
1230
1231 FUNC(test_mont4)
1232         testprologue
1233         testexpand nil, [ebp + 36]
1234         mov     edi, [ebp + 20]
1235         testtop nil, [ebp + 32], mont
1236         call    mont4
1237         testtail [ebp + 44], [ebp + 40]
1238         mov     edi, [ebp + 28]
1239         movdqa  xmm0, [esp + 76]
1240         movdqa  xmm1, [esp + 92]
1241         movdqu  [edi], xmm0
1242         movdqu  [edi + 16], xmm1
1243         testcarryout [ebp + 24]
1244         testepilogue
1245 ENDFUNC
1246
1247 #endif
1248
1249 ///----- That's all, folks --------------------------------------------------