chiark - git - mdw - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
  67 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuludqd' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0, (implicitly)
  91 /// zeroed, becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1, d2, d3
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
             //
             // R is the source operand of a `movd'; S is a register holding the
             // address of the 32-byte expanded operand.  [S] is read with
             // aligned accesses (`movdqa', and memory operands to `pmuludqd'),
             // so S must be 16-byte aligned.  Each Dj receives the pair
             // (r_i s'_j, r_i s''_j) in its two 64-bit lanes.
             //
             // Any of D1, D2, D3 may be `nil', in which case that piece is not
             // calculated.  If D3 is `nil' but D2 is not, then D2 is formed by
             // multiplying a copy of the broadcast r_i directly against memory
             // at [S + 16], rather than against a register copy.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludqd \d2, [\s + 16]         // (r_i s'_2, r_i s''_2)
 131     .endif
 132   .endif
 133         pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1, c2, c3
             // Add the product pieces held in XMM0, XMM1, XMM2, XMM3 (for
             // example, as left there by `mulcore') into the carry registers
             // C0, C1, C2, C3 respectively, as pairs of 64-bit lanes.  Any of
             // C1, C2, C3 may be `nil', in which case the corresponding
             // addition is skipped.  XMM0--XMM3 themselves are not modified.
 137         paddq   \c0, xmm0
 138   .ifnes "\c1", "nil"
 139         paddq   \c1, xmm1
 140   .endif
 141   .ifnes "\c2", "nil"
 142         paddq   \c2, xmm2
 143   .endif
 144   .ifnes "\c3", "nil"
 145         paddq   \c3, xmm3
 146   .endif
 147 .endm
 148
 149 .macro  mulacc  r, s, c0, c1, c2, c3, z3p
 150         // Load a word r_i from R, multiply by the expanded operand [S],
 151         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 152         // then C3 notionally contains zero, but needs clearing; in practice,
 153         // we store the product directly rather than attempting to add.  On
 154         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 155         // is not `t'.
             //
             // R and S are passed straight through to `mulcore'.
 156   .ifeqs "\z3p", "t"
             // C3 is overwritten: `mulcore' leaves the top piece directly in
             // it, and only the lower three pieces are added.
 157         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 158         accum           \c0,  \c1,  \c2,  nil
 159   .else
             // C3 is live: calculate all four pieces in scratch registers and
             // add every one of them.
 160         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 161         accum           \c0,  \c1,  \c2,  \c3
 162   .endif
 163 .endm
 164
 165 .macro  propout d, c, cc
 166         // Calculate an output word from C, and store it in D; propagate
 167         // carries out from C to CC in preparation for a rotation of the
 168         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 169         // `nil', then the contribution which would have been added to it is
 170         // left in C.
             //
             // D is a 32-bit destination for `movd'.  Writing t = c'' mod B, the
             // word stored is (c' + t b) mod B, and C finishes holding the pair
             // (floor((c' + t b)/B), floor(c''/B)) whether or not CC is `nil'.
 171         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 172         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 173         pslldq  xmm3, 2                 // (t b, 0)
 174         paddq   \c, xmm3                // (c' + t b, c'')
 175         movd    \d, \c                  // output (c' + t b) mod B
 176         psrlq   \c, 32                  // floor(c/B), in each 64-bit lane
 177   .ifnes "\cc", "nil"
 178         paddq   \cc, \c                 // propagate up
 179   .endif
 180 .endm
 181
 182 .macro  endprop d, c, t
 183         // On entry, C contains a carry register.  On exit, the low 32 bits
 184         // of the value represented in C are written to D, and the remaining
 185         // bits are left at the bottom of T.
             //
             // D is a 32-bit destination for `movd'; T is an XMM register other
             // than C.  C itself is clobbered: its low half is shifted into its
             // high half.
 186         movdqa  \t, \c
 187         psllq   \t, 16                  // (?, c'' b)
 188         pslldq  \c, 8                   // (0, c')
 189         paddq   \t, \c                  // (?, c' + c'' b)
 190         psrldq  \t, 8                   // c' + c'' b
 191         movd    \d, \t
 192         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 193 .endm
 194
 195 .macro  expand  a, b, c, d, z
 196         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 197         // exit, A:B and C:D together hold the same values in expanded
 198         // form.  If C is `nil', then only expand A to A:B.
             //
             // B and D are output-only: their previous contents are discarded.
             // A receives the expanded low 64 bits of the original A, and B the
             // expanded high 64 bits (and likewise C and D); this is the layout
             // which `mulcore' reads at offsets 0 and 16.  Z is not modified.
 199         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 200   .ifnes "\c", "nil"
 201         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 202   .endif
             // Interleave 16-bit pieces with zeros, widening each to 32 bits.
 203         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 204         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 205   .ifnes "\c", "nil"
 206         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 207         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 208   .endif
             // Swap the middle two cells to group the low and high pieces.
 209         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 210         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 211   .ifnes "\c", "nil"
 212         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 213         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 214   .endif
 215 .endm
 216
 217 .macro  squash  c0, c1, c2, c3, h, t, u
 218         // On entry, C0, C1, C2, C3 are carry registers representing a value
 219         // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
 220         // C3, T, and U are clobbered; and the high bits of Y are stored in
 221         // H, if this is not `nil'.
             //
             // T and U are scratch XMM registers; they, the carry registers, and
             // H must all be different registers.  When H is given, it finishes
             // with the carry-out in its low 64 bits and zero in its high 64
             // bits.
 222
 223         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 224         // the ones offset by 16 bits from a 32-bit boundary -- by carrying
 225         // them into the 32-bit-aligned pieces above and below.  But before
 226         // we can do that, we must gather them together.
 227         movdqa  \t, \c0
 228         movdqa  \u, \c1
 229         punpcklqdq \t, \c2              // (y'_0, y'_2)
 230         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 231         punpcklqdq \u, \c3              // (y'_1, y'_3)
 232         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 233
 234         // Now split the double-prime pieces.  The high (up to) 48 bits will
 235         // go up; the low 16 bits go down.
 236         movdqa  \c2, \c0
 237         movdqa  \c3, \c1
 238         psllq   \c2, 48
 239         psllq   \c3, 48
 240         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 241         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 242         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 243         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 244   .ifnes "\h", "nil"
 245         movdqa  \h, \c1
 246   .endif
 247         pslldq  \c1, 8                  // high part of (0, y''_1)
 248
 249         paddq   \t, \c2                 // propagate down
 250         paddq   \u, \c3
 251         paddq   \t, \c1                 // and up: (y_0, y_2)
 252         paddq   \u, \c0                 // (y_1, y_3)
 253   .ifnes "\h", "nil"
 254         psrldq  \h, 8                   // high part of (y''_3, 0)
 255   .endif
 256
 257         // Finally extract the answer.  This complicated dance is better than
 258         // storing to memory and loading, because the piecemeal stores
 259         // inhibit store forwarding.
 260         movdqa  \c3, \t                 // (y_0, y_2)
 261         movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
 262         psrldq  \t, 8                   // (y_2, 0)
 263         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 264         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 265         pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
 266         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 267         psrldq  \u, 8                   // (y_3, 0)
 268         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
 269         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
 270         pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
 271         psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
 272         movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
 273         psrlq   \c3, 32            // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
 274         paddq   \c3, \u      // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
 275         pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
 276         psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
 277         psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
 278   .ifnes "\h", "nil"
             // Keep a copy of the top word's sum before it is shifted away, and
             // prepare a zero for clearing the upper half of H below.  This is
             // a register-to-register copy, so use `movdqa' like all of the
             // other copies in this macro.
 279         movdqa  \t, \c3
 280         pxor    \u, \u
 281   .endif
 282         pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
 283         por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
 284         por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
 285         por     \c0, \c2                // y mod B^4
 286   .ifnes "\h", "nil"
 287         psrlq   \t, 32                  // very high bits of y
 288         paddq   \h, \t
 289         punpcklqdq \h, \u               // carry up
 290   .endif
 291 .endm
 292
 293 .macro  carryadd
 294         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 295         // hold the incoming carry registers c0, c1, and c2 representing a
 296         // carry-in C.
 297         //
 298         // On exit, the carry registers, including XMM7, are updated to hold
 299         // C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
 300         // are preserved.
             //
             // The previous contents of XMM7 are discarded (c3 is taken to be
             // zero on entry): it is simply loaded with the top addend word.
 301         movd    xmm0, [edi +  0]        // (a_0, 0)
 302         movd    xmm1, [edi +  4]        // (a_1, 0)
 303         movd    xmm2, [edi +  8]        // (a_2, 0)
 304         movd    xmm7, [edi + 12]        // (a_3, 0): this becomes c3
 305         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
 306         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
 307         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2)
 308 .endm
 309
 310 ///--------------------------------------------------------------------------
 311 /// Primitive multipliers and related utilities.
 312
 313         .p2align 4
 314 carryprop:
 315         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 316         // form.  Store the low 128 bits of the represented carry to [EDI] as
 317         // a packed 128-bit value, and leave the remaining 16 bits in the low
 318         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
             //
             // The first three words are emitted by `propout', carrying from
             // XMM4 into XMM5 and from XMM5 into XMM6.  The last `propout' has
             // nowhere to carry to, so its carry stays in XMM6; `endprop' then
             // writes the fourth word from XMM6 and leaves the rest in XMM4.
 319         propout [edi +  0], xmm4, xmm5
 320         propout [edi +  4], xmm5, xmm6
 321         propout [edi +  8], xmm6, nil
 322         endprop [edi + 12], xmm6, xmm4
 323         ret
 324
 325         .p2align 4
 326 dmul4:
 327         // On entry, EDI points to the destination buffer; EAX and EBX point
 328         // to the packed operands U and X; ECX and EDX point to the expanded
 329         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 330         // registers c0, c1, and c2; c3 is assumed to be zero.
 331         //
 332         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 333         // [EDI], and update the carry registers with the carry out.  The
 334         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 335         // general-purpose registers are preserved.
             //
             // Each group below handles one 32-bit word of U and of X: the two
             // `mulacc' steps add both products into the carry registers, and
             // `propout' emits one output word and carries upwards.  The roles
             // of XMM4--XMM7 rotate by one register per group.  The first
             // `mulacc' of each group passes `t' because the register currently
             // serving as c3 holds no useful value and must be overwritten.
 336         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 337         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 338         propout [edi +  0],      xmm4, xmm5
 339
 340         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 341         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 342         propout [edi +  4],      xmm5, xmm6
 343
 344         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 345         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 346         propout [edi +  8],      xmm6, xmm7
 347
 348         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 349         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 350         propout [edi + 12],      xmm7, xmm4
 351
 352         ret
 353
 354         .p2align 4
 355 dmla4:
 356         // On entry, EDI points to the destination buffer, which also
 357         // contains an addend A to accumulate; EAX and EBX point to the
 358         // packed operands U and X; ECX and EDX point to the expanded
 359         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 360         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 361         // to be zero.
 362         //
 363         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 364         // [EDI], and update the carry registers with the carry out.  The
 365         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 366         // general-purpose registers are preserved.
             //
             // `carryadd' folds A into the carry registers, loading the top word
             // of A into XMM7.  XMM7 is therefore live for the first group, and
             // both of its `mulacc' steps must add to it (`nil') rather than
             // overwrite it.  Later groups proceed as in `dmul4'.
 367         carryadd
 368
 369         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 370         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 371         propout [edi +  0],      xmm4, xmm5
 372
 373         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 374         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 375         propout [edi +  4],      xmm5, xmm6
 376
 377         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 378         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 379         propout [edi +  8],      xmm6, xmm7
 380
 381         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 382         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 383         propout [edi + 12],      xmm7, xmm4
 384
 385         ret
 386
 387         .p2align 4
 388 mul4zc:
 389         // On entry, EDI points to the destination buffer; EBX points to a
 390         // packed operand X; and EDX points to an expanded operand Y.
 391         //
 392         // On exit, we write the low 128 bits of the product X Y to [EDI],
 393         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 394         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 395         // general-purpose registers are preserved.
             //
             // There is no carry-in (`zc' presumably stands for `zero carry'),
             // so the first product is written directly into XMM4--XMM7 by
             // `mulcore' instead of being added to anything.
 396         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 397         propout [edi +  0],      xmm4, xmm5
 398
 399         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 400         propout [edi +  4],      xmm5, xmm6
 401
 402         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 403         propout [edi +  8],      xmm6, xmm7
 404
 405         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 406         propout [edi + 12],      xmm7, xmm4
 407
 408         ret
 409
 410         .p2align 4
 411 mul4:
 412         // On entry, EDI points to the destination buffer; EBX points to a
 413         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 414         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 415         // representing a carry-in C; c3 is assumed to be zero.
 416         //
 417         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 418         // and update the carry registers with the carry out.  The registers
 419         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 420         // general-purpose registers are preserved.
             //
             // Every `mulacc' passes `t': in each group the register serving as
             // c3 holds no useful value, so its product piece is stored rather
             // than added.  The register roles rotate by one per group.
 421         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 422         propout [edi +  0],      xmm4, xmm5
 423
 424         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 425         propout [edi +  4],      xmm5, xmm6
 426
 427         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 428         propout [edi +  8],      xmm6, xmm7
 429
 430         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 431         propout [edi + 12],      xmm7, xmm4
 432
 433         ret
 434
 435         .p2align 4
 436 mla4zc:
 437         // On entry, EDI points to the destination buffer, which also
 438         // contains an addend A to accumulate; EBX points to a packed operand
 439         // X; and EDX points to an expanded operand Y.
 440         //
 441         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 442         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 443         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 444         // general-purpose registers are preserved.
             //
             // With no carry-in, the carry registers are initialized directly
             // from the four words of A; each `movd' load clears the rest of
             // its register.  All four registers are then live, so the first
             // `mulacc' adds to XMM7 (`nil') rather than overwriting it.
 445         movd    xmm4, [edi +  0]
 446         movd    xmm5, [edi +  4]
 447         movd    xmm6, [edi +  8]
 448         movd    xmm7, [edi + 12]
 449
 450         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 451         propout [edi +  0],      xmm4, xmm5
 452
 453         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 454         propout [edi +  4],      xmm5, xmm6
 455
 456         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 457         propout [edi +  8],      xmm6, xmm7
 458
 459         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 460         propout [edi + 12],      xmm7, xmm4
 461
 462         ret
 463
 464         .p2align 4
 465 mla4:
 466         // On entry, EDI points to the destination buffer, which also
 467         // contains an addend A to accumulate; EBX points to a packed operand
 468         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 469         // the incoming carry registers c0, c1, and c2, representing a
 470         // carry-in C; c3 is assumed to be zero.
 471         //
 472         // On exit, we write the low 128 bits of the sum A + C + X Y to
 473         // [EDI], and update the carry registers with the carry out.  The
 474         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 475         // general-purpose registers are preserved.
             //
             // `carryadd' folds A into the carry registers, loading the top word
             // of A into XMM7; the first `mulacc' must therefore add to XMM7
             // (`nil') rather than overwrite it.
 476         carryadd
 477
 478         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 479         propout [edi +  0],      xmm4, xmm5
 480
 481         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 482         propout [edi +  4],      xmm5, xmm6
 483
 484         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 485         propout [edi +  8],      xmm6, xmm7
 486
 487         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 488         propout [edi + 12],      xmm7, xmm4
 489
 490         ret
 491
 492         .p2align 4
 493 mmul4:
 494         // On entry, EDI points to the destination buffer; EAX and EBX point
 495         // to the packed operands U and N; ECX and ESI point to the expanded
 496         // operands V and M; and EDX points to a place to store an expanded
 497         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 498         // must be 16-byte aligned.  (This is not the usual convention, which
 499         // requires alignment before the call.)
 500         //
 501         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 502         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 503         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 504         // XMM7 are clobbered; the general-purpose registers are preserved.
             //
             // This entry point handles only the first word, which has no addend
             // to accumulate, and then joins the code shared with `mmla4' at
             // its local label 5.  The stack space reserved here is released on
             // that shared path.
 505         sub     esp, 64                 // space for the carries
 506
 507         // Calculate W = U V, and leave it in the destination.  Stash the
 508         // carry pieces for later.
 509         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 510         propout [edi +  0],      xmm4, xmm5
 511         jmp     5f
 512
 513         .p2align 4
 514 mmla4:
 515         // On entry, EDI points to the destination buffer, which also
 516         // contains an addend A to accumulate; EAX and EBX point
 517         // to the packed operands U and N; ECX and ESI point to the expanded
 518         // operands V and M; and EDX points to a place to store an expanded
 519         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 520         // must be 16-byte aligned.  (This is not the usual convention, which
 521         // requires alignment before the call.)
 522         //
 523         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 524         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 525         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 526         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 527         // preserved.
             //
             // NOTE(review): the Y stored at [EDX] is the expanded form of a
             // 128-bit value (the low 128 bits produced by `squash').  With
             // B = 2^32 as in the theory section, that is a reduction mod B^4;
             // the `mod B' above presumably uses B for the 128-bit block radix
             // -- confirm.
             //
             // Only the first 48 of the 64 bytes reserved on the stack are
             // used; reserving 64 keeps ESP a multiple of 16, which the `movdqa'
             // stores below require.
 528         sub     esp, 64                 // space for the carries
 529         movd    xmm4, [edi +  0]
 530         movd    xmm5, [edi +  4]
 531         movd    xmm6, [edi +  8]
 532         movd    xmm7, [edi + 12]
 533         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 534         propout [edi +  0],      xmm4, xmm5
 535
             // `mmul4' joins here, having handled the first word itself.
 536 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 537         propout [edi +  4],      xmm5, xmm6
 538
 539         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 540         propout [edi +  8],      xmm6, xmm7
 541
 542         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 543         propout [edi + 12],      xmm7, xmm4
 544
             // Save the carry out of W; XMM4--XMM7 are about to be reused.
 545         movdqa  [esp +  0], xmm4
 546         movdqa  [esp + 16], xmm5
 547         movdqa  [esp + 32], xmm6
 548
 549         // Calculate Y = W M.
             // Only the low 128 bits are wanted, so each successive word of W
             // calculates one fewer piece of its product.
 550         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 551
 552         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 553         accum                    xmm5, xmm6, xmm7, nil
 554
 555         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 556         accum                    xmm6, xmm7, nil,  nil
 557
 558         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 559         accum                    xmm7, nil,  nil,  nil
 560
 561         // That's lots of pieces.  Now we have to assemble the answer.
 562         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 563
 564         // Expand it.
 565         pxor    xmm2, xmm2
 566         expand  xmm4, xmm1, nil, nil, xmm2
 567         movdqa  [edx +  0], xmm4
 568         movdqa  [edx + 16], xmm1
 569
 570         // Initialize the carry from the value for W we calculated earlier.
 571         movd    xmm4, [edi +  0]
 572         movd    xmm5, [edi +  4]
 573         movd    xmm6, [edi +  8]
 574         movd    xmm7, [edi + 12]
 575
 576         // Finish the calculation by adding the Montgomery product.
 577         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 578         propout [edi +  0],      xmm4, xmm5
 579
 580         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 581         propout [edi +  4],      xmm5, xmm6
 582
 583         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 584         propout [edi +  8],      xmm6, xmm7
 585
 586         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 587         propout [edi + 12],      xmm7, xmm4
 588
 589         // And add on the carry we calculated earlier.
 590         paddq   xmm4, [esp +  0]
 591         paddq   xmm5, [esp + 16]
 592         paddq   xmm6, [esp + 32]
 593
 594         // And, with that, we're done.
 595         add     esp, 64
 596         ret
 597
 598         .p2align 4
 599 mont4:
 600         // On entry, EDI points to the destination buffer holding a packed
 601         // value W; EBX points to a packed operand N; ESI points to an
 602         // expanded operand M; and EDX points to a place to store an expanded
 603         // result Y (32 bytes, at a 16-byte boundary).
 604         //
 605         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 606         // of the sum W + N Y to [EDI], leaving the remaining carry in
 607         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 608         // XMM7 are clobbered; the general-purpose registers are preserved.
             //
             // NOTE(review): the Y stored at [EDX] is the expanded form of a
             // 128-bit value (the low 128 bits produced by `squash').  With
             // B = 2^32 as in the theory section, that is a reduction mod B^4;
             // the `mod B' above presumably uses B for the 128-bit block radix
             // -- confirm.
             //
             // Unlike `mmul4' and `mmla4', this routine uses no stack space.
 609
 610         // Calculate Y = W M.
             // Only the low 128 bits are wanted, so each successive word of W
             // calculates one fewer piece of its product.
 611         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 612
 613         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 614         accum                    xmm5, xmm6, xmm7, nil
 615
 616         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 617         accum                    xmm6, xmm7, nil,  nil
 618
 619         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 620         accum                    xmm7, nil,  nil,  nil
 621
 622         // That's lots of pieces.  Now we have to assemble the answer.
 623         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 624
 625         // Expand it.
 626         pxor    xmm2, xmm2
 627         expand  xmm4, xmm1, nil, nil, xmm2
 628         movdqa  [edx +  0], xmm4
 629         movdqa  [edx + 16], xmm1
 630
 631         // Initialize the carry from W.
 632         movd    xmm4, [edi +  0]
 633         movd    xmm5, [edi +  4]
 634         movd    xmm6, [edi +  8]
 635         movd    xmm7, [edi + 12]
 636
 637         // Finish the calculation by adding the Montgomery product.
 638         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 639         propout [edi +  0],      xmm4, xmm5
 640
 641         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 642         propout [edi +  4],      xmm5, xmm6
 643
 644         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 645         propout [edi +  8],      xmm6, xmm7
 646
 647         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 648         propout [edi + 12],      xmm7, xmm4
 649
 650         // And, with that, we're done.
 651         ret
 652
 653 ///--------------------------------------------------------------------------
 654 /// Bulk multipliers.
 655
  656 FUNC(mpx_umul4_x86_sse2)
  657         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
  658         //                         const mpw *bv, const mpw *bvl);
              //
              // Bulk multiplier driver.  Each outer pass consumes one 16-byte
              // block of bv, expands it onto the stack, and then walks av and
              // dv in 16-byte steps by calling the fixed-size primitives:
              // mul4zc then mul4 on the first pass, mla4zc then mla4 on later
              // passes.  Every pass finishes with a call to carryprop.
              //
              // NOTE(review): no partial block is handled anywhere below, so
              // both operand lengths are presumably nonzero multiples of four
              // words; confirm against the callers.
  659
  660         // Build a stack frame.  Arguments will be relative to EBP, as
  661         // follows.  (Four saved registers plus the return address put
  662         // the first argument at ebp + 20.)
  663         //      ebp + 20        dv
  664         //      ebp + 24        av
  665         //      ebp + 28        avl
  666         //      ebp + 32        bv
  667         //      ebp + 36        bvl
  668         //
  669         // Locals are relative to ESP, as follows.
  670         //
  671         //      esp +  0        expanded Y (32 bytes)
  672         //      esp + 32        (top of locals)
  673         push    ebp
  674         push    ebx
  675         push    esi
  676         push    edi
  677         mov     ebp, esp                // frame pointer; restores ESP at exit
  678         and     esp, ~15                // 16-byte align for the movdqa stores
  679         sub     esp, 32                 // room for expanded Y
  680
  681         // Prepare for the first iteration.
  682         mov     esi, [ebp + 32]         // -> bv[0]
  683         pxor    xmm7, xmm7              // zero, passed to expand
  684         movdqu  xmm0, [esi]             // bv[0]
  685         mov     edi, [ebp + 20]         // -> dv[0]
  686         mov     ecx, edi                // outer loop dv cursor
  687         expand  xmm0, xmm1, nil, nil, xmm7
  688         mov     ebx, [ebp + 24]         // -> av[0]
  689         mov     eax, [ebp + 28]         // -> av[m] = av limit
  690         mov     edx, esp                // -> expanded Y = bv[0]
  691         movdqa  [esp + 0], xmm0         // bv[0] expanded low
  692         movdqa  [esp + 16], xmm1        // bv[0] expanded high
  693         call    mul4zc
  694         add     ebx, 16                 // next block of av
  695         add     edi, 16                 // next block of dv
  696         add     ecx, 16                 // next pass starts one block later
  697         add     esi, 16                 // next block of bv
  698         cmp     ebx, eax                // all done?
  699         jae     8f
  700
  701         .p2align 4
  702         // Continue with the first iteration.
  703 0:      call    mul4
  704         add     ebx, 16
  705         add     edi, 16
  706         cmp     ebx, eax                // all done?
  707         jb      0b
  708
  709         // Write out the leftover carry.  There can be no tail here.
  710 8:      call    carryprop
  711         cmp     esi, [ebp + 36]         // more passes to do?
  712         jae     9f
  713
  714         .p2align 4
  715         // Set up for the next pass.  EAX (av limit) and EDX (-> expanded
              // Y) are not reloaded: they are still live from the first pass.
  716 1:      movdqu  xmm0, [esi]             // bv[i]
  717         mov     edi, ecx                // -> dv[i]
  718         pxor    xmm7, xmm7
  719         expand  xmm0, xmm1, nil, nil, xmm7
  720         mov     ebx, [ebp + 24]         // -> av[0]
  721         movdqa  [esp + 0], xmm0         // bv[i] expanded low
  722         movdqa  [esp + 16], xmm1        // bv[i] expanded high
  723         call    mla4zc
  724         add     edi, 16
  725         add     ebx, 16
  726         add     ecx, 16
  727         add     esi, 16
  728         cmp     ebx, eax                // done yet?
  729         jae     8f
  730
  731         .p2align 4
  732         // Continue...
  733 0:      call    mla4
  734         add     ebx, 16
  735         add     edi, 16
  736         cmp     ebx, eax                // reached the av limit?
  737         jb      0b
  738
  739         // Finish off this pass.  There was no tail on the previous pass, and
  740         // there can be none on this pass.
  741 8:      call    carryprop
  742         cmp     esi, [ebp + 36]         // more blocks of bv left?
  743         jb      1b
  744
  745         // All over.
  746 9:      mov     esp, ebp                // drop the aligned locals
  747         pop     edi
  748         pop     esi
  749         pop     ebx
  750         pop     ebp
  751         ret
  752
  753 ENDFUNC
 754
 755 FUNC(mpxmont_mul4_x86_sse2)
 756         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 757         //                           const mpw *nv, size_t n, const mpw *mi);
 758
 759         // Build a stack frame.  Arguments will be relative to EBP, as
 760         // follows.
 761         //
 762         //      ebp + 20        dv
 763         //      ebp + 24        av
 764         //      ebp + 28        bv
 765         //      ebp + 32        nv
 766         //      ebp + 36        n (nonzero multiple of 4)
 767         //      ebp + 40        mi
 768         //
 769         // Locals are relative to ESP, which is 4 mod 16, as follows.
 770         //
 771         //      esp +   0       outer loop dv
 772         //      esp +   4       outer loop bv
 773         //      esp +   8       av limit (mostly in ESI)
 774         //      esp +  12       expanded V (32 bytes)
 775         //      esp +  44       expanded M (32 bytes)
 776         //      esp +  76       expanded Y (32 bytes)
 777         //      esp + 108       bv limit
 778         //      esp + 112       (gap)
 779         //      esp + 124       (top of locals)
 780         push    ebp
 781         push    ebx
 782         push    esi
 783         push    edi
 784         mov     ebp, esp
 785         and     esp, ~15
 786         sub     esp, 124
 787
 788         // Establish the expanded operands.
 789         pxor    xmm7, xmm7
 790         mov     ecx, [ebp + 28]         // -> bv
 791         mov     edx, [ebp + 40]         // -> mi
 792         movdqu  xmm0, [ecx]             // bv[0]
 793         movdqu  xmm2, [edx]             // mi
 794         expand  xmm0, xmm1, xmm2, xmm3, xmm7
 795         movdqa  [esp + 12], xmm0        // bv[0] expanded low
 796         movdqa  [esp + 28], xmm1        // bv[0] expanded high
 797         movdqa  [esp + 44], xmm2        // mi expanded low
 798         movdqa  [esp + 60], xmm3        // mi expanded high
 799
 800         // Set up the outer loop state and prepare for the first iteration.
 801         mov     edx, [ebp + 36]         // n
 802         mov     eax, [ebp + 24]         // -> U = av[0]
 803         mov     ebx, [ebp + 32]         // -> X = nv[0]
 804         mov     edi, [ebp + 20]         // -> Z = dv[0]
 805         mov     [esp + 4], ecx
 806         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 807         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 808         mov     [esp + 0], edi
 809         mov     [esp + 108], ecx
 810         mov     [esp + 8], edx
 811         lea     ecx, [esp + 12]         // -> expanded V = bv[0]
 812         lea     esi, [esp + 44]         // -> expanded M = mi
 813         lea     edx, [esp + 76]         // -> space for Y
 814         call    mmul4
 815         mov     esi, [esp + 8]          // recover av limit
 816         add     edi, 16
 817         add     eax, 16
 818         add     ebx, 16
 819         cmp     eax, esi                // done already?
 820         jae     8f
 821         mov     [esp + 0], edi
 822
 823         .p2align 4
 824         // Complete the first inner loop.
 825 0:      call    dmul4
 826         add     edi, 16
 827         add     eax, 16
 828         add     ebx, 16
 829         cmp     eax, esi                // done yet?
 830         jb      0b
 831
 832         // Still have carries left to propagate.
 833         call    carryprop
 834         movd    [edi + 16], xmm4
 835
 836         .p2align 4
 837         // Embark on the next iteration.  (There must be one.  If n = 1, then
 838         // we would have bailed above, to label 8.  Similarly, the subsequent
 839         // iterations can fall into the inner loop immediately.)
 840 1:      mov     eax, [esp + 4]          // -> bv[i - 1]
 841         mov     edi, [esp + 0]          // -> Z = dv[i]
 842         add     eax, 16                 // -> bv[i]
 843         pxor    xmm7, xmm7
 844         movdqu  xmm0, [eax]             // bv[i]
 845         mov     [esp + 4], eax
 846         cmp     eax, [esp + 108]        // done yet?
 847         jae     9f
 848         mov     ebx, [ebp + 32]         // -> X = nv[0]
 849         lea     esi, [esp + 44]         // -> expanded M = mi
 850         mov     eax, [ebp + 24]         // -> U = av[0]
 851         expand  xmm0, xmm1, nil, nil, xmm7
 852         movdqa  [esp + 12], xmm0        // bv[i] expanded low
 853         movdqa  [esp + 28], xmm1        // bv[i] expanded high
 854         call    mmla4
 855         mov     esi, [esp + 8]          // recover av limit
 856         add     edi, 16
 857         add     eax, 16
 858         add     ebx, 16
 859         mov     [esp + 0], edi
 860
 861         .p2align 4
 862         // Complete the next inner loop.
 863 0:      call    dmla4
 864         add     edi, 16
 865         add     eax, 16
 866         add     ebx, 16
 867         cmp     eax, esi
 868         jb      0b
 869
 870         // Still have carries left to propagate, and they overlap the
 871         // previous iteration's final tail, so read that in and add it.
 872         movd    xmm0, [edi]
 873         paddq   xmm4, xmm0
 874         call    carryprop
 875         movd    [edi + 16], xmm4
 876
 877         // Back again.
 878         jmp     1b
 879
 880         // First iteration was short.  Write out the carries and we're done.
 881         // (This could be folded into the main loop structure, but that would
 882         // penalize small numbers more.)
 883 8:      call    carryprop
 884         movd    [edi + 16], xmm4
 885
 886         // All done.
 887 9:      mov     esp, ebp
 888         pop     edi
 889         pop     esi
 890         pop     ebx
 891         pop     ebp
 892         ret
 893
 894 ENDFUNC
 895
 896 FUNC(mpxmont_redc4_x86_sse2)
 897         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 898         //                             size_t n, const mpw *mi);
 899
 900         // Build a stack frame.  Arguments will be relative to EBP, as
 901         // follows.
 902         //
 903         //      ebp + 20        dv
 904         //      ebp + 24        dvl
 905         //      ebp + 28        nv
 906         //      ebp + 32        n (nonzero multiple of 4)
 907         //      ebp + 36        mi
 908         //
 909         // Locals are relative to ESP, as follows.
 910         //
 911         //      esp +  0        outer loop dv
 912         //      esp +  4        outer dv limit
 913         //      esp +  8        blocks-of-4 dv limit
 914         //      esp + 12        expanded M (32 bytes)
 915         //      esp + 44        expanded Y (32 bytes)
 916         //      esp + 76        (top of locals)
 917         push    ebp
 918         push    ebx
 919         push    esi
 920         push    edi
 921         mov     ebp, esp
 922         and     esp, ~15
 923         sub     esp, 76
 924
 925         // Establish the expanded operands and the blocks-of-4 dv limit.
 926         mov     edi, [ebp + 20]         // -> Z = dv[0]
 927         pxor    xmm7, xmm7
 928         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 929         sub     eax, edi                // length of dv in bytes
 930         mov     edx, [ebp + 36]         // -> mi
 931         movdqu  xmm0, [edx]             // mi
 932         and     eax, ~15                // mask off the tail end
 933         expand  xmm0, xmm1, nil, nil, xmm7
 934         add     eax, edi                // find limit
 935         movdqa  [esp + 12], xmm0        // mi expanded low
 936         movdqa  [esp + 28], xmm1        // mi expanded high
 937         mov     [esp + 8], eax
 938
 939         // Set up the outer loop state and prepare for the first iteration.
 940         mov     ecx, [ebp + 32]         // n
 941         mov     ebx, [ebp + 28]         // -> X = nv[0]
 942         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
 943         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
 944         mov     [esp + 0], edi
 945         mov     [esp + 4], edx
 946         lea     esi, [esp + 12]         // -> expanded M = mi
 947         lea     edx, [esp + 44]         // -> space for Y
 948         call    mont4
 949         add     edi, 16
 950         add     ebx, 16
 951         cmp     ebx, ecx                // done already?
 952         jae     8f
 953
 954         .p2align 4
 955         // Complete the first inner loop.
 956 5:      call    mla4
 957         add     ebx, 16
 958         add     edi, 16
 959         cmp     ebx, ecx                // done yet?
 960         jb      5b
 961
 962         // Still have carries left to propagate.
 963 8:      carryadd
 964         mov     esi, [esp + 8]          // -> dv blocks limit
 965         mov     edx, [ebp + 24]         // dv limit
 966         psllq   xmm7, 16
 967         pslldq  xmm7, 8
 968         paddq   xmm6, xmm7
 969         call    carryprop
 970         movd    eax, xmm4
 971         add     edi, 16
 972         cmp     edi, esi
 973         jae     7f
 974
 975         .p2align 4
 976         // Continue carry propagation until the end of the buffer.
 977 0:      add     [edi], eax
 978         mov     eax, 0                  // preserves flags
 979         adcd    [edi + 4], 0
 980         adcd    [edi + 8], 0
 981         adcd    [edi + 12], 0
 982         adc     eax, 0
 983         add     edi, 16
 984         cmp     edi, esi
 985         jb      0b
 986
 987         // Deal with the tail end.
 988 7:      add     [edi], eax
 989         mov     eax, 0                  // preserves flags
 990         add     edi, 4
 991         adc     eax, 0
 992         cmp     edi, edx
 993         jb      7b
 994
 995         // All done for this iteration.  Start the next.  (This must have at
 996         // least one follow-on iteration, or we'd not have started this outer
 997         // loop.)
 998 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
 999         mov     ebx, [ebp + 28]         // -> X = nv[0]
1000         lea     edx, [esp + 44]         // -> space for Y
1001         lea     esi, [esp + 12]         // -> expanded M = mi
1002         add     edi, 16                 // -> Z = dv[i]
1003         cmp     edi, [esp + 4]          // all done yet?
1004         jae     9f
1005         mov     [esp + 0], edi
1006         call    mont4
1007         add     edi, 16
1008         add     ebx, 16
1009         jmp     5b
1010
1011         // All over.
1012 9:      mov     esp, ebp
1013         pop     edi
1014         pop     esi
1015         pop     ebx
1016         pop     ebp
1017         ret
1018
1019 ENDFUNC
1020
1021 ///--------------------------------------------------------------------------
1022 /// Testing and performance measurement.
1023
1024 #ifdef TEST_MUL4
1025
 1026 .macro  cysetup c
              // Record the starting cycle count: read the time-stamp counter
              // and store its 64-bit value (low word first) at address c.
              // Clobbers EAX and EDX.
 1027         rdtsc
 1028         mov     [\c], eax
 1029         mov     [\c + 4], edx
 1030 .endm
1031
 1032 .macro  cystore c, v, n
              // Record an elapsed cycle count.  Reads the time-stamp counter,
              // subtracts the 64-bit start value held at address c, decrements
              // the counter held at n, and stores the 64-bit difference in
              // slot number (new counter value) of the array of 8-byte
              // entries whose base address is held at v.
              //
              // The flags set by `dec' survive to the end of the macro (the
              // remaining instructions are all `mov'), so Z is set on exit
              // exactly when the counter has reached zero.  Clobbers EAX, EBX,
              // ECX and EDX.
              //
              // NOTE(review): testtail passes v and n already wrapped in
              // brackets, so the operands below end up doubly bracketed;
              // presumably the assembler treats that as one memory reference.
 1033         rdtsc
 1034         sub     eax, [\c]
 1035         sbb     edx, [\c + 4]
 1036         mov     ebx, [\v]
 1037         mov     ecx, [\n]
 1038         dec     ecx
 1039         mov     [\n], ecx
 1040         mov     [ebx + ecx*8], eax
 1041         mov     [ebx + ecx*8 + 4], edx
 1042 .endm
1043
 1044 .macro  testprologue
              // Common entry sequence for the test entry points: save the
              // callee-saved registers, keep the incoming ESP in EBP (so the
              // first stack argument is at ebp + 20), and carve out locals.
              // ESP ends up 4 mod 16, so esp + 12 and the slots after it are
              // 16-byte aligned.
 1045         push    ebp
 1046         push    ebx
 1047         push    esi
 1048         push    edi
 1049         mov     ebp, esp
 1050         and     esp, ~15
 1051         sub     esp, 3*32 + 12
 1052         // vars:
 1053         //      esp +  0 = cycles
 1054         //      esp + 12 = v expanded
 1055         //      esp + 44 = y expanded
 1056         //      esp + 76 = ? expanded
 1057 .endm
1058
 1059 .macro  testepilogue
              // Common exit sequence matching testprologue: discard the
              // locals by restoring ESP from EBP, pop the saved registers in
              // reverse order, and return.
 1060         mov     esp, ebp
 1061         pop     edi
 1062         pop     esi
 1063         pop     ebx
 1064         pop     ebp
 1065         ret
 1066 .endm
1067
 1068 .macro  testldcarry c
              // Load the three carry registers XMM4, XMM5 and XMM6 from the
              // 48-byte block that operand c points to.  Unaligned loads are
              // used, so the block needs no particular alignment.  Clobbers
              // ECX.
 1069         mov     ecx, \c                 // -> c
 1070         movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
 1071         movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
 1072         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
 1073 .endm
1074
 1075 .macro  testexpand v, y
              // Expand up to two 16-byte operands into the local frame.
              // Operand v, unless it is the literal `nil', is loaded through
              // ECX and its expansion stored at esp + 12 and esp + 28.
              // Operand y, unless `nil', is loaded through EDX and its
              // expansion stored at esp + 44 and esp + 60.  XMM7 is zeroed
              // first and passed to each expand.  Clobbers XMM0--XMM3 and
              // XMM7, plus ECX and EDX, depending on which operands are given.
 1076         pxor    xmm7, xmm7
 1077   .ifnes "\v", "nil"
 1078         mov     ecx, \v
 1079         movdqu  xmm0, [ecx]
 1080         expand  xmm0, xmm1, nil, nil, xmm7
 1081         movdqa  [esp + 12], xmm0
 1082         movdqa  [esp + 28], xmm1
 1083   .endif
 1084   .ifnes "\y", "nil"
 1085         mov     edx, \y
 1086         movdqu  xmm2, [edx]
 1087         expand  xmm2, xmm3, nil, nil, xmm7
 1088         movdqa  [esp + 44], xmm2
 1089         movdqa  [esp + 60], xmm3
 1090   .endif
 1091 .endm
1092
 1093 .macro  testtop u, x, mode
              // Head of the timing loop.  Defines the local label 0 that
              // testtail branches back to, then reloads the pointer registers
              // for the primitive under test on every trip (cystore clobbers
              // EBX and ECX, so they cannot simply be set up once):
              //
              //      ecx     -> esp + 12, only if u is not `nil'
              //      ebx     operand x
              //      esi     -> esp + 44, only if mode is `mont'
              //      eax     operand u, only if u is not `nil'
              //      edx     -> esp + 76 if mode is `mont', else -> esp + 44
              //
              // The starting cycle count is recorded at esp + 0 part-way
              // through; EAX and EDX are loaded after that because cysetup
              // overwrites them.
 1094         .p2align 4
 1095 0:
 1096   .ifnes "\u", "nil"
 1097         lea     ecx, [esp + 12]
 1098   .endif
 1099         mov     ebx, \x
 1100   .ifeqs "\mode", "mont"
 1101         lea     esi, [esp + 44]
 1102   .endif
 1103         cysetup esp + 0
 1104   .ifnes "\u", "nil"
 1105         mov     eax, \u
 1106   .endif
 1107   .ifeqs "\mode", "mont"
 1108         lea     edx, [esp + 76]
 1109   .else
 1110         lea     edx, [esp + 44]
 1111   .endif
 1112 .endm
1113
 1114 .macro  testtail cyv, n
              // Foot of the timing loop.  Stores the elapsed cycle count for
              // this trip via cystore, which also decrements the counter n,
              // and branches back to label 0 (defined by testtop) until that
              // counter reaches zero.
 1115         cystore esp + 0, \cyv, \n
 1116         jnz     0b
 1117 .endm
1118
 1119 .macro  testcarryout c
              // Store the three carry registers XMM4, XMM5 and XMM6 to the
              // 48-byte block that operand c points to, using unaligned
              // stores.  Clobbers ECX.
 1120         mov     ecx, \c
 1121         movdqu  [ecx +  0], xmm4
 1122         movdqu  [ecx + 16], xmm5
 1123         movdqu  [ecx + 32], xmm6
 1124 .endm
1125
 1126         .globl  test_dmul4
 1127 test_dmul4:
              // Timing harness for dmul4.  Stack arguments, as used below
              // (the C-level prototype is not visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI
              //      ebp + 24        -> 48-byte carry block, loaded into
              //                      XMM4--XMM6 first and stored back at the end
              //      ebp + 28        pointer loaded into EAX (operand u)
              //      ebp + 32        pointer loaded into EBX (operand x)
              //      ebp + 36        -> 16 bytes, expanded to esp + 12
              //      ebp + 40        -> 16 bytes, expanded to esp + 44
              //      ebp + 44        trip counter, counted down to zero
              //      ebp + 48        -> array receiving 64-bit cycle counts
 1128         testprologue
 1129         testldcarry [ebp + 24]
 1130         testexpand [ebp + 36], [ebp + 40]
 1131         mov     edi, [ebp + 20]
 1132         testtop [ebp + 28], [ebp + 32]
 1133         call    dmul4
 1134         testtail [ebp + 48], [ebp + 44]
 1135         testcarryout [ebp + 24]
 1136         testepilogue
1137
 1138         .globl  test_dmla4
 1139 test_dmla4:
              // Timing harness for dmla4.  Stack arguments, as used below
              // (the C-level prototype is not visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI
              //      ebp + 24        -> 48-byte carry block, loaded into
              //                      XMM4--XMM6 first and stored back at the end
              //      ebp + 28        pointer loaded into EAX (operand u)
              //      ebp + 32        pointer loaded into EBX (operand x)
              //      ebp + 36        -> 16 bytes, expanded to esp + 12
              //      ebp + 40        -> 16 bytes, expanded to esp + 44
              //      ebp + 44        trip counter, counted down to zero
              //      ebp + 48        -> array receiving 64-bit cycle counts
 1140         testprologue
 1141         testldcarry [ebp + 24]
 1142         testexpand [ebp + 36], [ebp + 40]
 1143         mov     edi, [ebp + 20]
 1144         testtop [ebp + 28], [ebp + 32]
 1145         call    dmla4
 1146         testtail [ebp + 48], [ebp + 44]
 1147         testcarryout [ebp + 24]
 1148         testepilogue
1149
 1150         .globl  test_mul4
 1151 test_mul4:
              // Timing harness for mul4.  Stack arguments, as used below
              // (the C-level prototype is not visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI
              //      ebp + 24        -> 48-byte carry block, loaded into
              //                      XMM4--XMM6 first and stored back at the end
              //      ebp + 28        pointer loaded into EBX (operand x)
              //      ebp + 32        -> 16 bytes, expanded to esp + 44, which
              //                      EDX points at during the call
              //      ebp + 36        trip counter, counted down to zero
              //      ebp + 40        -> array receiving 64-bit cycle counts
 1152         testprologue
 1153         testldcarry [ebp + 24]
 1154         testexpand nil, [ebp + 32]
 1155         mov     edi, [ebp + 20]
 1156         testtop nil, [ebp + 28]
 1157         call    mul4
 1158         testtail [ebp + 40], [ebp + 36]
 1159         testcarryout [ebp + 24]
 1160         testepilogue
1161
 1162         .globl  test_mla4
 1163 test_mla4:
              // Timing harness for mla4.  Stack arguments, as used below
              // (the C-level prototype is not visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI
              //      ebp + 24        -> 48-byte carry block, loaded into
              //                      XMM4--XMM6 first and stored back at the end
              //      ebp + 28        pointer loaded into EBX (operand x)
              //      ebp + 32        -> 16 bytes, expanded to esp + 44, which
              //                      EDX points at during the call
              //      ebp + 36        trip counter, counted down to zero
              //      ebp + 40        -> array receiving 64-bit cycle counts
 1164         testprologue
 1165         testldcarry [ebp + 24]
 1166         testexpand nil, [ebp + 32]
 1167         mov     edi, [ebp + 20]
 1168         testtop nil, [ebp + 28]
 1169         call    mla4
 1170         testtail [ebp + 40], [ebp + 36]
 1171         testcarryout [ebp + 24]
 1172         testepilogue
1173
 1174         .globl  test_mmul4
 1175 test_mmul4:
              // Timing harness for mmul4.  No carry block is loaded on entry.
              // Stack arguments, as used below (the C-level prototype is not
              // visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI for the call
              //      ebp + 24        -> 48-byte block receiving XMM4--XMM6
              //      ebp + 28        -> 32 bytes receiving a copy of the local
              //                      at esp + 76 (where EDX points in the call)
              //      ebp + 32        pointer loaded into EAX (operand u)
              //      ebp + 36        pointer loaded into EBX (operand x)
              //      ebp + 40        -> 16 bytes, expanded to esp + 12
              //      ebp + 44        -> 16 bytes, expanded to esp + 44
              //      ebp + 48        trip counter, counted down to zero
              //      ebp + 52        -> array receiving 64-bit cycle counts
 1176         testprologue
 1177         testexpand [ebp + 40], [ebp + 44]
 1178         mov     edi, [ebp + 20]
 1179         testtop [ebp + 32], [ebp + 36], mont
 1180         call    mmul4
 1181         testtail [ebp + 52], [ebp + 48]
 1182         mov     edi, [ebp + 28]         // -> caller buffer for the Y copy
 1183         movdqa  xmm0, [esp + 76]
 1184         movdqa  xmm1, [esp + 92]
 1185         movdqu  [edi], xmm0
 1186         movdqu  [edi + 16], xmm1
 1187         testcarryout [ebp + 24]
 1188         testepilogue
1189
 1190         .globl  test_mmla4
 1191 test_mmla4:
              // Timing harness for mmla4.  No carry block is loaded on entry.
              // Stack arguments, as used below (the C-level prototype is not
              // visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI for the call
              //      ebp + 24        -> 48-byte block receiving XMM4--XMM6
              //      ebp + 28        -> 32 bytes receiving a copy of the local
              //                      at esp + 76 (where EDX points in the call)
              //      ebp + 32        pointer loaded into EAX (operand u)
              //      ebp + 36        pointer loaded into EBX (operand x)
              //      ebp + 40        -> 16 bytes, expanded to esp + 12
              //      ebp + 44        -> 16 bytes, expanded to esp + 44
              //      ebp + 48        trip counter, counted down to zero
              //      ebp + 52        -> array receiving 64-bit cycle counts
 1192         testprologue
 1193         testexpand [ebp + 40], [ebp + 44]
 1194         mov     edi, [ebp + 20]
 1195         testtop [ebp + 32], [ebp + 36], mont
 1196         call    mmla4
 1197         testtail [ebp + 52], [ebp + 48]
 1198         mov     edi, [ebp + 28]         // -> caller buffer for the Y copy
 1199         movdqa  xmm0, [esp + 76]
 1200         movdqa  xmm1, [esp + 92]
 1201         movdqu  [edi], xmm0
 1202         movdqu  [edi + 16], xmm1
 1203         testcarryout [ebp + 24]
 1204         testepilogue
1205
 1206         .globl  test_mont4
 1207 test_mont4:
              // Timing harness for mont4.  No carry block is loaded on entry.
              // Stack arguments, as used below (the C-level prototype is not
              // visible in this file):
              //
              //      ebp + 20        pointer loaded into EDI for the call
              //      ebp + 24        -> 48-byte block receiving XMM4--XMM6
              //      ebp + 28        -> 32 bytes receiving a copy of the local
              //                      at esp + 76 (where EDX points in the call)
              //      ebp + 32        pointer loaded into EBX (operand x)
              //      ebp + 36        -> 16 bytes, expanded to esp + 44, which
              //                      ESI points at during the call
              //      ebp + 40        trip counter, counted down to zero
              //      ebp + 44        -> array receiving 64-bit cycle counts
 1208         testprologue
 1209         testexpand nil, [ebp + 36]
 1210         mov     edi, [ebp + 20]
 1211         testtop nil, [ebp + 32], mont
 1212         call    mont4
 1213         testtail [ebp + 44], [ebp + 40]
 1214         mov     edi, [ebp + 28]         // -> caller buffer for the Y copy
 1215         movdqa  xmm0, [esp + 76]
 1216         movdqa  xmm1, [esp + 92]
 1217         movdqu  [edi], xmm0
 1218         movdqu  [edi + 16], xmm1
 1219         testcarryout [ebp + 24]
 1220         testepilogue
1221
1222 #endif
1223
1224 ///----- That's all, folks --------------------------------------------------