chiark - git - mdw - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
  67 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  91 /// zeroed becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1, d2, d3
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludqd \d2, [\s + 16]
 131     .endif
 132   .endif
 133         pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1, c2, c3
 137         paddq   \c0, xmm0
 138   .ifnes "\c1", "nil"
 139         paddq   \c1, xmm1
 140   .endif
 141   .ifnes "\c2", "nil"
 142         paddq   \c2, xmm2
 143   .endif
 144   .ifnes "\c3", "nil"
 145         paddq   \c3, xmm3
 146   .endif
 147 .endm
 148
 149 .macro  mulacc  r, s, c0, c1, c2, c3, z3p
 150         // Load a word r_i from R, multiply by the expanded operand [S],
 151         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 152         // then C3 notionally contains zero, but needs clearing; in practice,
 153         // we store the product directly rather than attempting to add.  On
 154         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 155         // is not `t'.
 156   .ifeqs "\z3p", "t"
 157         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 158         accum           \c0,  \c1,  \c2,  nil
 159   .else
 160         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 161         accum           \c0,  \c1,  \c2,  \c3
 162   .endif
 163 .endm
 164
 165 .macro  propout d, c, cc
 166         // Calculate an output word from C, and store it in D; propagate
 167         // carries out from C to CC in preparation for a rotation of the
 168         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 169         // `nil', then the contribution which would have been added to it is
 170         // left in C.
 171         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 172         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 173         pslldq  xmm3, 2                 // (t b, 0)
 174         paddq   \c, xmm3                // (c' + t b, c'')
 175         movd    \d, \c
 176         psrlq   \c, 32                  // floor(c/B)
 177   .ifnes "\cc", "nil"
 178         paddq   \cc, \c                 // propagate up
 179   .endif
 180 .endm
 181
 182 .macro  endprop d, c, t
 183         // On entry, C contains a carry register.  On exit, the low 32 bits
 184         // of the value represented in C are written to D, and the remaining
 185         // bits are left at the bottom of T.
 186         movdqa  \t, \c
 187         psllq   \t, 16                  // (?, c'' b)
 188         pslldq  \c, 8                   // (0, c')
 189         paddq   \t, \c                  // (?, c' + c'' b)
 190         psrldq  \t, 8                   // c' + c'' b
 191         movd    \d, \t
 192         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 193 .endm
 194
 195 .macro  expand  a, b, c, d, z
 196         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 197         // exit, A:B and C:D together hold the same values in expanded
 198         // form.  If C is `nil', then only expand A to A:B.
 199         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 200   .ifnes "\c", "nil"
 201         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 202   .endif
 203         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 204         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 205   .ifnes "\c", "nil"
 206         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 207         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 208   .endif
 209         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 210         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 211   .ifnes "\c", "nil"
 212         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 213         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 214   .endif
 215 .endm
 216
 217 .macro  squash  c0, c1, c2, c3, h, t, u
 218         // On entry, C0, C1, C2, C3 are carry registers representing a value
 219         // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
 220         // C3, T, and U are clobbered; and the high bits of Y are stored in
 221         // H, if this is not `nil'.
 222
 223         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 224         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 225         // them into the 32-bit-aligned pieces above and below.  But before
 226         // we can do that, we must gather them together.
 227         movdqa  \t, \c0
 228         movdqa  \u, \c1
 229         punpcklqdq \t, \c2              // (y'_0, y'_2)
 230         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 231         punpcklqdq \u, \c3              // (y'_1, y'_3)
 232         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 233
 234         // Now split the double-prime pieces.  The high (up to) 48 bits will
 235         // go up; the low 16 bits go down.
 236         movdqa  \c2, \c0
 237         movdqa  \c3, \c1
 238         psllq   \c2, 48
 239         psllq   \c3, 48
 240         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 241         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 242         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 243         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 244   .ifnes "\h", "nil"
 245         movdqa  \h, \c1
 246   .endif
 247         pslldq  \c1, 8                  // high part of (0, y''_1)
 248
 249         paddq   \t, \c2                 // propagate down
 250         paddq   \u, \c3
 251         paddq   \t, \c1                 // and up: (y_0, y_2)
 252         paddq   \u, \c0                 // (y_1, y_3)
 253   .ifnes "\h", "nil"
 254         psrldq  \h, 8                   // high part of (y''_3, 0)
 255   .endif
 256
 257         // Finally extract the answer.  This complicated dance is better than
 258         // storing to memory and loading, because the piecemeal stores
 259         // inhibit store forwarding.
 260         movdqa  \c3, \t                 // (y_0, y_1)
 261         movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
 262         psrldq  \t, 8                   // (y_2, 0)
 263         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 264         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 265         pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
 266         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 267         psrldq  \u, 8                   // (y_3, 0)
 268         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
 269         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
 270         pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
 271         psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
 272         movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 275         pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
 276         psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
 277         psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
 278   .ifnes "\h", "nil"
 279         movdqu  \t, \c3
 280         pxor    \u, \u
 281   .endif
 282         pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
 283         por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
 284         por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
 285         por     \c0, \c2                // y mod B^4
 286   .ifnes "\h", "nil"
 287         psrlq   \t, 32                  // very high bits of y
 288         paddq   \h, \t
 289         punpcklqdq \h, \u               // carry up
 290   .endif
 291 .endm
 292
 293 .macro  carryadd
 294         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 295         // hold the incoming carry registers c0, c1, and c2 representing a
 296         // carry-in C.
 297         //
 298         // On exit, the carry registers, including XMM7, are updated to hold
 299         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 300         // registers are preserved.
 301         movd    xmm0, [edi +  0]        // (a_0, 0)
 302         movd    xmm1, [edi +  4]        // (a_1, 0)
 303         movd    xmm2, [edi +  8]        // (a_2, 0)
 304         movd    xmm7, [edi + 12]        // (a_3, 0)
 305
 306         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
 307         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
 308         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
 309 .endm
 310
 311 ///--------------------------------------------------------------------------
 312 /// Primitive multipliers and related utilities.
 313
 314 INTFUNC(carryprop)
 315         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 316         // form.  Store the low 128 bits of the represented carry to [EDI] as
 317         // a packed 128-bit value, and leave the remaining 16 bits in the low
 318         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
 319   endprologue
 320
 321         propout [edi +  0], xmm4, xmm5
 322         propout [edi +  4], xmm5, xmm6
 323         propout [edi +  8], xmm6, nil
 324         endprop [edi + 12], xmm6, xmm4
 325         ret
 326
 327 ENDFUNC
 328
 329 INTFUNC(dmul4)
 330         // On entry, EDI points to the destination buffer; EAX and EBX point
 331         // to the packed operands U and X; ECX and EDX point to the expanded
 332         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 333         // registers c0, c1, and c2; c3 is assumed to be zero.
 334         //
 335         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 336         // [EDI], and update the carry registers with the carry out.  The
 337         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 338         // general-purpose registers are preserved.
 339   endprologue
 340
 341         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 342         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 343         propout [edi +  0],      xmm4, xmm5
 344
 345         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 346         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 347         propout [edi +  4],      xmm5, xmm6
 348
 349         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 350         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 351         propout [edi +  8],      xmm6, xmm7
 352
 353         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 354         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 355         propout [edi + 12],      xmm7, xmm4
 356
 357         ret
 358
 359 ENDFUNC
 360
 361 INTFUNC(dmla4)
 362         // On entry, EDI points to the destination buffer, which also
 363         // contains an addend A to accumulate; EAX and EBX point to the
 364         // packed operands U and X; ECX and EDX point to the expanded
 365         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 366         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 367         // to be zero.
 368         //
 369         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 370         // [EDI], and update the carry registers with the carry out.  The
 371         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 372         // general-purpose registers are preserved.
 373   endprologue
 374
 375         carryadd
 376
 377         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 378         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 379         propout [edi +  0],      xmm4, xmm5
 380
 381         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 382         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 383         propout [edi +  4],      xmm5, xmm6
 384
 385         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 386         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 387         propout [edi +  8],      xmm6, xmm7
 388
 389         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 390         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 391         propout [edi + 12],      xmm7, xmm4
 392
 393         ret
 394
 395 ENDFUNC
 396
 397 INTFUNC(mul4zc)
 398         // On entry, EDI points to the destination buffer; EBX points to a
 399         // packed operand X; and EDX points to an expanded operand Y.
 400         //
 401         // On exit, we write the low 128 bits of the product X Y to [EDI],
 402         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 403         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 404         // general-purpose registers are preserved.
 405   endprologue
 406
 407         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 408         propout [edi +  0],      xmm4, xmm5
 409
 410         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 411         propout [edi +  4],      xmm5, xmm6
 412
 413         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 414         propout [edi +  8],      xmm6, xmm7
 415
 416         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 417         propout [edi + 12],      xmm7, xmm4
 418
 419         ret
 420
 421 ENDFUNC
 422
 423 INTFUNC(mul4)
 424         // On entry, EDI points to the destination buffer; EBX points to a
 425         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 426         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 427         // representing a carry-in C; c3 is assumed to be zero.
 428         //
 429         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 430         // and update the carry registers with the carry out.  The registers
 431         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 432         // general-purpose registers are preserved.
 433   endprologue
 434
 435         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 436         propout [edi +  0],      xmm4, xmm5
 437
 438         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 439         propout [edi +  4],      xmm5, xmm6
 440
 441         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 442         propout [edi +  8],      xmm6, xmm7
 443
 444         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 445         propout [edi + 12],      xmm7, xmm4
 446
 447         ret
 448
 449 ENDFUNC
 450
 451 INTFUNC(mla4zc)
 452         // On entry, EDI points to the destination buffer, which also
 453         // contains an addend A to accumulate; EBX points to a packed operand
 454         // X; and EDX points to an expanded operand Y.
 455         //
 456         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 457         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 458         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 459         // general-purpose registers are preserved.
 460   endprologue
 461
 462         movd    xmm4, [edi +  0]
 463         movd    xmm5, [edi +  4]
 464         movd    xmm6, [edi +  8]
 465         movd    xmm7, [edi + 12]
 466
 467         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 468         propout [edi +  0],      xmm4, xmm5
 469
 470         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 471         propout [edi +  4],      xmm5, xmm6
 472
 473         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 474         propout [edi +  8],      xmm6, xmm7
 475
 476         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 477         propout [edi + 12],      xmm7, xmm4
 478
 479         ret
 480
 481 ENDFUNC
 482
 483 INTFUNC(mla4)
 484         // On entry, EDI points to the destination buffer, which also
 485         // contains an addend A to accumulate; EBX points to a packed operand
 486         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 487         // the incoming carry registers c0, c1, and c2, representing a
 488         // carry-in C; c3 is assumed to be zero.
 489         //
 490         // On exit, we write the low 128 bits of the sum A + C + X Y to
 491         // [EDI], and update the carry registers with the carry out.  The
 492         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 493         // general-purpose registers are preserved.
 494   endprologue
 495
 496         carryadd
 497
 498         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 499         propout [edi +  0],      xmm4, xmm5
 500
 501         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 502         propout [edi +  4],      xmm5, xmm6
 503
 504         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 505         propout [edi +  8],      xmm6, xmm7
 506
 507         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 508         propout [edi + 12],      xmm7, xmm4
 509
 510         ret
 511
 512 ENDFUNC
 513
 514 INTFUNC(mmul4)
 515         // On entry, EDI points to the destination buffer; EAX and EBX point
 516         // to the packed operands U and N; ECX and ESI point to the expanded
 517         // operands V and M; and EDX points to a place to store an expanded
 518         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 519         // must be 16-byte aligned.  (This is not the usual convention, which
 520         // requires alignment before the call.)
 521         //
 522         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 523         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 524         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 525         // XMM7 are clobbered; the general-purpose registers are preserved.
 526         stalloc 48                      // space for the carries
 527   endprologue
 528
 529         // Calculate W = U V, and leave it in the destination.  Stash the
 530         // carry pieces for later.
 531         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 532         propout [edi +  0],      xmm4, xmm5
 533         jmp     5f
 534
 535 ENDFUNC
 536
 537 INTFUNC(mmla4)
 538         // On entry, EDI points to the destination buffer, which also
 539         // contains an addend A to accumulate; EAX and EBX point
 540         // to the packed operands U and N; ECX and ESI point to the expanded
 541         // operands V and M; and EDX points to a place to store an expanded
 542         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 543         // must be 16-byte aligned.  (This is not the usual convention, which
 544         // requires alignment before the call.)
 545         //
 546         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 547         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 548         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 549         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 550         // preserved.
 551         stalloc 48                      // space for the carries
 552   endprologue
 553
 554         movd    xmm4, [edi +  0]
 555         movd    xmm5, [edi +  4]
 556         movd    xmm6, [edi +  8]
 557         movd    xmm7, [edi + 12]
 558         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 559         propout [edi +  0],      xmm4, xmm5
 560
 561 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 562         propout [edi +  4],      xmm5, xmm6
 563
 564         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 565         propout [edi +  8],      xmm6, xmm7
 566
 567         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 568         propout [edi + 12],      xmm7, xmm4
 569
 570         movdqa  [esp +  0], xmm4
 571         movdqa  [esp + 16], xmm5
 572         movdqa  [esp + 32], xmm6
 573
 574         // Calculate Y = W M.
 575         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 576
 577         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 578         accum                    xmm5, xmm6, xmm7, nil
 579
 580         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 581         accum                    xmm6, xmm7, nil,  nil
 582
 583         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 584         accum                    xmm7, nil,  nil,  nil
 585
 586         // That's lots of pieces.  Now we have to assemble the answer.
 587         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 588
 589         // Expand it.
 590         pxor    xmm2, xmm2
 591         expand  xmm4, xmm1, nil, nil, xmm2
 592         movdqa  [edx +  0], xmm4
 593         movdqa  [edx + 16], xmm1
 594
 595         // Initialize the carry from the value for W we calculated earlier.
 596         movd    xmm4, [edi +  0]
 597         movd    xmm5, [edi +  4]
 598         movd    xmm6, [edi +  8]
 599         movd    xmm7, [edi + 12]
 600
 601         // Finish the calculation by adding the Montgomery product.
 602         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 603         propout [edi +  0],      xmm4, xmm5
 604
 605         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 606         propout [edi +  4],      xmm5, xmm6
 607
 608         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 609         propout [edi +  8],      xmm6, xmm7
 610
 611         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 612         propout [edi + 12],      xmm7, xmm4
 613
 614         // Add add on the carry we calculated earlier.
 615         paddq   xmm4, [esp +  0]
 616         paddq   xmm5, [esp + 16]
 617         paddq   xmm6, [esp + 32]
 618
 619         // And, with that, we're done.
 620         stfree  48
 621         ret
 622
 623 ENDFUNC
 624
 625 INTFUNC(mont4)
 626         // On entry, EDI points to the destination buffer holding a packed
 627         // value W; EBX points to a packed operand N; ESI points to an
 628         // expanded operand M; and EDX points to a place to store an expanded
 629         // result Y (32 bytes, at a 16-byte boundary).
 630         //
 631         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 632         // of the sum W + N Y to [EDI], leaving the remaining carry in
 633         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 634         // XMM7 are clobbered; the general-purpose registers are preserved.
 635   endprologue
 636
 637         // Calculate Y = W M.
 638         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 639
 640         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 641         accum                    xmm5, xmm6, xmm7, nil
 642
 643         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 644         accum                    xmm6, xmm7, nil,  nil
 645
 646         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 647         accum                    xmm7, nil,  nil,  nil
 648
 649         // That's lots of pieces.  Now we have to assemble the answer.
 650         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 651
 652         // Expand it.
 653         pxor    xmm2, xmm2
 654         expand  xmm4, xmm1, nil, nil, xmm2
 655         movdqa  [edx +  0], xmm4
 656         movdqa  [edx + 16], xmm1
 657
 658         // Initialize the carry from W.
 659         movd    xmm4, [edi +  0]
 660         movd    xmm5, [edi +  4]
 661         movd    xmm6, [edi +  8]
 662         movd    xmm7, [edi + 12]
 663
 664         // Finish the calculation by adding the Montgomery product.
 665         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 666         propout [edi +  0],      xmm4, xmm5
 667
 668         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 669         propout [edi +  4],      xmm5, xmm6
 670
 671         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 672         propout [edi +  8],      xmm6, xmm7
 673
 674         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 675         propout [edi + 12],      xmm7, xmm4
 676
 677         // And, with that, we're done.
 678         ret
 679
 680 ENDFUNC
 681
 682 ///--------------------------------------------------------------------------
 683 /// Bulk multipliers.
 684
 685 FUNC(mpx_umul4_x86_sse2)
 686         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 687         //                         const mpw *bv, const mpw *bvl);
 688
 689         // Build a stack frame.  Arguments will be relative to EBP, as
 690         // follows.
 691         //
 692         //      ebp + 20        dv
 693         //      ebp + 24        av
 694         //      ebp + 28        avl
 695         //      ebp + 32        bv
 696         //      ebp + 36        bvl
 697         //
 698         // Locals are relative to ESP, as follows.
 699         //
 700         //      esp +  0        expanded Y (32 bytes)
 701         //      esp + 32        (top of locals)
 702         pushreg ebp
 703         pushreg ebx
 704         pushreg esi
 705         pushreg edi
 706         setfp   ebp
 707         and     esp, ~15
 708         sub     esp, 32
 709   endprologue
 710
 711         // Prepare for the first iteration.
 712         mov     esi, [ebp + 32]         // -> bv[0]
 713         pxor    xmm7, xmm7
 714         movdqu  xmm0, [esi]             // bv[0]
 715         mov     edi, [ebp + 20]         // -> dv[0]
 716         mov     ecx, edi                // outer loop dv cursor
 717         expand  xmm0, xmm1, nil, nil, xmm7
 718         mov     ebx, [ebp + 24]         // -> av[0]
 719         mov     eax, [ebp + 28]         // -> av[m] = av limit
 720         mov     edx, esp                // -> expanded Y = bv[0]
 721         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 722         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 723         call    mul4zc
 724         add     ebx, 16
 725         add     edi, 16
 726         add     ecx, 16
 727         add     esi, 16
 728         cmp     ebx, eax                // all done?
 729         jae     8f
 730
 731         .p2align 4
 732         // Continue with the first iteration.
 733 0:      call    mul4
 734         add     ebx, 16
 735         add     edi, 16
 736         cmp     ebx, eax                // all done?
 737         jb      0b
 738
 739         // Write out the leftover carry.  There can be no tail here.
 740 8:      call    carryprop
 741         cmp     esi, [ebp + 36]         // more passes to do?
 742         jae     9f
 743
 744         .p2align 4
 745         // Set up for the next pass.
 746 1:      movdqu  xmm0, [esi]             // bv[i]
 747         mov     edi, ecx                // -> dv[i]
 748         pxor    xmm7, xmm7
 749         expand  xmm0, xmm1, nil, nil, xmm7
 750         mov     ebx, [ebp + 24]         // -> av[0]
 751         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 752         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 753         call    mla4zc
 754         add     edi, 16
 755         add     ebx, 16
 756         add     ecx, 16
 757         add     esi, 16
 758         cmp     ebx, eax                // done yet?
 759         jae     8f
 760
 761         .p2align 4
 762         // Continue...
 763 0:      call    mla4
 764         add     ebx, 16
 765         add     edi, 16
 766         cmp     ebx, eax
 767         jb      0b
 768
 769         // Finish off this pass.  There was no tail on the previous pass, and
 770         // there can be none on this pass.
 771 8:      call    carryprop
 772         cmp     esi, [ebp + 36]
 773         jb      1b
 774
 775         // All over.
 776 9:      dropfp
 777         pop     edi
 778         pop     esi
 779         pop     ebx
 780         pop     ebp
 781         ret
 782
 783 ENDFUNC
 784
 785 FUNC(mpxmont_mul4_x86_sse2)
 786         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 787         //                           const mpw *nv, size_t n, const mpw *mi);
 788
 789         // Build a stack frame.  Arguments will be relative to EBP, as
 790         // follows.
 791         //
 792         //      ebp + 20        dv
 793         //      ebp + 24        av
 794         //      ebp + 28        bv
 795         //      ebp + 32        nv
 796         //      ebp + 36        n (nonzero multiple of 4)
 797         //      ebp + 40        mi
 798         //
 799         // Locals are relative to ESP, which is 4 mod 16, as follows.
 800         //
 801         //      esp +   0       outer loop dv
 802         //      esp +   4       outer loop bv
 803         //      esp +   8       av limit (mostly in ESI)
 804         //      esp +  12       expanded V (32 bytes)
 805         //      esp +  44       expanded M (32 bytes)
 806         //      esp +  76       expanded Y (32 bytes)
 807         //      esp + 108       bv limit
 808         //      esp + 112       (gap)
 809         //      esp + 124       (top of locals)
 810         pushreg ebp
 811         pushreg ebx
 812         pushreg esi
 813         pushreg edi
 814         setfp   ebp
 815         and     esp, ~15
 816         sub     esp, 124
 817   endprologue
 818
 819         // Establish the expanded operands.
 820         pxor    xmm7, xmm7
 821         mov     ecx, [ebp + 28]         // -> bv
 822         mov     edx, [ebp + 40]         // -> mi
 823         movdqu  xmm0, [ecx]             // bv[0]
 824         movdqu  xmm2, [edx]             // mi
 825         expand  xmm0, xmm1, xmm2, xmm3, xmm7
 826         movdqa  [esp + 12], xmm0        // bv[0] expanded low
 827         movdqa  [esp + 28], xmm1        // bv[0] expanded high
 828         movdqa  [esp + 44], xmm2        // mi expanded low
 829         movdqa  [esp + 60], xmm3        // mi expanded high
 830
 831         // Set up the outer loop state and prepare for the first iteration.
 832         mov     edx, [ebp + 36]         // n
 833         mov     eax, [ebp + 24]         // -> U = av[0]
 834         mov     ebx, [ebp + 32]         // -> X = nv[0]
 835         mov     edi, [ebp + 20]         // -> Z = dv[0]
 836         mov     [esp + 4], ecx
 837         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 838         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 839         mov     [esp + 0], edi
 840         mov     [esp + 108], ecx
 841         mov     [esp + 8], edx
 842         lea     ecx, [esp + 12]         // -> expanded V = bv[0]
 843         lea     esi, [esp + 44]         // -> expanded M = mi
 844         lea     edx, [esp + 76]         // -> space for Y
 845         call    mmul4
 846         mov     esi, [esp + 8]          // recover av limit
 847         add     edi, 16
 848         add     eax, 16
 849         add     ebx, 16
 850         cmp     eax, esi                // done already?
 851         jae     8f
 852         mov     [esp + 0], edi
 853
 854         .p2align 4
 855         // Complete the first inner loop.
 856 0:      call    dmul4
 857         add     edi, 16
 858         add     eax, 16
 859         add     ebx, 16
 860         cmp     eax, esi                // done yet?
 861         jb      0b
 862
 863         // Still have carries left to propagate.
 864         call    carryprop
 865         movd    [edi + 16], xmm4
 866
 867         .p2align 4
 868         // Embark on the next iteration.  (There must be one.  If n = 1, then
 869         // we would have bailed above, to label 8.  Similarly, the subsequent
 870         // iterations can fall into the inner loop immediately.)
 871 1:      mov     eax, [esp + 4]          // -> bv[i - 1]
 872         mov     edi, [esp + 0]          // -> Z = dv[i]
 873         add     eax, 16                 // -> bv[i]
 874         pxor    xmm7, xmm7
 875         movdqu  xmm0, [eax]             // bv[i]
 876         mov     [esp + 4], eax
 877         cmp     eax, [esp + 108]        // done yet?
 878         jae     9f
 879         mov     ebx, [ebp + 32]         // -> X = nv[0]
 880         lea     esi, [esp + 44]         // -> expanded M = mi
 881         mov     eax, [ebp + 24]         // -> U = av[0]
 882         expand  xmm0, xmm1, nil, nil, xmm7
 883         movdqa  [esp + 12], xmm0        // bv[i] expanded low
 884         movdqa  [esp + 28], xmm1        // bv[i] expanded high
 885         call    mmla4
 886         mov     esi, [esp + 8]          // recover av limit
 887         add     edi, 16
 888         add     eax, 16
 889         add     ebx, 16
 890         mov     [esp + 0], edi
 891
 892         .p2align 4
 893         // Complete the next inner loop.
 894 0:      call    dmla4
 895         add     edi, 16
 896         add     eax, 16
 897         add     ebx, 16
 898         cmp     eax, esi
 899         jb      0b
 900
 901         // Still have carries left to propagate, and they overlap the
 902         // previous iteration's final tail, so read that in and add it.
 903         movd    xmm0, [edi]
 904         paddq   xmm4, xmm0
 905         call    carryprop
 906         movd    [edi + 16], xmm4
 907
 908         // Back again.
 909         jmp     1b
 910
 911         // First iteration was short.  Write out the carries and we're done.
 912         // (This could be folded into the main loop structure, but that would
 913         // penalize small numbers more.)
 914 8:      call    carryprop
 915         movd    [edi + 16], xmm4
 916
 917         // All done.
 918 9:      dropfp
 919         popreg  edi
 920         popreg  esi
 921         popreg  ebx
 922         popreg  ebp
 923         ret
 924
 925 ENDFUNC
 926
 927 FUNC(mpxmont_redc4_x86_sse2)
 928         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 929         //                             size_t n, const mpw *mi);
 930
 931         // Build a stack frame.  Arguments will be relative to EBP, as
 932         // follows.
 933         //
 934         //      ebp + 20        dv
 935         //      ebp + 24        dvl
 936         //      ebp + 28        nv
 937         //      ebp + 32        n (nonzero multiple of 4)
 938         //      ebp + 36        mi
 939         //
 940         // Locals are relative to ESP, as follows.
 941         //
 942         //      esp +  0        outer loop dv
 943         //      esp +  4        outer dv limit
 944         //      esp +  8        blocks-of-4 dv limit
 945         //      esp + 12        expanded M (32 bytes)
 946         //      esp + 44        expanded Y (32 bytes)
 947         //      esp + 76        (top of locals)
 948         pushreg ebp
 949         pushreg ebx
 950         pushreg esi
 951         pushreg edi
 952         setfp   ebp
 953         and     esp, ~15
 954         sub     esp, 76
 955   endprologue
 956
 957         // Establish the expanded operands and the blocks-of-4 dv limit.
 958         mov     edi, [ebp + 20]         // -> Z = dv[0]
 959         pxor    xmm7, xmm7
 960         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 961         sub     eax, edi                // length of dv in bytes
 962         mov     edx, [ebp + 36]         // -> mi
 963         movdqu  xmm0, [edx]             // mi
 964         and     eax, ~15                // mask off the tail end
 965         expand  xmm0, xmm1, nil, nil, xmm7
 966         add     eax, edi                // find limit
 967         movdqa  [esp + 12], xmm0        // mi expanded low
 968         movdqa  [esp + 28], xmm1        // mi expanded high
 969         mov     [esp + 8], eax
 970
 971         // Set up the outer loop state and prepare for the first iteration.
 972         mov     ecx, [ebp + 32]         // n
 973         mov     ebx, [ebp + 28]         // -> X = nv[0]
 974         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
 975         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
 976         mov     [esp + 0], edi
 977         mov     [esp + 4], edx
 978         lea     esi, [esp + 12]         // -> expanded M = mi
 979         lea     edx, [esp + 44]         // -> space for Y
 980         call    mont4
 981         add     edi, 16
 982         add     ebx, 16
 983         cmp     ebx, ecx                // done already?
 984         jae     8f
 985
 986         .p2align 4
 987         // Complete the first inner loop.
 988 5:      call    mla4
 989         add     ebx, 16
 990         add     edi, 16
 991         cmp     ebx, ecx                // done yet?
 992         jb      5b
 993
 994         // Still have carries left to propagate.
 995 8:      carryadd
 996         mov     esi, [esp + 8]          // -> dv blocks limit
 997         mov     edx, [ebp + 24]         // dv limit
 998         psllq   xmm7, 16
 999         pslldq  xmm7, 8
1000         paddq   xmm6, xmm7
1001         call    carryprop
1002         movd    eax, xmm4
1003         add     edi, 16
1004         cmp     edi, esi
1005         jae     7f
1006
1007         .p2align 4
1008         // Continue carry propagation until the end of the buffer.
1009 0:      add     [edi], eax
1010         mov     eax, 0                  // preserves flags
1011         adcd    [edi + 4], 0
1012         adcd    [edi + 8], 0
1013         adcd    [edi + 12], 0
1014         adc     eax, 0
1015         add     edi, 16
1016         cmp     edi, esi
1017         jb      0b
1018
1019         // Deal with the tail end.
1020 7:      add     [edi], eax
1021         mov     eax, 0                  // preserves flags
1022         add     edi, 4
1023         adc     eax, 0
1024         cmp     edi, edx
1025         jb      7b
1026
1027         // All done for this iteration.  Start the next.  (This must have at
1028         // least one follow-on iteration, or we'd not have started this outer
1029         // loop.)
1030 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1031         mov     ebx, [ebp + 28]         // -> X = nv[0]
1032         lea     edx, [esp + 44]         // -> space for Y
1033         lea     esi, [esp + 12]         // -> expanded M = mi
1034         add     edi, 16                 // -> Z = dv[i]
1035         cmp     edi, [esp + 4]          // all done yet?
1036         jae     9f
1037         mov     [esp + 0], edi
1038         call    mont4
1039         add     edi, 16
1040         add     ebx, 16
1041         jmp     5b
1042
1043         // All over.
1044 9:      dropfp
1045         popreg  edi
1046         popreg  esi
1047         popreg  ebx
1048         popreg  ebp
1049         ret
1050
1051 ENDFUNC
1052
1053 ///--------------------------------------------------------------------------
1054 /// Testing and performance measurement.
1055
1056 #ifdef TEST_MUL4
1057
1058 .macro  cysetup c
1059         rdtsc
1060         mov     [\c], eax
1061         mov     [\c + 4], edx
1062 .endm
1063
1064 .macro  cystore c, v, n
1065         rdtsc
1066         sub     eax, [\c]
1067         sbb     edx, [\c + 4]
1068         mov     ebx, [\v]
1069         mov     ecx, [\n]
1070         dec     ecx
1071         mov     [\n], ecx
1072         mov     [ebx + ecx*8], eax
1073         mov     [ebx + ecx*8 + 4], edx
1074 .endm
1075
1076 .macro  testprologue
1077         pushreg ebp
1078         pushreg ebx
1079         pushreg esi
1080         pushreg edi
1081         setfp   ebp
1082         and     esp, ~15
1083         sub     esp, 3*32 + 12
1084   endprologue
1085         // vars:
1086         //      esp +  0 = cycles
1087         //      esp + 12 = v expanded
1088         //      esp + 44 = y expanded
1089         //      esp + 72 = ? expanded
1090 .endm
1091
1092 .macro  testepilogue
1093         dropfp
1094         popreg  edi
1095         popreg  esi
1096         popreg  ebx
1097         popreg  ebp
1098         ret
1099 .endm
1100
1101 .macro  testldcarry c
1102         mov     ecx, \c                 // -> c
1103         movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
1104         movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
1105         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
1106 .endm
1107
1108 .macro  testexpand v, y
1109         pxor    xmm7, xmm7
1110   .ifnes "\v", "nil"
1111         mov     ecx, \v
1112         movdqu  xmm0, [ecx]
1113         expand  xmm0, xmm1, nil, nil, xmm7
1114         movdqa  [esp + 12], xmm0
1115         movdqa  [esp + 28], xmm1
1116   .endif
1117   .ifnes "\y", "nil"
1118         mov     edx, \y
1119         movdqu  xmm2, [edx]
1120         expand  xmm2, xmm3, nil, nil, xmm7
1121         movdqa  [esp + 44], xmm2
1122         movdqa  [esp + 60], xmm3
1123   .endif
1124 .endm
1125
1126 .macro  testtop u, x, mode
1127         .p2align 4
1128 0:
1129   .ifnes "\u", "nil"
1130         lea     ecx, [esp + 12]
1131   .endif
1132         mov     ebx, \x
1133   .ifeqs "\mode", "mont"
1134         lea     esi, [esp + 44]
1135   .endif
1136         cysetup esp + 0
1137   .ifnes "\u", "nil"
1138         mov     eax, \u
1139   .endif
1140   .ifeqs "\mode", "mont"
1141         lea     edx, [esp + 76]
1142   .else
1143         lea     edx, [esp + 44]
1144   .endif
1145 .endm
1146
1147 .macro  testtail cyv, n
1148         cystore esp + 0, \cyv, \n
1149         jnz     0b
1150 .endm
1151
1152 .macro  testcarryout c
1153         mov     ecx, \c
1154         movdqu  [ecx +  0], xmm4
1155         movdqu  [ecx + 16], xmm5
1156         movdqu  [ecx + 32], xmm6
1157 .endm
1158
1159 FUNC(test_dmul4)
1160         testprologue
1161         testldcarry [ebp + 24]
1162         testexpand [ebp + 36], [ebp + 40]
1163         mov     edi, [ebp + 20]
1164         testtop [ebp + 28], [ebp + 32]
1165         call    dmul4
1166         testtail [ebp + 48], [ebp + 44]
1167         testcarryout [ebp + 24]
1168         testepilogue
1169 ENDFUNC
1170
1171 FUNC(test_dmla4)
1172         testprologue
1173         testldcarry [ebp + 24]
1174         testexpand [ebp + 36], [ebp + 40]
1175         mov     edi, [ebp + 20]
1176         testtop [ebp + 28], [ebp + 32]
1177         call    dmla4
1178         testtail [ebp + 48], [ebp + 44]
1179         testcarryout [ebp + 24]
1180         testepilogue
1181 ENDFUNC
1182
1183 FUNC(test_mul4)
1184         testprologue
1185         testldcarry [ebp + 24]
1186         testexpand nil, [ebp + 32]
1187         mov     edi, [ebp + 20]
1188         testtop nil, [ebp + 28]
1189         call    mul4
1190         testtail [ebp + 40], [ebp + 36]
1191         testcarryout [ebp + 24]
1192         testepilogue
1193 ENDFUNC
1194
1195 FUNC(test_mla4)
1196         testprologue
1197         testldcarry [ebp + 24]
1198         testexpand nil, [ebp + 32]
1199         mov     edi, [ebp + 20]
1200         testtop nil, [ebp + 28]
1201         call    mla4
1202         testtail [ebp + 40], [ebp + 36]
1203         testcarryout [ebp + 24]
1204         testepilogue
1205 ENDFUNC
1206
1207 FUNC(test_mmul4)
1208         testprologue
1209         testexpand [ebp + 40], [ebp + 44]
1210         mov     edi, [ebp + 20]
1211         testtop [ebp + 32], [ebp + 36], mont
1212         call    mmul4
1213         testtail [ebp + 52], [ebp + 48]
1214         mov     edi, [ebp + 28]
1215         movdqa  xmm0, [esp + 76]
1216         movdqa  xmm1, [esp + 92]
1217         movdqu  [edi], xmm0
1218         movdqu  [edi + 16], xmm1
1219         testcarryout [ebp + 24]
1220         testepilogue
1221 ENDFUNC
1222
1223 FUNC(test_mmla4)
1224         testprologue
1225         testexpand [ebp + 40], [ebp + 44]
1226         mov     edi, [ebp + 20]
1227         testtop [ebp + 32], [ebp + 36], mont
1228         call    mmla4
1229         testtail [ebp + 52], [ebp + 48]
1230         mov     edi, [ebp + 28]
1231         movdqa  xmm0, [esp + 76]
1232         movdqa  xmm1, [esp + 92]
1233         movdqu  [edi], xmm0
1234         movdqu  [edi + 16], xmm1
1235         testcarryout [ebp + 24]
1236         testepilogue
1237 ENDFUNC
1238
1239 FUNC(test_mont4)
1240         testprologue
1241         testexpand nil, [ebp + 36]
1242         mov     edi, [ebp + 20]
1243         testtop nil, [ebp + 32], mont
1244         call    mont4
1245         testtail [ebp + 44], [ebp + 40]
1246         mov     edi, [ebp + 28]
1247         movdqa  xmm0, [esp + 76]
1248         movdqa  xmm1, [esp + 92]
1249         movdqu  [edi], xmm0
1250         movdqu  [edi + 16], xmm1
1251         testcarryout [ebp + 24]
1252         testepilogue
1253 ENDFUNC
1254
1255 #endif
1256
1257 ///----- That's all, folks --------------------------------------------------