chiark - git - mdw - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// Preliminaries.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32         .arch   pentium4
  33
  34         .text
  35
  36 ///--------------------------------------------------------------------------
  37 /// Theory.
  38 ///
  39 /// We define a number of primitive fixed-size multipliers from which we can
  40 /// construct more general variable-length multipliers.
  41 ///
  42 /// The basic trick is the same throughout.  In an operand-scanning
  43 /// multiplication, the inner multiplication loop multiplies a multiple-
  44 /// precision operand by a single precision factor, and adds the result,
  45 /// appropriately shifted, to the result.  A `finely integrated operand
  46 /// scanning' implementation of Montgomery multiplication also adds the
  47 /// product of a single-precision `Montgomery factor' and the modulus,
  48 /// calculated in the same pass.  The more common `coarsely integrated
  49 /// operand scanning' alternates main multiplication and Montgomery passes,
  50 /// which requires additional carry propagation.
  51 ///
  52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  53 /// the factors remains constant throughout the operation, so we can afford
  54 /// to take a little time to preprocess it.  The transformation we perform is
  55 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  59 /// operands, as follows.
  60 ///
  61 ///     Offset     0       4        8      12
  62 ///        0    v'_0    v'_1    v''_0   v''_1
  63 ///       16    v'_2    v'_3    v''_2   v''_3
  64 ///
  65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  66 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  69 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  70 /// many products together before we must deal with carrying; it also allows
  71 /// for some calculations to be performed on the above expanded form.
  72 ///
  73 /// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate
  74 /// results.  The registers' precise roles rotate during the computation; we
  75 /// name them `c0', `c1', `c2', and `c3'.  Each carry register holds two
  76 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
  77 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
  78 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
  79 /// c_3 B^3.  The `pmuluqd' instruction acting on a scalar operand (broadcast
  80 /// across all lanes of its vector) and an operand in the expanded form above
  81 /// produces a result which can be added directly to the appropriate carry
  82 /// register.  Following a pass of four multiplications, we perform some
  83 /// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
  84 /// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
  85 /// cycle the carry registers around, so that c1 becomes c0, and the old
  86 /// (implicitly) zeroed c0 becomes c3.
  87 ///
  88 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  89 /// memory, typically in warm L1 cache.  The packed operands are read from
  90 /// memory into working registers XMM0--XMM3 and processed immediately.
  91 /// The following conventional argument names and locations are used
  92 /// throughout.
  93 ///
  94 /// Arg Format      Location    Notes
  95 ///
  96 /// U   packed      [EAX]
  97 /// X   packed      [EBX]       In Montgomery multiplication, X = N
  98 /// V   expanded    [ECX]
  99 /// Y   expanded    [EDX]       In Montgomery multiplication, Y = (A + U V) M
 100 /// M   expanded    [ESI]       -N^{-1} (mod B^4)
 101 /// N                           Modulus, for Montgomery multiplication
 102 /// A   packed      [EDI]       Destination/accumulator
 103 /// C   carry       XMM4--XMM7
 104 ///
 105 /// The calculation is some variant of
 106 ///
 107 ///     A' + C' B^4 <- U V + X Y + A + C
 108 ///
 109 /// The low-level functions fit into a fairly traditional (finely-integrated)
 110 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
 111 /// (indexed by i).
 112 ///
 113 /// The variants are as follows.
 114 ///
 115 /// Function    Variant                 Use             i       j
 116 ///
 117 /// mmul4       A = C = 0               Montgomery      0       0
 118 /// dmul4       A = 0                   Montgomery      0       +
 119 /// mmla4       C = 0                   Montgomery      +       0
 120 /// dmla4       exactly as shown        Montgomery      +       +
 121 /// mont4       U = V = C = 0           Montgomery      any     0
 122 ///
 123 /// mul4zc      U = V = A = C = 0       Plain           0       0
 124 /// mul4        U = V = A = 0           Plain           0       +
 125 /// mla4zc      U = V = C = 0           Plain           +       0
 126 /// mla4        U = V = 0               Plain           +       +
 127 ///
 128 /// The `mmul4' and `mmla4' functions are also responsible for calculating
 129 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
 130 /// inner loop.
 131
 132 ///--------------------------------------------------------------------------
 133 /// Macro definitions.
 134
 135 .macro  mulcore r, s, d0, d1=nil, d2=nil, d3=nil
 136         // Load a word r_i from R, multiply by the expanded operand [S], and
 137         // leave the pieces of the product in registers D0, D1, D2, D3.
 138         movd    \d0, \r                 // (r_i, 0; 0, 0)
 139   .ifnes "\d1", "nil"
 140         movdqa  \d1, [\s]               // (s'_0, s'_1; s''_0, s''_1)
 141   .endif
 142   .ifnes "\d3", "nil"
 143         movdqa  \d3, [\s + 16]          // (s'_2, s'_3; s''_2, s''_3)
 144   .endif
 145         pshufd  \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
 146   .ifnes "\d1", "nil"
 147         psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
 148   .endif
 149   .ifnes "\d2", "nil"
 150     .ifnes "\d3", "nil"
 151         movdqa  \d2, \d3                // another copy of (s'_2, s'_3; ...)
 152     .else
 153         movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
 154     .endif
 155   .endif
 156   .ifnes "\d3", "nil"
 157         psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
 158   .endif
 159   .ifnes "\d1", "nil"
 160         pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
 161   .endif
 162   .ifnes "\d3", "nil"
 163         pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
 164   .endif
 165   .ifnes "\d2", "nil"
 166     .ifnes "\d3", "nil"
 167         pmuludq \d2, \d0                // (r_i s'_2; r_i s''_2)
 168     .else
 169         pmuludq \d2, [\s + 16]
 170     .endif
 171   .endif
 172         pmuludq \d0, [\s]               // (r_i s'_0; r_i s''_0)
 173 .endm
 174
 175 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 176         // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
 177         // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
 178         // updating that register.
 179         paddq   \c0, xmm0
 180   .ifnes "\c1", "nil"
 181         paddq   \c1, xmm1
 182   .endif
 183   .ifnes "\c2", "nil"
 184         paddq   \c2, xmm2
 185   .endif
 186   .ifnes "\c3", "nil"
 187         paddq   \c3, xmm3
 188   .endif
 189 .endm
 190
 191 .macro  mulacc  r, s, c0, c1, c2, c3, z3p=nil
 192         // Load a word r_i from R, multiply by the expanded operand [S],
 193         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 194         // then C3 notionally contains zero, but needs clearing; in practice,
 195         // we store the product directly rather than attempting to add.  On
 196         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 197         // is not `t'.
 198   .ifeqs "\z3p", "t"
 199         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 200         accum           \c0,  \c1,  \c2
 201   .else
 202         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 203         accum           \c0,  \c1,  \c2,  \c3
 204   .endif
 205 .endm
 206
 207 .macro  propout d, c, cc=nil
 208         // Calculate an output word from C, and store it in D; propagate
 209         // carries out from C to CC in preparation for a rotation of the
 210         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 211         // `nil', then the contribution which would have been added to it is
 212         // left in C.
 213         pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
 214         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t, 0)
 215         pslldq  xmm3, 2                 // (t b; 0)
 216         paddq   \c, xmm3                // (c' + t b; c'')
 217         movd    \d, \c
 218         psrlq   \c, 32                  // floor(c/B)
 219   .ifnes "\cc", "nil"
 220         paddq   \cc, \c                 // propagate up
 221   .endif
 222 .endm
 223
 224 .macro  endprop d, c, t
 225         // On entry, C contains a carry register.  On exit, the low 32 bits
 226         // of the value represented in C are written to D, and the remaining
 227         // bits are left at the bottom of T.
 228         movdqa  \t, \c
 229         psllq   \t, 16                  // (?; c'' b)
 230         pslldq  \c, 8                   // (0; c')
 231         paddq   \t, \c                  // (?; c' + c'' b)
 232         psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
 233         movd    \d, \t
 234         psrldq  \t, 4                   // (floor(c/B); 0)
 235 .endm
 236
 237 .macro  expand  z, a, b, c=nil, d=nil
 238         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 239         // exit, A:B and C:D together hold the same values in expanded
 240         // form.  If C is `nil', then only expand A to A:B.
 241         movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
 242   .ifnes "\c", "nil"
 243         movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
 244   .endif
 245         punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
 246         punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
 247   .ifnes "\c", "nil"
 248         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
 249         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
 250   .endif
 251         pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
 252         pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 253   .ifnes "\c", "nil"
 254         pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
 255         pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 256   .endif
 257 .endm
 258
 259 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 260         // On entry, C0, C1, C2, C3 are carry registers representing a value
 261         // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
 262         // C3, T, and U are clobbered; and the high bits of Y are stored in
 263         // HI, if this is not `nil'.
 264
 265         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 266         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 267         // them into the 32-bit-aligned pieces above and below.  But before
 268         // we can do that, we must gather them together.
 269         movdqa  \t, \c0
 270         movdqa  \u, \c1
 271         punpcklqdq \t, \c2              // (y'_0; y'_2)
 272         punpckhqdq \c0, \c2             // (y''_0; y''_2)
 273         punpcklqdq \u, \c3              // (y'_1; y'_3)
 274         punpckhqdq \c1, \c3             // (y''_1; y''_3)
 275
 276         // Now split the double-prime pieces.  The high (up to) 48 bits will
 277         // go up; the low 16 bits go down.
 278         movdqa  \c2, \c0
 279         movdqa  \c3, \c1
 280         psllq   \c2, 48
 281         psllq   \c3, 48
 282         psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
 283         psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
 284         psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
 285         psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
 286   .ifnes "\hi", "nil"
 287         movdqa  \hi, \c1
 288   .endif
 289         pslldq  \c1, 8                  // high part of (0; y''_1)
 290
 291         paddq   \t, \c2                 // propagate down
 292         paddq   \u, \c3
 293         paddq   \t, \c1                 // and up: (y_0; y_2)
 294         paddq   \u, \c0                 // (y_1; y_3)
 295   .ifnes "\hi", "nil"
 296         psrldq  \hi, 8                  // high part of (y''_3; 0)
 297   .endif
 298
 299         // Finally extract the answer.  This complicated dance is better than
 300         // storing to memory and loading, because the piecemeal stores
 301         // inhibit store forwarding.
 302         movdqa  \c3, \t                 // (y_0; ?)
 303         movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
 304         psrldq  \t, 8                   // (y_2; 0)
 305         psrlq   \c3, 32                 // (floor(y_0/B); ?)
 306         paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
 307         movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
 308         psrldq  \u, 8                   // (y_3; 0)
 309         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2; ?)
 310         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2; ?)
 311         punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
 312         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 313         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 314   .ifnes "\hi", "nil"
 315         movdqa  \t, \c3
 316         pxor    \u, \u
 317   .endif
 318         punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
 319   .ifnes "\hi", "nil"
 320         psrlq   \t, 32                  // very high bits of y
 321         paddq   \hi, \t
 322         punpcklqdq \hi, \u              // carry up
 323   .endif
 324         punpckldq \lo, \c1              // y mod B^4
 325 .endm
 326
 327 .macro  carryadd
 328         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 329         // hold the incoming carry registers c0, c1, and c2 representing a
 330         // carry-in C.
 331         //
 332         // On exit, the carry registers, including XMM7, are updated to hold
 333         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 334         // registers are preserved.
 335         movd    xmm0, [edi +  0]        // (a_0; 0)
 336         movd    xmm1, [edi +  4]        // (a_1; 0)
 337         movd    xmm2, [edi +  8]        // (a_2; 0)
 338         movd    xmm7, [edi + 12]        // (a_3; 0)
 339
 340         paddq   xmm4, xmm0              // (c'_0 + a_0; c''_0)
 341         paddq   xmm5, xmm1              // (c'_1 + a_1; c''_1)
 342         paddq   xmm6, xmm2              // (c'_2 + a_2; c''_2 + a_3 b)
 343 .endm
 344
 345 ///--------------------------------------------------------------------------
 346 /// Primitive multipliers and related utilities.
 347
 348 INTFUNC(carryprop)
 349         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 350         // form.  Store the low 128 bits of the represented carry to [EDI] as
 351         // a packed 128-bit value, and leave the remaining 16 bits in the low
 352         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
 353   endprologue
 354
 355         propout [edi +  0], xmm4, xmm5
 356         propout [edi +  4], xmm5, xmm6
 357         propout [edi +  8], xmm6, nil
 358         endprop [edi + 12], xmm6, xmm4
 359         ret
 360 ENDFUNC
 361
 362 INTFUNC(dmul4)
 363         // On entry, EDI points to the destination buffer; EAX and EBX point
 364         // to the packed operands U and X; ECX and EDX point to the expanded
 365         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 366         // registers c0, c1, and c2; c3 is assumed to be zero.
 367         //
 368         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 369         // [EDI], and update the carry registers with the carry out.  The
 370         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 371         // general-purpose registers are preserved.
 372   endprologue
 373
 374         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 375         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 376         propout [edi +  0],      xmm4, xmm5
 377
 378         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 379         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 380         propout [edi +  4],      xmm5, xmm6
 381
 382         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 383         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 384         propout [edi +  8],      xmm6, xmm7
 385
 386         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 387         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 388         propout [edi + 12],      xmm7, xmm4
 389
 390         ret
 391 ENDFUNC
 392
 393 INTFUNC(dmla4)
 394         // On entry, EDI points to the destination buffer, which also
 395         // contains an addend A to accumulate; EAX and EBX point to the
 396         // packed operands U and X; ECX and EDX point to the expanded
 397         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 398         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 399         // to be zero.
 400         //
 401         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 402         // [EDI], and update the carry registers with the carry out.  The
 403         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 404         // general-purpose registers are preserved.
 405   endprologue
 406
 407         carryadd
 408
 409         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 410         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 411         propout [edi +  0],      xmm4, xmm5
 412
 413         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 414         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 415         propout [edi +  4],      xmm5, xmm6
 416
 417         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 418         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 419         propout [edi +  8],      xmm6, xmm7
 420
 421         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 422         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 423         propout [edi + 12],      xmm7, xmm4
 424
 425         ret
 426 ENDFUNC
 427
 428 INTFUNC(mul4zc)
 429         // On entry, EDI points to the destination buffer; EBX points to a
 430         // packed operand X; and EDX points to an expanded operand Y.
 431         //
 432         // On exit, we write the low 128 bits of the product X Y to [EDI],
 433         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 434         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 435         // general-purpose registers are preserved.
 436   endprologue
 437
 438         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 439         propout [edi +  0],      xmm4, xmm5
 440
 441         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 442         propout [edi +  4],      xmm5, xmm6
 443
 444         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 445         propout [edi +  8],      xmm6, xmm7
 446
 447         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 448         propout [edi + 12],      xmm7, xmm4
 449
 450         ret
 451 ENDFUNC
 452
 453 INTFUNC(mul4)
 454         // On entry, EDI points to the destination buffer; EBX points to a
 455         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 456         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 457         // representing a carry-in C; c3 is assumed to be zero.
 458         //
 459         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 460         // and update the carry registers with the carry out.  The registers
 461         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 462         // general-purpose registers are preserved.
 463   endprologue
 464
 465         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 466         propout [edi +  0],      xmm4, xmm5
 467
 468         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 469         propout [edi +  4],      xmm5, xmm6
 470
 471         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 472         propout [edi +  8],      xmm6, xmm7
 473
 474         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 475         propout [edi + 12],      xmm7, xmm4
 476
 477         ret
 478 ENDFUNC
 479
 480 INTFUNC(mla4zc)
 481         // On entry, EDI points to the destination buffer, which also
 482         // contains an addend A to accumulate; EBX points to a packed operand
 483         // X; and EDX points to an expanded operand Y.
 484         //
 485         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 486         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 487         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 488         // general-purpose registers are preserved.
 489   endprologue
 490
 491         movd    xmm4, [edi +  0]
 492         movd    xmm5, [edi +  4]
 493         movd    xmm6, [edi +  8]
 494         movd    xmm7, [edi + 12]
 495
 496         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 497         propout [edi +  0],      xmm4, xmm5
 498
 499         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 500         propout [edi +  4],      xmm5, xmm6
 501
 502         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 503         propout [edi +  8],      xmm6, xmm7
 504
 505         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 506         propout [edi + 12],      xmm7, xmm4
 507
 508         ret
 509 ENDFUNC
 510
 511 INTFUNC(mla4)
 512         // On entry, EDI points to the destination buffer, which also
 513         // contains an addend A to accumulate; EBX points to a packed operand
 514         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 515         // the incoming carry registers c0, c1, and c2, representing a
 516         // carry-in C; c3 is assumed to be zero.
 517         //
 518         // On exit, we write the low 128 bits of the sum A + C + X Y to
 519         // [EDI], and update the carry registers with the carry out.  The
 520         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 521         // general-purpose registers are preserved.
 522   endprologue
 523
 524         carryadd
 525
 526         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 527         propout [edi +  0],      xmm4, xmm5
 528
 529         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 530         propout [edi +  4],      xmm5, xmm6
 531
 532         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 533         propout [edi +  8],      xmm6, xmm7
 534
 535         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 536         propout [edi + 12],      xmm7, xmm4
 537
 538         ret
 539 ENDFUNC
 540
 541 INTFUNC(mmul4)
 542         // On entry, EDI points to the destination buffer; EAX and EBX point
 543         // to the packed operands U and N; ECX and ESI point to the expanded
 544         // operands V and M; and EDX points to a place to store an expanded
 545         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 546         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 547         //
 548         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 549         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 550         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 551         // XMM7 are clobbered; the general-purpose registers are preserved.
 552         stalloc 48 + 12                 // space for the carries
 553   endprologue
 554
 555         // Calculate W = U V, and leave it in the destination.  Stash the
 556         // carry pieces for later.
 557         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 558         propout [edi +  0],      xmm4, xmm5
 559         jmp     5f
 560 ENDFUNC
 561
 562 INTFUNC(mmla4)
 563         // On entry, EDI points to the destination buffer, which also
 564         // contains an addend A to accumulate; EAX and EBX point to the
 565         // packed operands U and N; ECX and ESI point to the expanded
 566         // operands V and M; and EDX points to a place to store an expanded
 567         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 568         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 569         //
 570         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 571         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 572         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 573         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 574         // preserved.
 575         stalloc 48 + 12                 // space for the carries
 576   endprologue
 577
 578         movd    xmm4, [edi +  0]
 579         movd    xmm5, [edi +  4]
 580         movd    xmm6, [edi +  8]
 581         movd    xmm7, [edi + 12]
 582
 583         // Calculate W = U V, and leave it in the destination.  Stash the
 584         // carry pieces for later.
 585         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 586         propout [edi +  0],      xmm4, xmm5
 587
 588 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 589         propout [edi +  4],      xmm5, xmm6
 590
 591         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 592         propout [edi +  8],      xmm6, xmm7
 593
 594         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 595         propout [edi + 12],      xmm7, xmm4
 596
 597         movdqa  [SP +  0], xmm4
 598         movdqa  [SP + 16], xmm5
 599         movdqa  [SP + 32], xmm6
 600
 601         // Calculate Y = W M.
 602         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 603
 604         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 605         accum                    xmm5, xmm6, xmm7
 606
 607         mulcore [edi +  8], esi, xmm0, xmm1
 608         accum                    xmm6, xmm7
 609
 610         mulcore [edi + 12], esi, xmm0
 611         accum                    xmm7
 612
 613         // That's lots of pieces.  Now we have to assemble the answer.
 614         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 615
 616         // Expand it.
 617         pxor    xmm2, xmm2
 618         expand  xmm2, xmm4, xmm1
 619         movdqa  [edx +  0], xmm4
 620         movdqa  [edx + 16], xmm1
 621
 622         // Initialize the carry from the value for W we calculated earlier.
 623         movd    xmm4, [edi +  0]
 624         movd    xmm5, [edi +  4]
 625         movd    xmm6, [edi +  8]
 626         movd    xmm7, [edi + 12]
 627
 628         // Finish the calculation by adding the Montgomery product.
 629         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 630         propout [edi +  0],      xmm4, xmm5
 631
 632         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 633         propout [edi +  4],      xmm5, xmm6
 634
 635         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 636         propout [edi +  8],      xmm6, xmm7
 637
 638         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 639         propout [edi + 12],      xmm7, xmm4
 640
 641         // Add add on the carry we calculated earlier.
 642         paddq   xmm4, [SP +  0]
 643         paddq   xmm5, [SP + 16]
 644         paddq   xmm6, [SP + 32]
 645
 646         // And, with that, we're done.
 647         stfree  48 + 12
 648         ret
 649 ENDFUNC
 650
 651 INTFUNC(mont4)
 652         // On entry, EDI points to the destination buffer holding a packed
 653         // value W; EBX points to a packed operand N; ESI points to an
 654         // expanded operand M; and EDX points to a place to store an expanded
 655         // result Y (32 bytes, at a 16-byte boundary).
 656         //
 657         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 658         // of the sum W + N Y to [EDI], leaving the remaining carry in
 659         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 660         // XMM7 are clobbered; the general-purpose registers are preserved.
 661   endprologue
 662
 663         // Calculate Y = W M.
 664         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 665
 666         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 667         accum                    xmm5, xmm6, xmm7
 668
 669         mulcore [edi +  8], esi, xmm0, xmm1
 670         accum                    xmm6, xmm7
 671
 672         mulcore [edi + 12], esi, xmm0
 673         accum                    xmm7
 674
 675         // That's lots of pieces.  Now we have to assemble the answer.
 676         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 677
 678         // Expand it.
 679         pxor    xmm2, xmm2
 680         expand  xmm2, xmm4, xmm1
 681         movdqa  [edx +  0], xmm4
 682         movdqa  [edx + 16], xmm1
 683
 684         // Initialize the carry from W.
 685         movd    xmm4, [edi +  0]
 686         movd    xmm5, [edi +  4]
 687         movd    xmm6, [edi +  8]
 688         movd    xmm7, [edi + 12]
 689
 690         // Finish the calculation by adding the Montgomery product.
 691         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 692         propout [edi +  0],      xmm4, xmm5
 693
 694         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 695         propout [edi +  4],      xmm5, xmm6
 696
 697         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 698         propout [edi +  8],      xmm6, xmm7
 699
 700         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 701         propout [edi + 12],      xmm7, xmm4
 702
 703         // And, with that, we're done.
 704         ret
 705 ENDFUNC
 706
 707 ///--------------------------------------------------------------------------
 708 /// Bulk multipliers.
 709
 710 FUNC(mpx_umul4_x86_avx)
 711         .arch   .avx
 712         vzeroupper
 713   endprologue
 714         // and drop through...
 715         .arch   pentium4
 716 ENDFUNC
 717
 718 FUNC(mpx_umul4_x86_sse2)
 719         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 720         //                         const mpw *bv, const mpw *bvl);
 721
 722         // Build a stack frame.  Arguments will be relative to BP, as
 723         // follows.
 724         //
 725         //      BP + 20 dv
 726         //      BP + 24 av
 727         //      BP + 28 avl
 728         //      BP + 32 bv
 729         //      BP + 36 bvl
 730         //
 731         // Locals are relative to SP, as follows.
 732         //
 733         //      SP +  0 expanded Y (32 bytes)
 734         //      SP + 32 (top of locals)
 735         pushreg BP
 736         pushreg ebx
 737         pushreg esi
 738         pushreg edi
 739         setfp
 740         stalloc 32
 741         and     SP, ~15
 742   endprologue
 743
 744         // Prepare for the first iteration.
 745         mov     esi, [BP + 32]          // -> bv[0]
 746         pxor    xmm7, xmm7
 747         movdqu  xmm0, [esi]             // bv[0]
 748         mov     edi, [BP + 20]          // -> dv[0]
 749         mov     ecx, edi                // outer loop dv cursor
 750         expand  xmm7, xmm0, xmm1
 751         mov     ebx, [BP + 24]          // -> av[0]
 752         mov     eax, [BP + 28]          // -> av[m] = av limit
 753         mov     edx, SP                 // -> expanded Y = bv[0]
 754         movdqa  [SP + 0], xmm0          // bv[0] expanded low
 755         movdqa  [SP + 16], xmm1         // bv[0] expanded high
 756         call    mul4zc
 757         add     ebx, 16
 758         add     edi, 16
 759         add     ecx, 16
 760         add     esi, 16
 761         cmp     ebx, eax                // all done?
 762         jae     8f
 763
 764         .p2align 4
 765         // Continue with the first iteration.
 766 0:      call    mul4
 767         add     ebx, 16
 768         add     edi, 16
 769         cmp     ebx, eax                // all done?
 770         jb      0b
 771
 772         // Write out the leftover carry.  There can be no tail here.
 773 8:      call    carryprop
 774         cmp     esi, [BP + 36]          // more passes to do?
 775         jae     9f
 776
 777         .p2align 4
 778         // Set up for the next pass.
 779 1:      movdqu  xmm0, [esi]             // bv[i]
 780         mov     edi, ecx                // -> dv[i]
 781         pxor    xmm7, xmm7
 782         expand  xmm7, xmm0, xmm1
 783         mov     ebx, [BP + 24]          // -> av[0]
 784         movdqa  [SP + 0], xmm0          // bv[i] expanded low
 785         movdqa  [SP + 16], xmm1         // bv[i] expanded high
 786         call    mla4zc
 787         add     edi, 16
 788         add     ebx, 16
 789         add     ecx, 16
 790         add     esi, 16
 791         cmp     ebx, eax                // done yet?
 792         jae     8f
 793
 794         .p2align 4
 795         // Continue...
 796 0:      call    mla4
 797         add     ebx, 16
 798         add     edi, 16
 799         cmp     ebx, eax
 800         jb      0b
 801
 802         // Finish off this pass.  There was no tail on the previous pass, and
 803         // there can be none on this pass.
 804 8:      call    carryprop
 805         cmp     esi, [BP + 36]
 806         jb      1b
 807
 808         // All over.
 809 9:      dropfp
 810         pop     edi
 811         pop     esi
 812         pop     ebx
 813         pop     BP
 814         ret
 815 ENDFUNC
 816
 817 FUNC(mpxmont_mul4_x86_avx)
 818         .arch   .avx
 819         vzeroupper
 820   endprologue
 821         // and drop through...
 822         .arch   pentium4
 823 ENDFUNC
 824
 825 FUNC(mpxmont_mul4_x86_sse2)
 826         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 827         //                           const mpw *nv, size_t n, const mpw *mi);
 828
 829         // Build a stack frame.  Arguments will be relative to BP, as
 830         // follows.
 831         //
 832         //      BP + 20 dv
 833         //      BP + 24 av
 834         //      BP + 28 bv
 835         //      BP + 32 nv
 836         //      BP + 36 n (nonzero multiple of 4)
 837         //      BP + 40 mi
 838         //
 839         // Locals are relative to SP, which 16-byte aligned, as follows.
 840         //
 841         //      SP +   0        expanded V (32 bytes)
 842         //      SP +  32        expanded M (32 bytes)
 843         //      SP +  64        expanded Y (32 bytes)
 844         //      SP +  96        outer loop dv
 845         //      SP + 100        outer loop bv
 846         //      SP + 104        av limit (mostly in ESI)
 847         //      SP + 108        bv limit
 848         //      SP + 112        (top of locals)
 849         pushreg BP
 850         pushreg ebx
 851         pushreg esi
 852         pushreg edi
 853         setfp
 854         stalloc 112
 855         and     SP, ~15
 856   endprologue
 857
 858         // Establish the expanded operands.
 859         pxor    xmm7, xmm7
 860         mov     ecx, [BP + 28]          // -> bv
 861         mov     edx, [BP + 40]          // -> mi
 862         movdqu  xmm0, [ecx]             // bv[0]
 863         movdqu  xmm2, [edx]             // mi
 864         expand  xmm7, xmm0, xmm1, xmm2, xmm3
 865         movdqa  [SP +  0], xmm0         // bv[0] expanded low
 866         movdqa  [SP + 16], xmm1         // bv[0] expanded high
 867         movdqa  [SP + 32], xmm2         // mi expanded low
 868         movdqa  [SP + 48], xmm3         // mi expanded high
 869
 870         // Set up the outer loop state and prepare for the first iteration.
 871         mov     edx, [BP + 36]          // n
 872         mov     eax, [BP + 24]          // -> U = av[0]
 873         mov     ebx, [BP + 32]          // -> X = nv[0]
 874         mov     edi, [BP + 20]          // -> Z = dv[0]
 875         mov     [SP + 100], ecx
 876         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 877         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 878         mov     [SP + 96], edi
 879         mov     [SP + 104], edx
 880         mov     [SP + 108], ecx
 881         lea     ecx, [SP + 0]           // -> expanded V = bv[0]
 882         lea     esi, [SP + 32]          // -> expanded M = mi
 883         lea     edx, [SP + 64]          // -> space for Y
 884         call    mmul4
 885         mov     esi, [SP + 104]         // recover av limit
 886         add     edi, 16
 887         add     eax, 16
 888         add     ebx, 16
 889         cmp     eax, esi                // done already?
 890         jae     8f
 891         mov     [SP + 96], edi
 892
 893         .p2align 4
 894         // Complete the first inner loop.
 895 0:      call    dmul4
 896         add     edi, 16
 897         add     eax, 16
 898         add     ebx, 16
 899         cmp     eax, esi                // done yet?
 900         jb      0b
 901
 902         // Still have carries left to propagate.
 903         call    carryprop
 904         movd    [edi + 16], xmm4
 905
 906         .p2align 4
 907         // Embark on the next iteration.  (There must be one.  If n = 1, then
 908         // we would have bailed above, to label 8.  Similarly, the subsequent
 909         // iterations can fall into the inner loop immediately.)
 910 1:      mov     eax, [SP + 100]         // -> bv[i - 1]
 911         mov     edi, [SP + 96]          // -> Z = dv[i]
 912         add     eax, 16                 // -> bv[i]
 913         pxor    xmm7, xmm7
 914         mov     [SP + 100], eax
 915         cmp     eax, [SP + 108]         // done yet?
 916         jae     9f
 917         movdqu  xmm0, [eax]             // bv[i]
 918         mov     ebx, [BP + 32]          // -> X = nv[0]
 919         lea     esi, [SP + 32]          // -> expanded M = mi
 920         mov     eax, [BP + 24]          // -> U = av[0]
 921         expand  xmm7, xmm0, xmm1
 922         movdqa  [SP + 0], xmm0          // bv[i] expanded low
 923         movdqa  [SP + 16], xmm1         // bv[i] expanded high
 924         call    mmla4
 925         mov     esi, [SP + 104]         // recover av limit
 926         add     edi, 16
 927         add     eax, 16
 928         add     ebx, 16
 929         mov     [SP + 96], edi
 930
 931         .p2align 4
 932         // Complete the next inner loop.
 933 0:      call    dmla4
 934         add     edi, 16
 935         add     eax, 16
 936         add     ebx, 16
 937         cmp     eax, esi
 938         jb      0b
 939
 940         // Still have carries left to propagate, and they overlap the
 941         // previous iteration's final tail, so read that in and add it.
 942         movd    xmm0, [edi]
 943         paddq   xmm4, xmm0
 944         call    carryprop
 945         movd    [edi + 16], xmm4
 946
 947         // Back again.
 948         jmp     1b
 949
 950         // First iteration was short.  Write out the carries and we're done.
 951         // (This could be folded into the main loop structure, but that would
 952         // penalize small numbers more.)
 953 8:      call    carryprop
 954         movd    [edi + 16], xmm4
 955
 956         // All done.
 957 9:      dropfp
 958         popreg  edi
 959         popreg  esi
 960         popreg  ebx
 961         popreg  BP
 962         ret
 963 ENDFUNC
 964
 965 FUNC(mpxmont_redc4_x86_avx)
 966         .arch   .avx
 967         vzeroupper
 968   endprologue
 969         // and drop through...
 970         .arch   pentium4
 971 ENDFUNC
 972
 973 FUNC(mpxmont_redc4_x86_sse2)
 974         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 975         //                             size_t n, const mpw *mi);
 976
 977         // Build a stack frame.  Arguments will be relative to BP, as
 978         // follows.
 979         //
 980         //      BP + 20 dv
 981         //      BP + 24 dvl
 982         //      BP + 28 nv
 983         //      BP + 32 n (nonzero multiple of 4)
 984         //      BP + 36 mi
 985         //
 986         // Locals are relative to SP, as follows.
 987         //
 988         //      SP +  0 outer loop dv
 989         //      SP +  4 outer dv limit
 990         //      SP +  8 blocks-of-4 dv limit
 991         //      SP + 12 expanded M (32 bytes)
 992         //      SP + 44 expanded Y (32 bytes)
 993         //      SP + 76 (top of locals)
 994         pushreg BP
 995         pushreg ebx
 996         pushreg esi
 997         pushreg edi
 998         setfp
 999         and     SP, ~15
1000         stalloc 76
1001   endprologue
1002
1003         // Establish the expanded operands and the blocks-of-4 dv limit.
1004         mov     edi, [BP + 20]          // -> Z = dv[0]
1005         pxor    xmm7, xmm7
1006         mov     eax, [BP + 24]          // -> dv[n] = dv limit
1007         sub     eax, edi                // length of dv in bytes
1008         mov     edx, [BP + 36]          // -> mi
1009         movdqu  xmm0, [edx]             // mi
1010         and     eax, ~15                // mask off the tail end
1011         expand  xmm7, xmm0, xmm1
1012         add     eax, edi                // find limit
1013         movdqa  [SP + 12], xmm0         // mi expanded low
1014         movdqa  [SP + 28], xmm1         // mi expanded high
1015         mov     [SP + 8], eax
1016
1017         // Set up the outer loop state and prepare for the first iteration.
1018         mov     ecx, [BP + 32]          // n
1019         mov     ebx, [BP + 28]          // -> X = nv[0]
1020         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
1021         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
1022         mov     [SP + 0], edi
1023         mov     [SP + 4], edx
1024         lea     esi, [SP + 12]          // -> expanded M = mi
1025         lea     edx, [SP + 44]          // -> space for Y
1026         call    mont4
1027         add     ebx, 16
1028         add     edi, 16
1029         cmp     ebx, ecx                // done already?
1030         jae     8f
1031
1032         .p2align 4
1033         // Complete the first inner loop.
1034 5:      call    mla4
1035         add     ebx, 16
1036         add     edi, 16
1037         cmp     ebx, ecx                // done yet?
1038         jb      5b
1039
1040         // Still have carries left to propagate.
1041 8:      carryadd
1042         mov     esi, [SP + 8]           // -> dv blocks limit
1043         mov     edx, [BP + 24]          // dv limit
1044         psllq   xmm7, 16
1045         pslldq  xmm7, 8
1046         paddq   xmm6, xmm7
1047         call    carryprop
1048         movd    eax, xmm4
1049         add     edi, 16
1050         cmp     edi, esi
1051         jae     7f
1052
1053         .p2align 4
1054         // Continue carry propagation until the end of the buffer.
1055 0:      add     [edi], eax
1056         mov     eax, 0                  // preserves flags
1057         adcd    [edi + 4], 0
1058         adcd    [edi + 8], 0
1059         adcd    [edi + 12], 0
1060         adc     eax, 0
1061         add     edi, 16
1062         cmp     edi, esi
1063         jb      0b
1064
1065         // Deal with the tail end.  Note that the actual destination length
1066         // won't be an exact number of blocks of four, so it's safe to just
1067         // drop through here.
1068 7:      add     [edi], eax
1069         mov     eax, 0
1070         add     edi, 4
1071         adc     eax, 0
1072         cmp     edi, edx
1073         jb      7b
1074
1075         // All done for this iteration.  Start the next.
1076 8:      mov     edi, [SP + 0]           // -> dv[i - 1]
1077         mov     ebx, [BP + 28]          // -> X = nv[0]
1078         lea     edx, [SP + 44]          // -> space for Y
1079         lea     esi, [SP + 12]          // -> expanded M = mi
1080         add     edi, 16                 // -> Z = dv[i]
1081         cmp     edi, [SP + 4]           // all done yet?
1082         jae     9f
1083         mov     [SP + 0], edi
1084         call    mont4
1085         add     edi, 16
1086         add     ebx, 16
1087         jmp     5b
1088
1089         // All over.
1090 9:      dropfp
1091         popreg  edi
1092         popreg  esi
1093         popreg  ebx
1094         popreg  BP
1095         ret
1096 ENDFUNC
1097
1098 ///--------------------------------------------------------------------------
1099 /// Testing and performance measurement.
1100
1101 #ifdef TEST_MUL4
1102
1103 .macro  cysetup c
1104         rdtsc
1105         mov     [\c], eax
1106         mov     [\c + 4], edx
1107 .endm
1108
1109 .macro  cystore c, v, n
1110         rdtsc
1111         sub     eax, [\c]
1112         sbb     edx, [\c + 4]
1113         mov     ebx, [\v]
1114         mov     ecx, [\n]
1115         dec     ecx
1116         mov     [\n], ecx
1117         mov     [ebx + ecx*8], eax
1118         mov     [ebx + ecx*8 + 4], edx
1119 .endm
1120
1121 .macro  testprologue n
1122         pushreg BP
1123         pushreg ebx
1124         pushreg esi
1125         pushreg edi
1126         setfp
1127         stalloc 3*32 + 4*4
1128         and     SP, ~15
1129   endprologue
1130         mov     eax, \n
1131         mov     [SP + 104], eax
1132         // vars:
1133         //      SP +   0 = v expanded
1134         //      SP +  32 = y expanded
1135         //      SP +  64 = ? expanded
1136         //      SP +  96 = cycles
1137         //      SP + 104 = count
1138 .endm
1139
1140 .macro  testepilogue
1141         dropfp
1142         popreg  edi
1143         popreg  esi
1144         popreg  ebx
1145         popreg  BP
1146         ret
1147 .endm
1148
1149 .macro  testldcarry c
1150         mov     ecx, \c                 // -> c
1151         movdqu  xmm4, [ecx +  0]        // (c'_0; c''_0)
1152         movdqu  xmm5, [ecx + 16]        // (c'_1; c''_1)
1153         movdqu  xmm6, [ecx + 32]        // (c'_2; c''_2)
1154 .endm
1155
1156 .macro  testexpand v=nil, y=nil
1157         pxor    xmm7, xmm7
1158   .ifnes "\v", "nil"
1159         mov     ecx, \v
1160         movdqu  xmm0, [ecx]
1161         expand  xmm7, xmm0, xmm1
1162         movdqa  [SP +  0], xmm0
1163         movdqa  [SP + 16], xmm1
1164   .endif
1165   .ifnes "\y", "nil"
1166         mov     edx, \y
1167         movdqu  xmm2, [edx]
1168         expand  xmm7, xmm2, xmm3
1169         movdqa  [SP + 32], xmm2
1170         movdqa  [SP + 48], xmm3
1171   .endif
1172 .endm
1173
1174 .macro  testtop u=nil, x=nil, mode=nil
1175         .p2align 4
1176 0:
1177   .ifnes "\u", "nil"
1178         lea     ecx, [SP + 0]
1179   .endif
1180         mov     ebx, \x
1181   .ifeqs "\mode", "mont"
1182         lea     esi, [SP + 32]
1183   .endif
1184         cysetup SP + 96
1185   .ifnes "\u", "nil"
1186         mov     eax, \u
1187   .endif
1188   .ifeqs "\mode", "mont"
1189         lea     edx, [SP + 64]
1190   .else
1191         lea     edx, [SP + 32]
1192   .endif
1193 .endm
1194
1195 .macro  testtail cyv
1196         cystore SP + 96, \cyv, SP + 104
1197         jnz     0b
1198 .endm
1199
1200 .macro  testcarryout c
1201         mov     ecx, \c
1202         movdqu  [ecx +  0], xmm4
1203         movdqu  [ecx + 16], xmm5
1204         movdqu  [ecx + 32], xmm6
1205 .endm
1206
1207 FUNC(test_dmul4)
1208         testprologue [BP + 44]
1209         testldcarry [BP + 24]
1210         testexpand [BP + 36], [BP + 40]
1211         mov     edi, [BP + 20]
1212         testtop [BP + 28], [BP + 32]
1213         call    dmul4
1214         testtail [BP + 48]
1215         testcarryout [BP + 24]
1216         testepilogue
1217 ENDFUNC
1218
1219 FUNC(test_dmla4)
1220         testprologue [BP + 44]
1221         testldcarry [BP + 24]
1222         testexpand [BP + 36], [BP + 40]
1223         mov     edi, [BP + 20]
1224         testtop [BP + 28], [BP + 32]
1225         call    dmla4
1226         testtail [BP + 48]
1227         testcarryout [BP + 24]
1228         testepilogue
1229 ENDFUNC
1230
1231 FUNC(test_mul4)
1232         testprologue [BP + 36]
1233         testldcarry [BP + 24]
1234         testexpand nil, [BP + 32]
1235         mov     edi, [BP + 20]
1236         testtop nil, [BP + 28]
1237         call    mul4
1238         testtail [BP + 40]
1239         testcarryout [BP + 24]
1240         testepilogue
1241 ENDFUNC
1242
1243 FUNC(test_mul4zc)
1244         testprologue [BP + 36]
1245         testldcarry [BP + 24]
1246         testexpand nil, [BP + 32]
1247         mov     edi, [BP + 20]
1248         testtop nil, [BP + 28]
1249         call    mul4zc
1250         testtail [BP + 40]
1251         testcarryout [BP + 24]
1252         testepilogue
1253 ENDFUNC
1254
1255 FUNC(test_mla4)
1256         testprologue [BP + 36]
1257         testldcarry [BP + 24]
1258         testexpand nil, [BP + 32]
1259         mov     edi, [BP + 20]
1260         testtop nil, [BP + 28]
1261         call    mla4
1262         testtail [BP + 40]
1263         testcarryout [BP + 24]
1264         testepilogue
1265 ENDFUNC
1266
1267 FUNC(test_mla4zc)
1268         testprologue [BP + 36]
1269         testldcarry [BP + 24]
1270         testexpand nil, [BP + 32]
1271         mov     edi, [BP + 20]
1272         testtop nil, [BP + 28]
1273         call    mla4zc
1274         testtail [BP + 40]
1275         testcarryout [BP + 24]
1276         testepilogue
1277 ENDFUNC
1278
1279 FUNC(test_mmul4)
1280         testprologue [BP + 48]
1281         testexpand [BP + 40], [BP + 44]
1282         mov     edi, [BP + 20]
1283         testtop [BP + 32], [BP + 36], mont
1284         call    mmul4
1285         testtail [BP + 52]
1286         mov     edi, [BP + 28]
1287         movdqa  xmm0, [SP + 64]
1288         movdqa  xmm1, [SP + 80]
1289         pshufd  xmm0, xmm0, SHUF(0, 2, 1, 3)
1290         pshufd  xmm1, xmm1, SHUF(0, 2, 1, 3)
1291         movdqu  [edi], xmm0
1292         movdqu  [edi + 16], xmm1
1293         testcarryout [BP + 24]
1294         testepilogue
1295 ENDFUNC
1296
1297 FUNC(test_mmla4)
1298         testprologue [BP + 48]
1299         testexpand [BP + 40], [BP + 44]
1300         mov     edi, [BP + 20]
1301         testtop [BP + 32], [BP + 36], mont
1302         call    mmla4
1303         testtail [BP + 52]
1304         mov     edi, [BP + 28]
1305         movdqa  xmm0, [SP + 64]
1306         movdqa  xmm1, [SP + 80]
1307         pshufd  xmm0, xmm0, SHUF(0, 2, 1, 3)
1308         pshufd  xmm1, xmm1, SHUF(0, 2, 1, 3)
1309         movdqu  [edi], xmm0
1310         movdqu  [edi + 16], xmm1
1311         testcarryout [BP + 24]
1312         testepilogue
1313 ENDFUNC
1314
1315 FUNC(test_mont4)
1316         testprologue [BP + 40]
1317         testexpand nil, [BP + 36]
1318         mov     edi, [BP + 20]
1319         testtop nil, [BP + 32], mont
1320         call    mont4
1321         testtail [BP + 44]
1322         mov     edi, [BP + 28]
1323         movdqa  xmm0, [SP + 64]
1324         movdqa  xmm1, [SP + 80]
1325         pshufd  xmm0, xmm0, SHUF(0, 2, 1, 3)
1326         pshufd  xmm1, xmm1, SHUF(0, 2, 1, 3)
1327         movdqu  [edi], xmm0
1328         movdqu  [edi + 16], xmm1
1329         testcarryout [BP + 24]
1330         testepilogue
1331 ENDFUNC
1332
1333 #endif
1334
1335 ///----- That's all, folks --------------------------------------------------