chiark - git - mdw - catacomb/blob - math/mpx-mul4-amd64-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   pentium4                // baseline ISA; switched temporarily around the AVX entry point
  34
  35         .text
  36
  37 ///--------------------------------------------------------------------------
  38 /// Theory.
  39 ///
  40 /// We define a number of primitive fixed-size multipliers from which we can
  41 /// construct more general variable-length multipliers.
  42 ///
  43 /// The basic trick is the same throughout.  In an operand-scanning
  44 /// multiplication, the inner multiplication loop multiplies a
  45 /// multiple-precision operand by a single precision factor, and adds the
  46 /// result, appropriately shifted, to the result.  A `finely integrated
  47 /// operand scanning' implementation of Montgomery multiplication also adds
  48 /// the product of a single-precision `Montgomery factor' and the modulus,
  49 /// calculated in the same pass.  The more common `coarsely integrated
  50 /// operand scanning' alternates main multiplication and Montgomery passes,
  51 /// which requires additional carry propagation.
  52 ///
  53 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  54 /// the factors remains constant throughout the operation, so we can afford
  55 /// to take a little time to preprocess it.  The transformation we perform is
  56 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  60 /// operands, as follows.
  61 ///
  62 ///     Offset     0       4        8      12
  63 ///        0    v'_0    v'_1    v''_0   v''_1
  64 ///       16    v'_2    v'_3    v''_2   v''_3
  65 ///
  66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  67 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  70 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  71 /// many products together before we must deal with carrying; it also allows
  72 /// for some calculations to be performed on the above expanded form.
  73 ///
  74 /// ...
  75 ///
  76 /// We maintain four `carry' registers accumulating intermediate results.
  77 /// The registers' precise roles rotate during the computation; we name them
  78 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
  83 /// lanes of its vector) and an operand in the expanded form above produces a
  84 /// result which can be added directly to the appropriate carry register.
  85 /// Following a pass of four multiplications, we perform some limited carry
  86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  89 /// zeroed and becomes c3.
  90
  91 ///--------------------------------------------------------------------------
  92 /// Macro definitions.
  93
  94 .macro  mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
  95         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
  96         // of the product in registers D0, D1, D2, D3.
             //
             // R holds a packed operand and I selects which of its 32-bit words
             // is the multiplier.  Each of D1, D2, D3 may be `nil', in which
             // case that piece of the product is simply not computed.  The
             // destinations are overwritten, not accumulated into.  SLO and
             // SHI are still read after destinations have been written, so
             // they must not be named as destinations.
             //
             // The work for the individual pieces is interleaved rather than
             // done one piece at a time (presumably to help instruction
             // scheduling -- the order is deliberate, so keep it).
  97         pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
  98   .ifnes "\d1", "nil"
  99         movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
 100   .endif
 101   .ifnes "\d3", "nil"
 102         movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
 103   .endif
 104   .ifnes "\d1", "nil"
 105         psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
 106   .endif
 107   .ifnes "\d2", "nil"
 108         movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
 109   .endif
 110   .ifnes "\d3", "nil"
 111         psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
 112   .endif
 113   .ifnes "\d1", "nil"
 114         pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
 115   .endif
 116   .ifnes "\d3", "nil"
 117         pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
 118   .endif
 119   .ifnes "\d2", "nil"
 120         pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
 121   .endif
 122         pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
 123 .endm
 124
 125 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 126         // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
 127         // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
 128         // updating that register.
             //
             // The source registers are fixed: XMM0 is added to C0, XMM1 to C1,
             // XMM2 to C2 and XMM3 to C3.  Both 64-bit lanes are added
             // independently.  XMM0--XMM3 themselves are only read.
 129         paddq   \c0, xmm0
 130   .ifnes "\c1", "nil"
 131         paddq   \c1, xmm1
 132   .endif
 133   .ifnes "\c2", "nil"
 134         paddq   \c2, xmm2
 135   .endif
 136   .ifnes "\c3", "nil"
 137         paddq   \c3, xmm3
 138   .endif
 139 .endm
 140
 141 .macro  mulacc  r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
 142         // Form the product of word I of R with the expanded operand SLO/SHI
 143         // and fold its four pieces into the carry registers C0, C1, C2, C3.
 144         // Normally every piece is added in, using XMM0--XMM3 as scratch.
 145         // When Z3P is `t', C3 is treated as holding zero: its piece is
 146         // written straight into C3 and XMM3 is left untouched.
 147   .ifnes "\z3p", "t"
 148         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
 149         accum                       \c0,  \c1,  \c2,  \c3
 150   .else
 151         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
 152         accum                       \c0,  \c1,  \c2
 153   .endif
 154 .endm
 155
 156 .macro  propout d, pos, c, cc=nil
 157         // Calculate an output word from C, and store it at POS in D;
 158         // propagate carries out from C to CC in preparation for a rotation
 159         // of the carry registers.  D is an XMM register; the POS is either
 160         // `lo' or `hi' according to whether the output word should be in
 161         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 162         // completion, XMM3 is clobbered.  If CC is `nil', then the
 163         // contribution which would have been added to it is left in C.
             //
             // With POS = `lo', D is overwritten outright.  With POS = `hi', the
             // word already in lane 0 of D is kept and the new word is
             // interleaved in beside it, so the `lo' word for a given D must be
             // produced first.  C itself is modified in either case.
 164         pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
 165         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
 166         pslldq  xmm3, 2                 // (t b; 0)
 167         paddq   \c, xmm3                // (c' + t b; c'')
 168   .ifeqs "\pos", "lo"
 169         movdqa  \d, \c                  // lane 0 of D = output word
 170   .else
 171         punpckldq \d, \c                // lane 1 of D = output word; lane 0 kept
 172   .endif
 173         psrlq   \c, 32                  // floor(c/B)
 174   .ifnes "\cc", "nil"
 175         paddq   \cc, \c                 // propagate up
 176   .endif
 177 .endm
 178
 179 .macro endprop d, pos, c, t
 180         // On entry, C contains a carry register.  On exit, the low 32 bits
 181         // of the value represented in C are written at POS in D, and the
 182         // remaining bits are left at the bottom of T.
             //
             // POS is `lo' (overwrite D, word in lane 0) or `hi' (keep lane 0 of
             // D and interleave the word into lane 1).  C is modified, and T is
             // an output; T may not be the same register as C, because C is
             // still read after T has been written.
 183         movdqa  \t, \c
 184         psllq   \t, 16                  // (?; c'' b)
 185         pslldq  \c, 8                   // (0; c')
 186         paddq   \t, \c                  // (?; c' + c'' b)
 187         psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
 188   .ifeqs "\pos", "lo"
 189         movdqa  \d, \t
 190   .else
 191         punpckldq \d, \t
 192   .endif
 193         psrldq  \t, 4                   // (floor(c/B); 0)
 194 .endm
 195
 196 .macro  expand  z, a, b, c=nil, d=nil
 197         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 198         // exit, A:B and C:D together hold the same values in expanded
 199         // form.  If C is `nil', then only expand A to A:B.
             //
             // B (and D, when C is given) are pure outputs and are overwritten.
             // Z is only read; it supplies the zero words that are interleaved
             // with the 16-bit halves, so it really must be all zeros.
 200         movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
 201   .ifnes "\c", "nil"
 202         movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
 203   .endif
 204         punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
 205         punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
 206   .ifnes "\c", "nil"
 207         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
 208         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
 209   .endif
 210         pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
 211         pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 212   .ifnes "\c", "nil"
 213         pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
 214         pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 215   .endif
 216 .endm
 217
 218 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 219         // On entry, C0, C1, C2, C3 are carry registers representing a value
 220         // Y.  On exit, LO holds the low 128 bits of the carry value; C0, C1,
 221         // C2, C3, T, and U are clobbered; and the high bits of Y are stored
 222         // in HI, if this is not `nil'.
 223
 224         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 225         // the ones offset by 16 bits from a 32-bit boundary -- by carrying
 226         // them into the 32-bit-aligned pieces above and below.  But before
 227         // we can do that, we must gather them together.
 228         movdqa  \t, \c0
 229         movdqa  \u, \c1
 230         punpcklqdq \t, \c2              // (y'_0; y'_2)
 231         punpckhqdq \c0, \c2             // (y''_0; y''_2)
 232         punpcklqdq \u, \c3              // (y'_1; y'_3)
 233         punpckhqdq \c1, \c3             // (y''_1; y''_3)
 234
 235         // Now split the double-prime pieces.  The high (up to) 48 bits will
 236         // go up; the low 16 bits go down.
 237         movdqa  \c2, \c0
 238         movdqa  \c3, \c1
 239         psllq   \c2, 48
 240         psllq   \c3, 48
 241         psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
 242         psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
 243         psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
 244         psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
 245   .ifnes "\hi", "nil"
 246         movdqa  \hi, \c1
 247   .endif
 248         pslldq  \c1, 8                  // high part of (0; y''_1)
 249
 250         paddq   \t, \c2                 // propagate down
 251         paddq   \u, \c3
 252         paddq   \t, \c1                 // and up: (y_0; y_2)
 253         paddq   \u, \c0                 // (y_1; y_3)
 254   .ifnes "\hi", "nil"
 255         psrldq  \hi, 8                  // high part of (y''_3; 0)
 256   .endif
 257
 258         // Finally extract the answer.  This complicated dance is better than
 259         // storing to memory and loading, because the piecemeal stores
 260         // inhibit store forwarding.
 261         movdqa  \c3, \t                 // (y_0; ?)
 262         movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
 263         psrldq  \t, 8                   // (y_2; 0)
 264         psrlq   \c3, 32                 // (floor(y_0/B); ?)
 265         paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
 266         movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
 267         psrldq  \u, 8                   // (y_3; 0)
 268         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2); ?)
 269         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2); ?)
 270         punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
 271         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 272         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 273   .ifnes "\hi", "nil"
 274         movdqa  \t, \c3
 275         pxor    \u, \u
 276   .endif
 277         punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
 278   .ifnes "\hi", "nil"
 279         psrlq   \t, 32                  // very high bits of y
 280         paddq   \hi, \t
 281         punpcklqdq \hi, \u              // carry up
 282   .endif
 283         punpckldq \lo, \c1              // y mod B^4
 284 .endm
 285
 286 .macro  carryadd
 287         // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
 288         // hold the incoming carry registers c0, c1, and c2 representing a
 289         // carry-in C.
 290         //
 291         // On exit, the carry registers, including XMM15, are updated to hold
 292         // C + A; XMM0, XMM1, and XMM2 are clobbered.  The other
 293         // registers are preserved.
             //
             // XMM15 is loaded rather than added to: any previous contents are
             // discarded, which is only right if c3 was (notionally) zero.
 294         movd    xmm0, [rdi +  0]        // (a_0; 0)
 295         movd    xmm1, [rdi +  4]        // (a_1; 0)
 296         movd    xmm2, [rdi +  8]        // (a_2; 0)
 297         movd    xmm15, [rdi + 12]       // (a_3; 0)
 298         paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
 299         paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
 300         paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2)
 301 .endm
 302
 303 ///--------------------------------------------------------------------------
 304 /// Primitive multipliers and related utilities.
 305
 306 INTFUNC(carryprop)
 307         // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
 308         // expanded form.  Store the low 128 bits of the represented carry to
 309         // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
 310         // in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13 and
 311         // XMM14 are clobbered.
             //
             // RDI is only used as the store address and is not modified; the
             // store is unaligned-safe (`movdqu').
 312   endprologue
 313
 314         propout xmm0, lo, xmm12, xmm13  // word 0 -> XMM0 lane 0; carry into c1
 315         propout xmm1, lo, xmm13, xmm14  // word 1 -> XMM1 lane 0; carry into c2
 316         propout xmm0, hi, xmm14, nil    // word 2 -> XMM0 lane 1; carry stays in c2
 317         endprop xmm1, hi, xmm14, xmm12  // word 3 -> XMM1 lane 1; leftover -> XMM12
 318         punpckldq xmm0, xmm1            // interleave into word order 0, 1, 2, 3
 319         movdqu  [rdi], xmm0
 320
 321         ret
 322 ENDFUNC
 323
 324 INTFUNC(dmul4)
 325         // On entry, RDI points to the destination buffer; RAX and RBX point
 326         // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 327         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 328         // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
 329         //
 330         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 331         // [RDI], and update the carry registers with the carry out.  The
 332         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 333         // registers are preserved.
 334   endprologue
 335
 336         movdqu  xmm4, [rax]             // packed U
 337         movdqu  xmm5, [rbx]             // packed X
 338
             // Four steps, one per word of U and X.  Each step adds both
             // products into the carry registers, emits one output word, and
             // then the roles of the carry registers rotate by one place.  The
             // first `mulacc' of each step passes `t' because the register
             // currently playing c3 holds nothing live and is simply written.
 339         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15, t
 340         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 341         propout xmm6, lo,                xmm12, xmm13
 342
 343         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 344         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 345         propout xmm7, lo,                xmm13, xmm14
 346
 347         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 348         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 349         propout xmm6, hi,                xmm14, xmm15
 350
 351         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 352         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 353         propout xmm7, hi,                xmm15, xmm12
 354
             // XMM6 holds output words 0 and 2, XMM7 holds words 1 and 3;
             // interleave them into memory order.  After four rotations the
             // carry-out is back in XMM12, XMM13, XMM14.
 355         punpckldq xmm6, xmm7
 356         movdqu  [rdi], xmm6
 357
 358         ret
 359 ENDFUNC
 360
 361 INTFUNC(dmla4)
 362         // On entry, RDI points to the destination buffer, which also
 363         // contains an addend A to accumulate; RAX and RBX point to the
 364         // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 365         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 366         // incoming carry registers c0, c1, and c2 representing a carry-in C;
 367         // c3 is assumed to be zero.
 368         //
 369         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 370         // [RDI], and update the carry registers with the carry out.  The
 371         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 372         // registers are preserved.
 373   endprologue
 374
 375         movdqu  xmm4, [rax]             // packed U
 376         movdqu  xmm5, [rbx]             // packed X
 377         carryadd                        // fold A into c0--c2; XMM15 := (a_3; 0)
 378
             // The very first `mulacc' does not pass `t': XMM15 now holds a_3
             // from `carryadd' and must be added to, not overwritten.
 379         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 380         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 381         propout xmm6, lo,                xmm12, xmm13
 382
 383         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 384         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 385         propout xmm7, lo,                xmm13, xmm14
 386
 387         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 388         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 389         propout xmm6, hi,                xmm14, xmm15
 390
 391         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 392         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 393         propout xmm7, hi,                xmm15, xmm12
 394
             // XMM6 = output words 0 and 2, XMM7 = words 1 and 3; interleave
             // into memory order and overwrite the addend in place.
 395         punpckldq xmm6, xmm7
 396         movdqu  [rdi], xmm6
 397
 398         ret
 399 ENDFUNC
 400
 401 INTFUNC(mul4zc)
 402         // On entry, RDI points to the destination buffer; RBX points to a
 403         // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
 404         //
 405         // On exit, we write the low 128 bits of the product X Y to [RDI],
 406         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 407         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 408         // general-purpose registers are preserved.
             //
             // This is the `zero carry-in' variant: the incoming contents of
             // XMM12--XMM15 are ignored.
 409   endprologue
 410
 411         movdqu  xmm5, [rbx]             // packed X
 412
             // No carry-in, so the first product is written straight into the
             // carry registers with `mulcore' instead of being accumulated.
 413         mulcore xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 414         propout xmm6, lo,                xmm12, xmm13
 415
 416         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 417         propout xmm7, lo,                xmm13, xmm14
 418
 419         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 420         propout xmm6, hi,                xmm14, xmm15
 421
 422         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 423         propout xmm7, hi,                xmm15, xmm12
 424
             // XMM6 = output words 0 and 2, XMM7 = words 1 and 3.
 425         punpckldq xmm6, xmm7
 426         movdqu  [rdi], xmm6
 427
 428         ret
 429 ENDFUNC
 430
 431 INTFUNC(mul4)
 432         // On entry, RDI points to the destination buffer; RBX points to a
 433         // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
 434         // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
 435         // c2, representing a carry-in C; c3 is assumed to be zero.
 436         //
 437         // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
 438         // and update the carry registers with the carry out.  The registers
 439         // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 440         // general-purpose registers are preserved.
 441   endprologue
 442
 443         movdqu  xmm5, [rbx]             // packed X
 444
             // One step per word of X.  Every `mulacc' passes `t': the register
             // playing c3 never holds anything live on entry to a step, so it
             // is written rather than added to.
 445         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
 446         propout xmm6, lo,                xmm12, xmm13
 447
 448         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 449         propout xmm7, lo,                xmm13, xmm14
 450
 451         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 452         propout xmm6, hi,                xmm14, xmm15
 453
 454         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 455         propout xmm7, hi,                xmm15, xmm12
 456
             // XMM6 = output words 0 and 2, XMM7 = words 1 and 3.
 457         punpckldq xmm6, xmm7
 458         movdqu  [rdi], xmm6
 459
 460         ret
 461 ENDFUNC
 462
 463 INTFUNC(mla4zc)
 464         // On entry, RDI points to the destination buffer, which also
 465         // contains an addend A to accumulate; RBX points to a packed operand
 466         // X; and XMM10/XMM11 hold an expanded operand Y.
 467         //
 468         // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
 469         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 470         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 471         // general-purpose registers are preserved.
 472   endprologue
 473
 474         movdqu  xmm5, [rbx]             // packed X
             // There is no carry-in, so seed the carry registers directly with
             // the words of the addend.
 475         movd    xmm12, [rdi +  0]       // (a_0; 0)
 476         movd    xmm13, [rdi +  4]       // (a_1; 0)
 477         movd    xmm14, [rdi +  8]       // (a_2; 0)
 478         movd    xmm15, [rdi + 12]       // (a_3; 0)
 479
             // XMM15 holds a_3, so the first `mulacc' must add rather than
             // overwrite: no `t' here.
 480         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 481         propout xmm6, lo,                xmm12, xmm13
 482
 483         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 484         propout xmm7, lo,                xmm13, xmm14
 485
 486         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 487         propout xmm6, hi,                xmm14, xmm15
 488
 489         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 490         propout xmm7, hi,                xmm15, xmm12
 491
             // XMM6 = output words 0 and 2, XMM7 = words 1 and 3.
 492         punpckldq xmm6, xmm7
 493         movdqu  [rdi], xmm6
 494
 495         ret
 496 ENDFUNC
 497
 498 INTFUNC(mla4)
 499         // On entry, RDI points to the destination buffer, which also
 500         // contains an addend A to accumulate; RBX points to a packed operand
 501         // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
 502         // XMM14 hold the incoming carry registers c0, c1, and c2,
 503         // representing a carry-in C; c3 is assumed to be zero.
 504         //
 505         // On exit, we write the low 128 bits of the sum A + C + X Y to
 506         // [RDI], and update the carry registers with the carry out.  The
 507         // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 508         // general-purpose registers are preserved.
 509   endprologue
 510
 511         movdqu  xmm5, [rbx]             // packed X
 512         carryadd                        // fold A into c0--c2; XMM15 := (a_3; 0)
 513
             // XMM15 holds a_3 after `carryadd', so the first `mulacc' adds to
             // it (no `t'); the later steps overwrite their c3 as usual.
 514         mulacc  xmm5, 0,    xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 515         propout xmm6, lo,                xmm12, xmm13
 516
 517         mulacc  xmm5, 1,    xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 518         propout xmm7, lo,                xmm13, xmm14
 519
 520         mulacc  xmm5, 2,    xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 521         propout xmm6, hi,                xmm14, xmm15
 522
 523         mulacc  xmm5, 3,    xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 524         propout xmm7, hi,                xmm15, xmm12
 525
             // XMM6 = output words 0 and 2, XMM7 = words 1 and 3.
 526         punpckldq xmm6, xmm7
 527         movdqu  [rdi], xmm6
 528
 529         ret
 530 ENDFUNC
 531
 532 INTFUNC(mmul4)
 533         // On entry, RDI points to the destination buffer; RAX and RBX point
 534         // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
 535         // the expanded operands V and M.  The stack pointer must be 8 modulo 16
 536         // (as usual for AMD64 ABIs).
 537         //
 538         // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
 539         // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
 540         // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7, and
 541         // XMM15 are clobbered; the general-purpose registers are preserved.
             //
             // This entry point only performs the first step (word 0 of U) and
             // then jumps to the local label `5', which is defined inside
             // `mmla4' below; all remaining work, the stack release on Windows,
             // and the `ret' happen there.  The stack allocation made here must
             // therefore stay the same size as the one in `mmla4'.
 542         movdqu  xmm4, [rax]
 543 #if ABI_WIN
 544         stalloc 48 + 8                  // space for the carries
 545 #endif
 546   endprologue
 547
 548         // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
 549         // for later.
             // There is no addend, so the first product is written directly
             // into the carry registers.
 550         mulcore xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 551         propout xmm7, lo,                xmm12, xmm13
 552         jmp     5f
 553 ENDFUNC
 554
 555 INTFUNC(mmla4)
 556         // On entry, RDI points to the destination buffer, which also
 557         // contains an addend A to accumulate; RAX and RBX point to the
 558         // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
 559         // expanded operands V and M.  The stack pointer must be 8 modulo 16
 560         // (as usual for AMD64 ABIs).
 561         //
 562         // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
 563         // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
 564         // remaining carry in XMM12, XMM13, and XMM14.  The registers
 565         // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
 566         // are preserved.
             //
             // Three 16-byte scratch slots, addressed as STKTMP(0), STKTMP(16)
             // and STKTMP(32), hold the carry from the first multiplication
             // while the Montgomery factor is computed.  On Windows they live
             // in the 56 bytes allocated here; on SysV they live below the
             // stack pointer.  They are accessed with aligned moves, which is
             // why the stack-pointer alignment above matters.  `mmul4' joins
             // this code at the local label `5'.
 567         movdqu  xmm4, [rax]
 568 #if ABI_WIN
 569         stalloc 48 + 8                  // space for the carries
 570 #  define STKTMP(i) [SP + i]
 571 #endif
 572 #if ABI_SYSV
 573 #  define STKTMP(i) [SP + i - 48 - 8]   // use red zone
 574 #endif
 575   endprologue
 576
             // Seed the carry registers with the words of the addend A.
 577         movd    xmm12, [rdi +  0]
 578         movd    xmm13, [rdi +  4]
 579         movd    xmm14, [rdi +  8]
 580         movd    xmm15, [rdi + 12]
 581
 582         // Calculate W = A + U V, and leave it in XMM7.  Stash the carry
 583         // pieces for later.
 584         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 585         propout xmm7, lo,                xmm12, xmm13
 586
             // Shared tail: `mmul4' jumps in here after its own first step.
 587 5:      mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 588         propout xmm6, lo,                xmm13, xmm14
 589
 590         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 591         propout xmm7, hi,                xmm14, xmm15
 592
 593         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 594         propout xmm6, hi,                xmm15, xmm12
 595
 596         // Prepare W, and stash carries for later.
 597         punpckldq xmm7, xmm6
 598         movdqa  STKTMP( 0), xmm12
 599         movdqa  STKTMP(16), xmm13
 600         movdqa  STKTMP(32), xmm14
 601
 602         // Calculate Y = W M.  We just about have enough spare registers to
 603         // make this work.
 604         mulcore xmm7, 0,   xmm10, xmm11, xmm3,  xmm4,  xmm5,  xmm6
 605
 606         // Start expanding W back into the main carry registers...
 607         pxor    xmm15, xmm15
 608         movdqa  xmm12, xmm7
 609         movdqa  xmm14, xmm7
 610
 611         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
 612         accum                            xmm4,  xmm5,  xmm6
 613
 614         punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
 615         punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
 616
 617         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
 618         accum                            xmm5,  xmm6
 619
 620         pxor    xmm2, xmm2
 621         movdqa  xmm13, xmm12
 622         movdqa  xmm15, xmm14
 623
 624         mulcore xmm7, 3,   xmm10, xmm11, xmm0
 625         accum                            xmm6
 626
 627         punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
 628         punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
 629         punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
 630         punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
 631
 632         // That's lots of pieces.  Now we have to assemble the answer.
 633         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 634
 635         // Expand it.  XMM2 is still zero from the `pxor' above.
 636         movdqu  xmm5, [rbx]
 637         expand  xmm2, xmm10, xmm11
 638
 639         // Finish the calculation by adding the Montgomery product.
 640         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 641         propout xmm6, lo,                xmm12, xmm13
 642
 643         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 644         propout xmm7, lo,                xmm13, xmm14
 645
 646         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 647         propout xmm6, hi,                xmm14, xmm15
 648
 649         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 650         propout xmm7, hi,                xmm15, xmm12
 651
 652         punpckldq xmm6, xmm7
 653
 654         // Add on the carry we calculated earlier.
 655         paddq   xmm12, STKTMP( 0)
 656         paddq   xmm13, STKTMP(16)
 657         paddq   xmm14, STKTMP(32)
 658
 659         // And, with that, we're done.
 660         movdqu  [rdi], xmm6
 661 #if ABI_WIN
 662         stfree  56
 663 #endif
 664         ret
 665
 666 #undef STKTMP
 667
 668 ENDFUNC
 669
 670 INTFUNC(mont4)
 671         // On entry, RDI points to the destination buffer holding a packed
 672         // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
 673         // expanded operand M.
 674         //
 675         // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
 676         // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
 677         // in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7, and XMM15
 678         // are clobbered (XMM4 included: it is used as scratch while forming
 679         // Y); the general-purpose registers are preserved.
 680   endprologue
 681
 682         movdqu  xmm7, [rdi]
 683
 684         // Calculate Y = W M.  Avoid the standard carry registers, because
 685         // we're setting something else up there.
 686         mulcore xmm7, 0,   xmm8,  xmm9,  xmm3,  xmm4,  xmm5,  xmm6
 687
 688         // Start expanding W back into the main carry registers...
 689         pxor    xmm15, xmm15
 690         movdqa  xmm12, xmm7
 691         movdqa  xmm14, xmm7
 692
 693         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
 694         accum                            xmm4,  xmm5,  xmm6
 695
 696         punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
 697         punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
 698
 699         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
 700         accum                            xmm5,  xmm6
 701
 702         pxor    xmm2, xmm2
 703         movdqa  xmm13, xmm12
 704         movdqa  xmm15, xmm14
 705
 706         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
 707         accum                            xmm6
 708
 709         punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
 710         punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
 711         punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
 712         punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
 713
 714         // That's lots of pieces.  Now we have to assemble the answer.
 715         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 716
 717         // Expand it.  XMM2 is still zero from the `pxor' above.
 718         movdqu  xmm5, [rbx]
 719         expand  xmm2, xmm10, xmm11
 720
 721         // Finish the calculation by adding the Montgomery product.
 722         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 723         propout xmm6, lo,                xmm12, xmm13
 724
 725         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 726         propout xmm7, lo,                xmm13, xmm14
 727
 728         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 729         propout xmm6, hi,                xmm14, xmm15
 730
 731         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 732         propout xmm7, hi,                xmm15, xmm12
 733
 734         punpckldq xmm6, xmm7
 735
 736         // And, with that, we're done.
 737         movdqu  [rdi], xmm6
 738         ret
 739 ENDFUNC
 740
 741 ///--------------------------------------------------------------------------
 742 /// Bulk multipliers.
 743
 744 FUNC(mpx_umul4_amd64_avx)
 745         .arch   .avx
 746         vzeroupper                      // zero the upper YMM halves before the SSE code runs
 747   endprologue
 748         .arch   pentium4                // no ret here: presumably runs on into the SSE2 entry below
 749 ENDFUNC
 750
 751 FUNC(mpx_umul4_amd64_sse2)
 752         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
 753         //                         const mpw *bv, const mpw *bvl);
 754
 755         // Establish the arguments and do initial setup.
 756         // (A `*' marks a register the ABI makes us preserve; we push it.)
 757         //                      sysv    win
 758         // inner loop dv        rdi     rdi*
 759         // inner loop av        rbx*    rbx*
 760         // outer loop dv        r10     rcx
 761         // outer loop bv        rcx     r9
 762         // av base              rsi     rdx
 763         // av limit             rdx     r8
 764         // bv limit             r8      r10
 765
 766 #if ABI_SYSV
 767 #  define DV r10
 768 #  define AV rsi
 769 #  define AVL rdx
 770 #  define BV rcx
 771 #  define BVL r8
 772
 773         pushreg rbx
 774   endprologue
 775
 776         mov     DV, rdi                 // outer-loop copy; rdi walks the inner loop
 777 #endif
 778
 779 #if ABI_WIN
 780 #  define DV rcx
 781 #  define AV rdx
 782 #  define AVL r8
 783 #  define BV r9
 784 #  define BVL r10
 785
 786         pushreg rbx
 787         pushreg rdi
 788         stalloc 160 + 8
 789
 790         savexmm xmm6,    0
 791         savexmm xmm7,   16
 792         savexmm xmm8,   32
 793         savexmm xmm9,   48
 794         savexmm xmm10,  64
 795         savexmm xmm11,  80
 796         savexmm xmm12,  96
 797         savexmm xmm13, 112
 798         savexmm xmm14, 128
 799         savexmm xmm15, 144
 800
 801   endprologue
 802
 803         mov     rdi, DV                 // inner-loop dv
 804         mov     BVL, [SP + 224]         // bvl: fifth argument, on the stack
 805 #endif
 806
 807         // Prepare for the first iteration.
 808         pxor    xmm0, xmm0              // zero, for `expand'
 809         movdqu  xmm10, [BV]             // bv[0]
 810         mov     rbx, AV                 // -> av[0]
 811         add     DV, 16                  // where the next pass will start
 812         add     BV, 16                  // next block of bv
 813         expand  xmm0, xmm10, xmm11
 814         call    mul4zc
 815         add     rbx, 16
 816         add     rdi, 16
 817         cmp     rbx, AVL                // all done?
 818         jae     8f
 819
 820         .p2align 4
 821         // Continue with the first iteration.
 822 0:      call    mul4
 823         add     rbx, 16
 824         add     rdi, 16
 825         cmp     rbx, AVL                // all done?
 826         jb      0b
 827
 828         // Write out the leftover carry.  There can be no tail here.
 829 8:      call    carryprop
 830         cmp     BV, BVL                 // more passes to do?
 831         jae     9f
 832
 833         .p2align 4
 834         // Set up for the next pass.
 835 1:      movdqu  xmm10, [BV]             // bv[i]
 836         mov     rdi, DV                 // -> dv[i]
 837         pxor    xmm0, xmm0              // zero, for `expand'
 838         expand  xmm0, xmm10, xmm11
 839         mov     rbx, AV                 // -> av[0]
 840         add     DV, 16
 841         add     BV, 16
 842         call    mla4zc
 843         add     rbx, 16
 844         add     rdi, 16
 845         cmp     rbx, AVL                // done yet?
 846         jae     8f
 847
 848         .p2align 4
 849         // Continue...
 850 0:      call    mla4
 851         add     rbx, 16
 852         add     rdi, 16
 853         cmp     rbx, AVL                // end of av yet?
 854         jb      0b
 855
 856         // Finish off this pass.  There was no tail on the previous pass, and
 857         // there can be none on this pass.
 858 8:      call    carryprop
 859         cmp     BV, BVL                 // more passes to do?
 860         jb      1b
 861
 862         // All over.
 863 9:
 864
 865 #if ABI_SYSV
 866         popreg  rbx
 867 #endif
 868
 869 #if ABI_WIN
 870         rstrxmm xmm6,    0
 871         rstrxmm xmm7,   16
 872         rstrxmm xmm8,   32
 873         rstrxmm xmm9,   48
 874         rstrxmm xmm10,  64
 875         rstrxmm xmm11,  80
 876         rstrxmm xmm12,  96
 877         rstrxmm xmm13, 112
 878         rstrxmm xmm14, 128
 879         rstrxmm xmm15, 144
 880
 881         stfree  160 + 8
 882         popreg  rdi
 883         popreg  rbx
 884 #endif
 885
 886         ret
 887
 888 #undef DV
 889 #undef AV
 890 #undef AVL
 891 #undef BV
 892 #undef BVL
 893
 894 ENDFUNC
 895
 896 FUNC(mpxmont_mul4_amd64_avx)
 897         .arch   .avx
 898         vzeroupper                      // zero the upper YMM halves before the SSE code runs
 899   endprologue
 900         .arch   pentium4                // no ret here: presumably runs on into the SSE2 entry below
 901 ENDFUNC
 902
 903 FUNC(mpxmont_mul4_amd64_sse2)
 904         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
 905         //                           const mpw *nv, size_t n, const mpw *mi);
 906
 907         // Establish the arguments and do initial setup.
 908         // (A `*' marks a register the ABI makes us preserve; we push it.)
 909         //                      sysv    win
 910         // inner loop dv        rdi     rdi*
 911         // inner loop av        rax     rax
 912         // inner loop nv        rbx*    rbx*
 913         // mi                   r9      r10
 914         // outer loop dv        r10     rcx
 915         // outer loop bv        rdx     r8
 916         // av base              rsi     rdx
 917         // av limit             r11     r11
 918         // bv limit             r8      r12*
 919         // nv base              rcx     r9
 920         // n                    r8      r12*
 921
 922 #if ABI_SYSV
 923 #  define DV r10
 924 #  define AV rsi
 925 #  define AVL r11
 926 #  define BV rdx
 927 #  define BVL r8
 928 #  define NV rcx
 929 #  define N r8
 930 #  define MI r9
 931
 932         pushreg rbx
 933   endprologue
 934
 935         mov     DV, rdi                 // outer-loop copy; rdi walks the inner loop
 936 #endif
 937
 938 #if ABI_WIN
 939 #  define DV rcx
 940 #  define AV rdx
 941 #  define AVL r11
 942 #  define BV r8
 943 #  define BVL r12
 944 #  define NV r9
 945 #  define N r12
 946 #  define MI r10
 947
 948         pushreg rbx
 949         pushreg rdi
 950         pushreg r12
 951         stalloc 160
 952
 953         savexmm xmm6,    0
 954         savexmm xmm7,   16
 955         savexmm xmm8,   32
 956         savexmm xmm9,   48
 957         savexmm xmm10,  64
 958         savexmm xmm11,  80
 959         savexmm xmm12,  96
 960         savexmm xmm13, 112
 961         savexmm xmm14, 128
 962         savexmm xmm15, 144
 963
 964   endprologue
 965
 966         mov     rdi, DV                 // inner-loop dv
 967         mov     N, [SP + 224]           // n: fifth argument, on the stack
 968         mov     MI, [SP + 232]          // mi: sixth argument, on the stack
 969 #endif
 970
 971         // Establish the expanded operands.
 972         pxor    xmm0, xmm0              // zero, for `expand'
 973         movdqu  xmm8, [BV]              // bv[0]
 974         movdqu  xmm10, [MI]             // mi
 975         expand  xmm0, xmm8, xmm9, xmm10, xmm11
 976
 977         // Set up the outer loop state and prepare for the first iteration.
 978         mov     rax, AV                 // -> U = av[0]
 979         mov     rbx, NV                 // -> X = nv[0]
 980         lea     AVL, [AV + 4*N]         // -> av[n/4] = av limit
 981         lea     BVL, [BV + 4*N]         // -> bv[n/4] = bv limit (shares N's register)
 982         add     BV, 16
 983         add     DV, 16
 984         call    mmul4
 985         add     rdi, 16
 986         add     rax, 16
 987         add     rbx, 16
 988         cmp     rax, AVL                // done already?
 989         jae     8f
 990
 991         .p2align 4
 992         // Complete the first inner loop.
 993 0:      call    dmul4
 994         add     rdi, 16
 995         add     rax, 16
 996         add     rbx, 16
 997         cmp     rax, AVL                // done yet?
 998         jb      0b
 999
1000         // Still have carries left to propagate.
1001         call    carryprop
1002         movd    [rdi + 16], xmm12       // one further word, from the low lane of XMM12
1003
1004         .p2align 4
1005         // Embark on the next iteration.  (There must be one: with only a single
1006         // four-word block we would have bailed above, to label 8.  Similarly,
1007         // subsequent iterations can fall into the inner loop immediately.)
1008 1:      pxor    xmm0, xmm0
1009         movdqu  xmm8, [BV]              // bv[i]
1010         movdqu  xmm10, [MI]             // mi
1011         mov     rdi, DV                 // -> Z = dv[i]
1012         mov     rax, AV                 // -> U = av[0]
1013         mov     rbx, NV                 // -> X = nv[0]
1014         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1015         add     BV, 16
1016         add     DV, 16
1017         call    mmla4
1018         add     rdi, 16
1019         add     rax, 16
1020         add     rbx, 16
1021
1022         .p2align 4
1023         // Complete the next inner loop.
1024 0:      call    dmla4
1025         add     rdi, 16
1026         add     rax, 16
1027         add     rbx, 16
1028         cmp     rax, AVL                // end of av yet?
1029         jb      0b
1030
1031         // Still have carries left to propagate, and they overlap the
1032         // previous iteration's final tail, so read that in and add it.
1033         movd    xmm0, [rdi]
1034         paddq   xmm12, xmm0
1035         call    carryprop
1036         movd    [rdi + 16], xmm12
1037
1038         // Back again, maybe.
1039         cmp     BV, BVL                 // more of bv to process?
1040         jb      1b
1041
1042         // All done.
1043 9:
1044
1045 #if ABI_SYSV
1046         popreg  rbx
1047 #endif
1048
1049 #if ABI_WIN
1050         rstrxmm xmm6,    0
1051         rstrxmm xmm7,   16
1052         rstrxmm xmm8,   32
1053         rstrxmm xmm9,   48
1054         rstrxmm xmm10,  64
1055         rstrxmm xmm11,  80
1056         rstrxmm xmm12,  96
1057         rstrxmm xmm13, 112
1058         rstrxmm xmm14, 128
1059         rstrxmm xmm15, 144
1060
1061         stfree  160
1062         popreg  r12
1063         popreg  rdi
1064         popreg  rbx
1065 #endif
1066
1067         ret
1068
1069         // First iteration was short.  Write out the carries and we're done.
1070         // (This could be folded into the main loop structure, but that would
1071         // penalize small numbers more.)
1072 8:      call    carryprop
1073         movd    [rdi + 16], xmm12
1074 #if ABI_SYSV
1075         popreg  rbx
1076         ret
1077 #endif
1078 #if ABI_WIN
1079         jmp     9b                      // share the longer Windows epilogue
1080 #endif
1081
1082 #undef DV
1083 #undef AV
1084 #undef AVL
1085 #undef BV
1086 #undef BVL
1087 #undef NV
1088 #undef N
1089 #undef MI
1090
1091 ENDFUNC
1092
1093 FUNC(mpxmont_redc4_amd64_avx)
1094         .arch   .avx
1095         vzeroupper                      // zero the upper YMM halves before the SSE code runs
1096   endprologue
1097         .arch   pentium4                // no ret here: presumably runs on into the SSE2 entry below
1098 ENDFUNC
1099
1100 FUNC(mpxmont_redc4_amd64_sse2)
1101         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1102         //                             size_t n, const mpw *mi);
1103
1104         // Establish the arguments and do initial setup.
1105         // (A `*' marks a register the ABI makes us preserve; we push it.)
1106         //                      sysv    win
1107         // inner loop dv        rdi     rdi*
1108         // dv limit             rax     rax
1109         // blocks-of-4 dv limit rsi     rdx
1110         // inner loop nv        rbx*    rbx*
1111         // mi                   r8      r10
1112         // outer loop dv        r10     rcx
1113         // outer loop dv limit  r11     r11
1114         // nv base              rdx     r8
1115         // nv limit             r9      r12*
1116         // n                    rcx     r9
1117         // c                    rcx     r9
1118
1119 #if ABI_SYSV
1120 #  define DVL rax
1121 #  define DVL4 rsi
1122 #  define MI r8
1123 #  define DV r10
1124 #  define DVLO r11
1125 #  define NV rdx
1126 #  define NVL r9
1127 #  define N rcx
1128 #  define C ecx
1129
1130         pushreg rbx
1131   endprologue
1132
1133         mov     DV, rdi                 // outer-loop copy; rdi walks the inner loop
1134 #endif
1135
1136 #if ABI_WIN
1137 #  define DVL rax
1138 #  define DVL4 rdx
1139 #  define MI r10
1140 #  define DV rcx
1141 #  define DVLO r11
1142 #  define NV r8
1143 #  define NVL r12
1144 #  define N r9
1145 #  define C r9d
1146
1147         pushreg rbx
1148         pushreg rdi
1149         pushreg r12
1150         stalloc 160
1151
1152         savexmm xmm6,    0
1153         savexmm xmm7,   16
1154         savexmm xmm8,   32
1155         savexmm xmm9,   48
1156         savexmm xmm10,  64
1157         savexmm xmm11,  80
1158         savexmm xmm12,  96
1159         savexmm xmm13, 112
1160         savexmm xmm14, 128
1161         savexmm xmm15, 144
1162
1163   endprologue
1164
1165         mov     rdi, DV                 // inner-loop dv
1166         mov     MI, [SP + 224]          // mi: fifth argument, on the stack
1167 #endif
1168
1169         // Establish the expanded operands and the blocks-of-4 dv limit.
1170         pxor    xmm0, xmm0              // zero, for `expand'
1171         mov     DVL, DVL4               // -> dv[n] = dv limit
1172         sub     DVL4, DV                // length of dv in bytes
1173         movdqu  xmm8, [MI]              // mi
1174         and     DVL4, ~15               // mask off the tail end
1175         expand  xmm0, xmm8, xmm9
1176         add     DVL4, DV                // find limit
1177
1178         // Set up the outer loop state and prepare for the first iteration.
1179         mov     rbx, NV                 // -> X = nv[0]
1180         lea     DVLO, [DV + 4*N]        // -> dv[n/4] = outer dv limit
1181         lea     NVL, [NV + 4*N]         // -> nv[n/4] = nv limit
1182         add     DV, 16
1183         call    mont4
1184         add     rbx, 16
1185         add     rdi, 16
1186         cmp     rbx, NVL                // done already?
1187         jae     8f
1188
1189         .p2align 4
1190         // Complete the first inner loop.
1191 5:      call    mla4
1192         add     rbx, 16
1193         add     rdi, 16
1194         cmp     rbx, NVL                // done yet?
1195         jb      5b
1196
1197         // Still have carries left to propagate.
1198 8:      carryadd
1199         psllq   xmm15, 16
1200         pslldq  xmm15, 8
1201         paddq   xmm14, xmm15
1202         call    carryprop
1203         movd    C, xmm12                // leftover carry; this overwrites n
1204         add     rdi, 16
1205         cmp     rdi, DVL4
1206         jae     7f
1207
1208         .p2align 4
1209         // Continue carry propagation until the end of the buffer.
1210 0:      add     [rdi], C
1211         mov     C, 0                    // preserves flags
1212         adcd    [rdi + 4], 0
1213         adcd    [rdi + 8], 0
1214         adcd    [rdi + 12], 0
1215         adc     C, 0                    // carry out of this block
1216         add     rdi, 16
1217         cmp     rdi, DVL4
1218         jb      0b
1219
1220         // Deal with the tail end, a word at a time (always at least one).
1221 7:      add     [rdi], C
1222         mov     C, 0                    // preserves flags
1223         adc     C, 0                    // collect carry before `add' below rewrites CF
1224         add     rdi, 4
1225         cmp     rdi, DVL
1226         jb      7b
1227
1228         // All done for this iteration.  Start the next.  (This must have at
1229         // least one follow-on iteration, or we'd not have started this outer
1230         // loop.)
1231 8:      mov     rdi, DV                 // -> Z = dv[i]
1232         mov     rbx, NV                 // -> X = nv[0]
1233         cmp     rdi, DVLO               // all done yet?
1234         jae     9f
1235         add     DV, 16
1236         call    mont4
1237         add     rdi, 16
1238         add     rbx, 16
1239         jmp     5b
1240
1241         // All over.
1242 9:
1243
1244 #if ABI_SYSV
1245         popreg  rbx
1246 #endif
1247
1248 #if ABI_WIN
1249         rstrxmm xmm6,    0
1250         rstrxmm xmm7,   16
1251         rstrxmm xmm8,   32
1252         rstrxmm xmm9,   48
1253         rstrxmm xmm10,  64
1254         rstrxmm xmm11,  80
1255         rstrxmm xmm12,  96
1256         rstrxmm xmm13, 112
1257         rstrxmm xmm14, 128
1258         rstrxmm xmm15, 144
1259
1260         stfree  160
1261         popreg  r12
1262         popreg  rdi
1263         popreg  rbx
1264 #endif
1265
1266         ret
1267
1268 #undef DVL
1269 #undef DVL4
1270 #undef MI
1271 #undef DV
1272 #undef DVLO
1273 #undef NV
1274 #undef NVL
1275 #undef N
1276 #undef C
1277
1278 ENDFUNC
1279
1280 ///--------------------------------------------------------------------------
1281 /// Testing and performance measurement.
1282
1283 #ifdef TEST_MUL4
1284
1285 #if ABI_SYSV
1286 #  define ARG0 rdi
1287 #  define ARG1 rsi
1288 #  define ARG2 rdx
1289 #  define ARG3 rcx
1290 #  define ARG4 r8
1291 #  define ARG5 r9
1292 #  define ARG6 STKARG(0)
1293 #  define ARG7 STKARG(1)
1294 #  define ARG8 STKARG(2)
1295 #  define STKARG_OFFSET 16
1296 #endif
1297 #if ABI_WIN
1298 #  define ARG0 rcx
1299 #  define ARG1 rdx
1300 #  define ARG2 r8
1301 #  define ARG3 r9
1302 #  define ARG4 STKARG(0)
1303 #  define ARG5 STKARG(1)
1304 #  define ARG6 STKARG(2)
1305 #  define ARG7 STKARG(3)
1306 #  define ARG8 STKARG(4)
1307 #  define STKARG_OFFSET 224
1308 #endif
1309 #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1310 // Register map: first column = where testprologue puts each value; the rest = where it arrives (stkN = STKARG(N)).
1311 //                sysv                          win
1312 //                dmul  smul  mmul  mont        dmul  smul  mmul  mont
1313 // A    rax
1314 // D    rdx
1315 // z    rdi       rdi   rdi   rdi    rdi        rcx   rcx   rcx    rcx
1316 // c    rcx       rsi   rsi   rsi    rsi        rdx   rdx   rdx    rdx
1317 // y    r10       --    --    rdx    rdx        --    --    r8     r8
1318 // u    r11       rdx   --    rcx    --         r8    --    r9     --
1319 // x    rbx       rcx   rdx   r8     rcx        r9    r8    stk0   r9
1320 // vv   xmm8/9    r8    --    r9     r8         stk0  --    stk1   stk0
1321 // yy   xmm10/11  r9    rcx   stk0   --         stk1  r9    stk2   --
1322 // n    r8        stk0  r8    stk1   r9         stk2  stk0  stk3   stk1
1323 // cyv  r9        stk1  r9    stk2   stk0       stk3  stk1  stk4   stk2
1324
1325 .macro  cysetup v, n
1326         rdtsc                           // EDX:EAX = time-stamp counter
1327         shl     rdx, 32
1328         or      rax, rdx                // RAX = full 64-bit count
1329         mov     [\v + 8*\n - 8], rax    // v[n - 1] = start count
1330 .endm
1331
1332 .macro  cystore v, n
1333         rdtsc                           // EDX:EAX = time-stamp counter
1334         shl     rdx, 32
1335         or      rax, rdx                // RAX = full 64-bit count
1336         sub     rax, [\v + 8*\n - 8]    // elapsed = now - start count
1337         mov     [\v + 8*\n - 8], rax    // v[n - 1] = elapsed count
1338         dec     \n                      // sets ZF when the last slot has been filled
1339 .endm
1340
1341 .macro  testprologue mode
1342         pushreg rbx
1343 #if ABI_SYSV
1344   endprologue
1345   .ifeqs "\mode", "dmul"
1346         mov     rbx, rcx                // x
1347         movdqu  xmm8, [r8]              // vv (packed; expanded below)
1348         movdqu  xmm10, [r9]             // yy (packed; expanded below)
1349         mov     r8d, STKARG(0)          // n
1350         mov     r9, STKARG(1)           // cyv
1351         mov     r11, rdx                // u
1352         mov     rcx, rsi                // c
1353   .endif
1354   .ifeqs "\mode", "smul"
1355         mov     rbx, rdx                // x
1356         movdqu  xmm10, [rcx]            // yy (packed; expanded below)
1357         mov     rcx, rsi                // c
1358   .endif
1359   .ifeqs "\mode", "mmul"
1360         mov     rax, STKARG(0)          // -> yy
1361         mov     rbx, r8                 // x
1362         movdqu  xmm8, [r9]              // vv (packed; expanded below)
1363         movdqu  xmm10, [rax]            // yy (packed; expanded below)
1364         mov     r8d, STKARG(1)          // n
1365         mov     r9, STKARG(2)           // cyv
1366         mov     r10, rdx                // y
1367         mov     r11, rcx                // u
1368         mov     rcx, rsi                // c
1369   .endif
1370   .ifeqs "\mode", "mont"
1371         mov     rbx, rcx                // x
1372         movdqu  xmm8, [r8]              // vv (packed; expanded below)
1373         mov     r8d, r9d                // n
1374         mov     r9, STKARG(0)           // cyv
1375         mov     r10, rdx                // y
1376         mov     rcx, rsi                // c
1377   .endif
1378 #endif
1379 #if ABI_WIN
1380         pushreg rdi
1381         stalloc 168
1382         savexmm xmm6,    0
1383         savexmm xmm7,   16
1384         savexmm xmm8,   32
1385         savexmm xmm9,   48
1386         savexmm xmm10,  64
1387         savexmm xmm11,  80
1388         savexmm xmm12,  96
1389         savexmm xmm13, 112
1390         savexmm xmm14, 128
1391         savexmm xmm15, 144
1392   endprologue
1393   .ifeqs "\mode", "dmul"
1394         mov     r10, STKARG(0)          // -> vv
1395         mov     r11, STKARG(1)          // -> yy
1396         mov     rdi, rcx                // z
1397         mov     rcx, rdx                // c
1398         mov     rbx, r9                 // x
1399         movdqu  xmm8, [r10]             // vv (packed; expanded below)
1400         movdqu  xmm10, [r11]            // yy (packed; expanded below)
1401         mov     r11, r8                 // u
1402         mov     r8d, STKARG(2)          // n
1403         mov     r9, STKARG(3)           // cyv
1404   .endif
1405   .ifeqs "\mode", "smul"
1406         mov     rdi, rcx                // z
1407         mov     rcx, rdx                // c
1408         mov     rbx, r8                 // x
1409         movdqu  xmm10, [r9]             // yy (packed; expanded below)
1410         mov     r8d, STKARG(0)          // n
1411         mov     r9, STKARG(1)           // cyv
1412   .endif
1413   .ifeqs "\mode", "mmul"
1414         mov     r10, STKARG(1)          // -> vv
1415         mov     r11, STKARG(2)          // -> yy
1416         mov     rdi, rcx                // z
1417         mov     rcx, rdx                // c
1418         mov     rbx, STKARG(0)          // x
1419         movdqu  xmm8, [r10]             // vv (packed; expanded below)
1420         movdqu  xmm10, [r11]            // yy (packed; expanded below)
1421         mov     r10, r8                 // y
1422         mov     r11, r9                 // u
1423         mov     r8d, STKARG(3)          // n
1424         mov     r9, STKARG(4)           // cyv
1425   .endif
1426   .ifeqs "\mode", "mont"
1427         mov     r10, STKARG(0)          // -> vv
1428         mov     rdi, rcx                // z
1429         mov     rcx, rdx                // c
1430         mov     rbx, r9                 // x
1431         movdqu  xmm8, [r10]             // vv (packed; expanded below)
1432         mov     r10, r8                 // y
1433         mov     r8d, STKARG(1)          // n
1434         mov     r9, STKARG(2)           // cyv
1435   .endif
1436 #endif
1437
1438         pxor    xmm0, xmm0              // zero, for `expand'
1439   .ifeqs "\mode", "dmul"
1440         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1441   .endif
1442   .ifeqs "\mode", "smul"
1443         expand  xmm0, xmm10, xmm11
1444   .endif
1445   .ifeqs "\mode", "mmul"
1446         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1447   .endif
1448   .ifeqs "\mode", "mont"
1449         expand  xmm0, xmm8, xmm9
1450   .endif
1451 .endm
1452
1453 .macro  testepilogue
1454 #if ABI_WIN
1455         rstrxmm xmm6,    0
1456         rstrxmm xmm7,   16
1457         rstrxmm xmm8,   32
1458         rstrxmm xmm9,   48
1459         rstrxmm xmm10,  64
1460         rstrxmm xmm11,  80
1461         rstrxmm xmm12,  96
1462         rstrxmm xmm13, 112
1463         rstrxmm xmm14, 128
1464         rstrxmm xmm15, 144
1465         stfree  168
1466         popreg  rdi
1467 #endif
1468         popreg  rbx
1469         ret                             // undoes testprologue, then returns to the test driver
1470 .endm
1471
1472 .macro  testldcarry
1473         movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0), read via the c pointer in RCX
1474         movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
1475         movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
1476 .endm
1477
1478 .macro  testtop u=nil
1479         .p2align 4
1480 0:
1481         cysetup r9, r8
1482   .ifnes "\u", "nil"
1483         mov     rax, \u                 // reload RAX, which rdtsc in cysetup overwrote
1484   .endif
1485 .endm
1486
1487 .macro  testtail
1488         cystore r9, r8
1489         jnz     0b                      // loop to testtop until cystore has counted n down to zero
1490 .endm
1491
1492 .macro  testcarryout
1493         movdqu  [rcx +  0], xmm12       // write the three carry registers back via RCX
1494         movdqu  [rcx + 16], xmm13
1495         movdqu  [rcx + 32], xmm14
1496 .endm
1497
1498 FUNC(test_dmul4)
1499         testprologue dmul
1500         testldcarry
1501         testtop r11
1502         call    dmul4                   // routine under test; timed once per pass, n passes
1503         testtail
1504         testcarryout
1505         testepilogue
1506 ENDFUNC
1507
1508 FUNC(test_dmla4)
1509         testprologue dmul
1510         testldcarry
1511         testtop r11
1512         call    dmla4                   // routine under test; timed once per pass, n passes
1513         testtail
1514         testcarryout
1515         testepilogue
1516 ENDFUNC
1517
1518 FUNC(test_mul4)
1519         testprologue smul
1520         testldcarry
1521         testtop nil
1522         call    mul4                    // routine under test; timed once per pass, n passes
1523         testtail
1524         testcarryout
1525         testepilogue
1526 ENDFUNC
1527
1528 FUNC(test_mul4zc)
1529         testprologue smul
1530         testldcarry
1531         testtop nil
1532         call    mul4zc                  // routine under test; timed once per pass, n passes
1533         testtail
1534         testcarryout
1535         testepilogue
1536 ENDFUNC
1537
1538 FUNC(test_mla4)
1539         testprologue smul
1540         testldcarry
1541         testtop nil
1542         call    mla4                    // routine under test; timed once per pass, n passes
1543         testtail
1544         testcarryout
1545         testepilogue
1546 ENDFUNC
1547
1548 FUNC(test_mla4zc)
1549         testprologue smul
1550         testldcarry
1551         testtop nil
1552         call    mla4zc                  // routine under test; timed once per pass, n passes
1553         testtail
1554         testcarryout
1555         testepilogue
1556 ENDFUNC
1557
1558 FUNC(test_mmul4)
1559         testprologue mmul
1560         testtop r11
1561         call    mmul4                   // routine under test; timed once per pass, n passes
1562         testtail
1563         movdqu  [r10 +  0], xmm10       // hand XMM10/XMM11 back through the y pointer
1564         movdqu  [r10 + 16], xmm11
1565         testcarryout
1566         testepilogue
1567 ENDFUNC
1568
1569 FUNC(test_mmla4)
1570         testprologue mmul
1571         testtop r11
1572         call    mmla4                   // routine under test; timed once per pass, n passes
1573         testtail
1574         movdqu  [r10 +  0], xmm10       // hand XMM10/XMM11 back through the y pointer
1575         movdqu  [r10 + 16], xmm11
1576         testcarryout
1577         testepilogue
1578 ENDFUNC
1579
1580 FUNC(test_mont4)
1581         testprologue mont
1582         testtop
1583         call    mont4                   // routine under test; timed once per pass, n passes
1584         testtail
1585         movdqu  [r10 +  0], xmm10       // hand XMM10/XMM11 back through the y pointer
1586         movdqu  [r10 + 16], xmm11
1587         testcarryout
1588         testepilogue
1589 ENDFUNC
1590
1591 #endif
1592
1593 ///----- That's all, folks --------------------------------------------------