chiark - git - mdw - catacomb/blob - math/mpx-mul4-amd64-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Prologue.
  35
  36         .arch   pentium4
  37         .text
  38
  39 ///--------------------------------------------------------------------------
  40 /// Theory.
  41 ///
  42 /// We define a number of primitive fixed-size multipliers from which we can
  43 /// construct more general variable-length multipliers.
  44 ///
  45 /// The basic trick is the same throughout.  In an operand-scanning
  46 /// multiplication, the inner multiplication loop multiplies a
  47 /// multiple-precision operand by a single precision factor, and adds the
  48 /// result, appropriately shifted, to the result.  A `finely integrated
  49 /// operand scanning' implementation of Montgomery multiplication also adds
  50 /// the product of a single-precision `Montgomery factor' and the modulus,
  51 /// calculated in the same pass.  The more common `coarsely integrated
  52 /// operand scanning' alternates main multiplication and Montgomery passes,
  53 /// which requires additional carry propagation.
  54 ///
  55 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  56 /// the factors remains constant throughout the operation, so we can afford
  57 /// to take a little time to preprocess it.  The transformation we perform is
  58 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  59 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  60 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  61 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  62 /// operands, as follows.
  63 ///
  64 ///     Offset     0       4        8      12
  65 ///        0    v'_0    v'_1    v''_0   v''_1
  66 ///       16    v'_2    v'_3    v''_2   v''_3
  67 ///
  68 /// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
  69 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  70 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  71 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  72 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  73 /// many products together before we must deal with carrying; it also allows
  74 /// for some calculations to be performed on the above expanded form.
  75 ///
  76 /// ...
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  91 /// zeroed becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
  97         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
  98         // of the product in registers D0, D1, D2, D3.
  99         pshufd  \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, \slo               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, \shi               // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106   .ifnes "\d1", "nil"
 107         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 108   .endif
 109   .ifnes "\d2", "nil"
 110         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 111   .endif
 112   .ifnes "\d3", "nil"
 113         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 114   .endif
 115   .ifnes "\d1", "nil"
 116         pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
 117   .endif
 118   .ifnes "\d3", "nil"
 119         pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
 120   .endif
 121   .ifnes "\d2", "nil"
 122         pmuludq \d2, \shi               // (r_i s'_2, r_i s''_2)
 123   .endif
 124         pmuludq \d0, \slo               // (r_i s'_0, r_i s''_0)
 125 .endm
 126
 127 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 128         // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
 129         // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
 130         // updating that register.
 131         paddq   \c0, xmm0
 132   .ifnes "\c1", "nil"
 133         paddq   \c1, xmm1
 134   .endif
 135   .ifnes "\c2", "nil"
 136         paddq   \c2, xmm2
 137   .endif
 138   .ifnes "\c3", "nil"
 139         paddq   \c3, xmm3
 140   .endif
 141 .endm
 142
 143 .macro  mulacc  r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
 144         // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
 145         // carry registers C0, C1, C2, C3.  If Z3P is `t' then C3 notionally
 146         // contains zero, but needs clearing; in practice, we store the
 147         // product directly rather than attempting to add.  On completion,
 148         // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
 149   .ifeqs "\z3p", "t"
 150         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
 151         accum                       \c0,  \c1,  \c2
 152   .else
 153         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
 154         accum                       \c0,  \c1,  \c2,  \c3
 155   .endif
 156 .endm
 157
 158 .macro  propout d, pos, c, cc=nil
 159         // Calculate an output word from C, and store it at POS in D;
 160         // propagate carries out from C to CC in preparation for a rotation
 161         // of the carry registers.  D is an XMM register; the POS is either
 162         // `lo' or `hi' according to whether the output word should be in
 163         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 164         // completion, XMM3 is clobbered.  If CC is `nil', then the
 165         // contribution which would have been added to it is left in C.
 166         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 167         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 168         pslldq  xmm3, 2                 // (t b, 0)
 169         paddq   \c, xmm3                // (c' + t b, c'')
 170   .ifeqs "\pos", "lo"
 171         movdqa  \d, \c
 172   .else
 173         punpckldq \d, \c
 174   .endif
 175         psrlq   \c, 32                  // floor(c/B)
 176   .ifnes "\cc", "nil"
 177         paddq   \cc, \c                 // propagate up
 178   .endif
 179 .endm
 180
 181 .macro endprop d, pos, c, t
 182         // On entry, C contains a carry register.  On exit, the low 32 bits
 183         // of the value represented in C are written at POS in D, and the
 184         // remaining bits are left at the bottom of T.
 185         movdqa  \t, \c
 186         psllq   \t, 16                  // (?, c'' b)
 187         pslldq  \c, 8                   // (0, c')
 188         paddq   \t, \c                  // (?, c' + c'' b)
 189         psrldq  \t, 8                   // c' + c'' b
 190   .ifeqs "\pos", "lo"
 191         movdqa  \d, \t
 192   .else
 193         punpckldq \d, \t
 194   .endif
 195         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 196 .endm
 197
 198 .macro  expand  z, a, b, c=nil, d=nil
 199         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 200         // exit, A:B and C:D together hold the same values in expanded
 201         // form.  If C is `nil', then only expand A to A:B.
 202         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 203   .ifnes "\c", "nil"
 204         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 205   .endif
 206         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 207         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 208   .ifnes "\c", "nil"
 209         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 210         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 211   .endif
 212         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 213         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 214   .ifnes "\c", "nil"
 215         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 216         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 217   .endif
 218 .endm
 219
 220 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 221         // On entry, C0, C1, C2, C3 are carry registers representing a value
 222         // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
 223         // C3, T, and U are clobbered; and the high bits of Y are stored in
 224         // HI, if this is not `nil'.
 225
 226         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 227         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 228         // them into the 32-bit-aligned pieces above and below.  But before
 229         // we can do that, we must gather them together.
 230         movdqa  \t, \c0
 231         movdqa  \u, \c1
 232         punpcklqdq \t, \c2              // (y'_0, y'_2)
 233         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 234         punpcklqdq \u, \c3              // (y'_1, y'_3)
 235         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 236
 237         // Now split the double-prime pieces.  The high (up to) 48 bits will
 238         // go up; the low 16 bits go down.
 239         movdqa  \c2, \c0
 240         movdqa  \c3, \c1
 241         psllq   \c2, 48
 242         psllq   \c3, 48
 243         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 244         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 245         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 246         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 247   .ifnes "\hi", "nil"
 248         movdqa  \hi, \c1
 249   .endif
 250         pslldq  \c1, 8                  // high part of (0, y''_1)
 251
 252         paddq   \t, \c2                 // propagate down
 253         paddq   \u, \c3
 254         paddq   \t, \c1                 // and up: (y_0, y_2)
 255         paddq   \u, \c0                 // (y_1, y_3)
 256   .ifnes "\hi", "nil"
 257         psrldq  \hi, 8                  // high part of (y''_3, 0)
 258   .endif
 259
 260         // Finally extract the answer.  This complicated dance is better than
 261         // storing to memory and loading, because the piecemeal stores
 262         // inhibit store forwarding.
 263         movdqa  \c3, \t                 // (y_0, y_1)
 264         movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
 265         psrldq  \t, 8                   // (y_2, 0)
 266         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 267         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 268         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 269         psrldq  \u, 8                   // (y_3, 0)
 270         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
 271         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
 272         punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 275   .ifnes "\hi", "nil"
 276         movdqa  \t, \c3
 277         pxor    \u, \u
 278   .endif
 279         punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
 280   .ifnes "\hi", "nil"
 281         psrlq   \t, 32                  // very high bits of y
 282         paddq   \hi, \t
 283         punpcklqdq \hi, \u              // carry up
 284   .endif
 285         punpckldq \lo, \c1              // y mod B^4
 286 .endm
 287
 288 .macro  carryadd
 289         // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
 290         // hold the incoming carry registers c0, c1, and c2 representing a
 291         // carry-in C.
 292         //
 293         // On exit, the carry registers, including XMM15, are updated to hold
 294         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 295         // registers are preserved.
 296         movd    xmm0, [rdi +  0]        // (a_0, 0)
 297         movd    xmm1, [rdi +  4]        // (a_1, 0)
 298         movd    xmm2, [rdi +  8]        // (a_2, 0)
 299         movd    xmm15, [rdi + 12]       // (a_3, 0)
 300         paddq   xmm12, xmm0             // (c'_0 + a_0, c''_0)
 301         paddq   xmm13, xmm1             // (c'_1 + a_1, c''_1)
 302         paddq   xmm14, xmm2             // (c'_2 + a_2, c''_2 + a_3 b)
 303 .endm
 304
 305 ///--------------------------------------------------------------------------
 306 /// Primitive multipliers and related utilities.
 307
 308 INTFUNC(carryprop)
 309         // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
 310         // expanded form.  Store the low 128 bits of the represented carry to
 311         // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
 312         // in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13 and
 313         // XMM14 are clobbered.
 314   endprologue
 315
 316         propout xmm0, lo, xmm12, xmm13
 317         propout xmm1, lo, xmm13, xmm14
 318         propout xmm0, hi, xmm14, nil
 319         endprop xmm1, hi, xmm14, xmm12
 320         punpckldq xmm0, xmm1
 321         movdqu  [rdi], xmm0
 322
 323         ret
 324
 325 ENDFUNC
 326
 327 INTFUNC(dmul4)
 328         // On entry, RDI points to the destination buffer; RAX and RBX point
 329         // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 330         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 331         // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
 332         //
 333         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 334         // [RDI], and update the carry registers with the carry out.  The
 335         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 336         // registers are preserved.
 337   endprologue
 338
 339         movdqu  xmm4, [rax]
 340         movdqu  xmm5, [rbx]
 341
 342         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15, t
 343         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 344         propout xmm6, lo,                xmm12, xmm13
 345
 346         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 347         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 348         propout xmm7, lo,                xmm13, xmm14
 349
 350         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 351         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 352         propout xmm6, hi,                xmm14, xmm15
 353
 354         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 355         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 356         propout xmm7, hi,                xmm15, xmm12
 357
 358         punpckldq xmm6, xmm7
 359         movdqu  [rdi], xmm6
 360
 361         ret
 362
 363 ENDFUNC
 364
 365 INTFUNC(dmla4)
 366         // On entry, RDI points to the destination buffer, which also
 367         // contains an addend A to accumulate; RAX and RBX point to the
 368         // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 369         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 370         // incoming carry registers c0, c1, and c2 representing a carry-in C;
 371         // c3 is assumed to be zero.
 372         //
 373         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 374         // [RDI], and update the carry registers with the carry out.  The
 375         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 376         // registers are preserved.
 377   endprologue
 378
 379         movdqu  xmm4, [rax]
 380         movdqu  xmm5, [rbx]
 381         carryadd
 382
 383         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 384         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 385         propout xmm6, lo,                xmm12, xmm13
 386
 387         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 388         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 389         propout xmm7, lo,                xmm13, xmm14
 390
 391         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 392         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 393         propout xmm6, hi,                xmm14, xmm15
 394
 395         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 396         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 397         propout xmm7, hi,                xmm15, xmm12
 398
 399         punpckldq xmm6, xmm7
 400         movdqu  [rdi], xmm6
 401
 402         ret
 403
 404 ENDFUNC
 405
 406 INTFUNC(mul4zc)
 407         // On entry, RDI points to the destination buffer; RBX points to a
 408         // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
 409         //
 410         // On exit, we write the low 128 bits of the product X Y to [RDI],
 411         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 412         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 413         // general-purpose registers are preserved.
 414   endprologue
 415
 416         movdqu  xmm5, [rbx]
 417
 418         mulcore xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 419         propout xmm6, lo,                xmm12, xmm13
 420
 421         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 422         propout xmm7, lo,                xmm13, xmm14
 423
 424         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 425         propout xmm6, hi,                xmm14, xmm15
 426
 427         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 428         propout xmm7, hi,                xmm15, xmm12
 429
 430         punpckldq xmm6, xmm7
 431         movdqu  [rdi], xmm6
 432
 433         ret
 434
 435 ENDFUNC
 436
 437 INTFUNC(mul4)
 438         // On entry, RDI points to the destination buffer; RBX points to a
 439         // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
 440         // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
 441         // c2, representing a carry-in C; c3 is assumed to be zero.
 442         //
 443         // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
 444         // and update the carry registers with the carry out.  The registers
 445         // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 446         // general-purpose registers are preserved.
 447   endprologue
 448
 449         movdqu  xmm5, [rbx]
 450
 451         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
 452         propout xmm6, lo,                xmm12, xmm13
 453
 454         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 455         propout xmm7, lo,                xmm13, xmm14
 456
 457         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 458         propout xmm6, hi,                xmm14, xmm15
 459
 460         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 461         propout xmm7, hi,                xmm15, xmm12
 462
 463         punpckldq xmm6, xmm7
 464         movdqu  [rdi], xmm6
 465
 466         ret
 467
 468 ENDFUNC
 469
 470 INTFUNC(mla4zc)
 471         // On entry, RDI points to the destination buffer, which also
 472         // contains an addend A to accumulate; RBX points to a packed operand
 473         // X; and XMM10/XMM11 points to an expanded operand Y.
 474         //
 475         // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
 476         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 477         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 478         // general-purpose registers are preserved.
 479   endprologue
 480
 481         movdqu  xmm5, [rbx]
 482         movd    xmm12, [rdi +  0]
 483         movd    xmm13, [rdi +  4]
 484         movd    xmm14, [rdi +  8]
 485         movd    xmm15, [rdi + 12]
 486
 487         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 488         propout xmm6, lo,                xmm12, xmm13
 489
 490         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 491         propout xmm7, lo,                xmm13, xmm14
 492
 493         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 494         propout xmm6, hi,                xmm14, xmm15
 495
 496         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 497         propout xmm7, hi,                xmm15, xmm12
 498
 499         punpckldq xmm6, xmm7
 500         movdqu  [rdi], xmm6
 501
 502         ret
 503
 504 ENDFUNC
 505
 506 INTFUNC(mla4)
 507         // On entry, RDI points to the destination buffer, which also
 508         // contains an addend A to accumulate; RBX points to a packed operand
 509         // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
 510         // XMM14 hold the incoming carry registers c0, c1, and c2,
 511         // representing a carry-in C; c3 is assumed to be zero.
 512         //
 513         // On exit, we write the low 128 bits of the sum A + C + X Y to
 514         // [RDI], and update the carry registers with the carry out.  The
 515         // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 516         // general-purpose registers are preserved.
 517   endprologue
 518
 519         movdqu  xmm5, [rbx]
 520         carryadd
 521
 522         mulacc  xmm5, 0,    xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 523         propout xmm6, lo,                xmm12, xmm13
 524
 525         mulacc  xmm5, 1,    xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 526         propout xmm7, lo,                xmm13, xmm14
 527
 528         mulacc  xmm5, 2,    xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 529         propout xmm6, hi,                xmm14, xmm15
 530
 531         mulacc  xmm5, 3,    xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 532         propout xmm7, hi,                xmm15, xmm12
 533
 534         punpckldq xmm6, xmm7
 535         movdqu  [rdi], xmm6
 536
 537         ret
 538
 539 ENDFUNC
 540
 541 INTFUNC(mmul4)
 542         // On entry, RDI points to the destination buffer; RAX and RBX point
 543         // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
 544         // the expanded operands V and M.  The stack pointer must be 8 modulo 16
 545         // (as usual for AMD64 ABIs).
 546         //
 547         // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
 548         // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
 549         // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7, and
 550         // XMM15 are clobbered; the general-purpose registers are preserved.
 551         movdqu  xmm4, [rax]
 552 #if ABI_WIN
 553         stalloc 48 + 8                  // space for the carries
 554 #endif
 555   endprologue
 556
 557         // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
 558         // for later.
 559         mulcore xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 560         propout xmm7, lo,                xmm12, xmm13
 561         jmp     5f
 562
 563 ENDFUNC
 564
 565 INTFUNC(mmla4)
 566         // On entry, RDI points to the destination buffer, which also
 567         // contains an addend A to accumulate; RAX and RBX point to the
 568         // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
 569         // expanded operands V and M.  The stack pointer must be 8 modulo 16
 570         // (as usual for AMD64 ABIs).
 571         //
 572         // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
 573         // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
 574         // remaining carry in XMM12, XMM13, and XMM14.  The registers
 575         // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
 576         // are preserved.
 577         movdqu  xmm4, [rax]
 578 #if ABI_WIN
 579         stalloc 48 + 8                  // space for the carries
 580 #  define STKTMP(i) [rsp + i]
 581 #endif
 582 #if ABI_SYSV
 583 #  define STKTMP(i) [rsp + i - 48 - 8]  // use red zone
 584 #endif
 585   endprologue
 586
 587         movd    xmm12, [rdi +  0]
 588         movd    xmm13, [rdi +  4]
 589         movd    xmm14, [rdi +  8]
 590         movd    xmm15, [rdi + 12]
 591
 592         // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
 593         // for later.
 594         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 595         propout xmm7, lo,                xmm12, xmm13
 596
 597 5:      mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 598         propout xmm6, lo,                xmm13, xmm14
 599
 600         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 601         propout xmm7, hi,                xmm14, xmm15
 602
 603         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 604         propout xmm6, hi,                xmm15, xmm12
 605
 606         // Prepare W, and stash carries for later.
 607         punpckldq xmm7, xmm6
 608         movdqa  STKTMP( 0), xmm12
 609         movdqa  STKTMP(16), xmm13
 610         movdqa  STKTMP(32), xmm14
 611
 612         // Calculate Y = W M.  We just about have enough spare registers to
 613         // make this work.
 614         mulcore xmm7, 0,   xmm10, xmm11, xmm3,  xmm4,  xmm5,  xmm6
 615
 616         // Start expanding W back into the main carry registers...
 617         pxor    xmm15, xmm15
 618         movdqa  xmm12, xmm7
 619         movdqa  xmm14, xmm7
 620
 621         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
 622         accum                            xmm4,  xmm5,  xmm6
 623
 624         punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
 625         punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
 626
 627         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
 628         accum                            xmm5,  xmm6
 629
 630         pxor    xmm2, xmm2
 631         movdqa  xmm13, xmm12
 632         movdqa  xmm15, xmm14
 633
 634         mulcore xmm7, 3,   xmm10, xmm11, xmm0
 635         accum                            xmm6
 636
 637         punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
 638         punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
 639         punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
 640         punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
 641
 642         // That's lots of pieces.  Now we have to assemble the answer.
 643         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 644
 645         // Expand it.
 646         movdqu  xmm5, [rbx]
 647         expand  xmm2, xmm10, xmm11
 648
 649         // Finish the calculation by adding the Montgomery product.
 650         mulacc  xmm5, 0    xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 651         propout xmm6, lo,                xmm12, xmm13
 652
 653         mulacc  xmm5, 1    xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 654         propout xmm7, lo,                xmm13, xmm14
 655
 656         mulacc  xmm5, 2    xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 657         propout xmm6, hi,                xmm14, xmm15
 658
 659         mulacc  xmm5, 3    xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 660         propout xmm7, hi,                xmm15, xmm12
 661
 662         punpckldq xmm6, xmm7
 663
 664         // Add add on the carry we calculated earlier.
 665         paddq   xmm12, STKTMP( 0)
 666         paddq   xmm13, STKTMP(16)
 667         paddq   xmm14, STKTMP(32)
 668
 669         // And, with that, we're done.
 670         movdqu  [rdi], xmm6
 671 #if ABI_WIN
 672         stfree  56
 673 #endif
 674         ret
 675
 676 #undef STKTMP
 677
 678 ENDFUNC
 679
 680 INTFUNC(mont4)
 681         // On entry, RDI points to the destination buffer holding a packed
 682         // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
 683         // expanded operand M.
 684         //
 685         // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
 686         // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
 687         // in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
 688         // and XMM15 are clobbered; the general-purpose registers are
 689         // preserved.
 690   endprologue
 691
 692         movdqu  xmm7, [rdi]
 693
 694         // Calculate Y = W M.  Avoid the standard carry registers, because
 695         // we're setting something else up there.
 696         mulcore xmm7, 0,   xmm8,  xmm9,  xmm3,  xmm4,  xmm5,  xmm6
 697
 698         // Start expanding W back into the main carry registers...
 699         pxor    xmm15, xmm15
 700         movdqa  xmm12, xmm7
 701         movdqa  xmm14, xmm7
 702
 703         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
 704         accum                            xmm4,  xmm5,  xmm6
 705
 706         punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
 707         punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
 708
 709         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
 710         accum                            xmm5,  xmm6
 711
 712         pxor    xmm2, xmm2
 713         movdqa  xmm13, xmm12
 714         movdqa  xmm15, xmm14
 715
 716         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
 717         accum                            xmm6
 718
 719         punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
 720         punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
 721         punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
 722         punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
 723
 724         // That's lots of pieces.  Now we have to assemble the answer.
 725         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 726
 727         // Expand it.
 728         movdqu  xmm5, [rbx]
 729         expand  xmm2, xmm10, xmm11
 730
 731         // Finish the calculation by adding the Montgomery product.
 732         mulacc  xmm5, 0    xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 733         propout xmm6, lo,                xmm12, xmm13
 734
 735         mulacc  xmm5, 1    xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 736         propout xmm7, lo,                xmm13, xmm14
 737
 738         mulacc  xmm5, 2    xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 739         propout xmm6, hi,                xmm14, xmm15
 740
 741         mulacc  xmm5, 3    xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 742         propout xmm7, hi,                xmm15, xmm12
 743
 744         punpckldq xmm6, xmm7
 745
 746         // And, with that, we're done.
 747         movdqu  [rdi], xmm6
 748         ret
 749
 750 ENDFUNC
 751
 752 ///--------------------------------------------------------------------------
 753 /// Bulk multipliers.
 754
 755 FUNC(mpx_umul4_amd64_sse2)
 756         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
 757         //                         const mpw *bv, const mpw *bvl);
 758
 759         // Establish the arguments and do initial setup.
 760         //
 761         //                      sysv    win
 762         // inner loop dv        rdi     rdi*
 763         // inner loop av        rbx*    rbx*
 764         // outer loop dv        r10     rcx
 765         // outer loop bv        rcx     r9
 766         // av base              rsi     rdx
 767         // av limit             rdx     r8
 768         // bv limit             r8      r10
 769
 770 #if ABI_SYSV
 771 #  define DV r10
 772 #  define AV rsi
 773 #  define AVL rdx
 774 #  define BV rcx
 775 #  define BVL r8
 776
 777         pushreg rbx
 778   endprologue
 779
 780         mov     DV, rdi
 781
 782 #endif
 783
 784 #if ABI_WIN
 785 #  define DV rcx
 786 #  define AV rdx
 787 #  define AVL r8
 788 #  define BV r9
 789 #  define BVL r10
 790
 791         pushreg rbx
 792         pushreg rdi
 793         stalloc 160 + 8
 794
 795         savexmm xmm6,    0
 796         savexmm xmm7,   16
 797         savexmm xmm8,   32
 798         savexmm xmm9,   48
 799         savexmm xmm10,  64
 800         savexmm xmm11,  80
 801         savexmm xmm12,  96
 802         savexmm xmm13, 112
 803         savexmm xmm14, 128
 804         savexmm xmm15, 144
 805
 806   endprologue
 807
 808         mov     rdi, DV
 809         mov     BVL, [rsp + 224]
 810
 811 #endif
 812
 813         // Prepare for the first iteration.
 814         pxor    xmm0, xmm0
 815         movdqu  xmm10, [BV]             // bv[0]
 816         mov     rbx, AV
 817         add     DV, 16
 818         add     BV, 16
 819         expand  xmm0, xmm10, xmm11
 820         call    mul4zc
 821         add     rbx, 16
 822         add     rdi, 16
 823         cmp     rbx, AVL                // all done?
 824         jae     8f
 825
 826         .p2align 4
 827         // Continue with the first iteration.
 828 0:      call    mul4
 829         add     rbx, 16
 830         add     rdi, 16
 831         cmp     rbx, AVL                // all done?
 832         jb      0b
 833
 834         // Write out the leftover carry.  There can be no tail here.
 835 8:      call    carryprop
 836         cmp     BV, BVL                 // more passes to do?
 837         jae     9f
 838
 839         .p2align 4
 840         // Set up for the next pass.
 841 1:      movdqu  xmm10, [BV]             // bv[i]
 842         mov     rdi, DV                 // -> dv[i]
 843         pxor    xmm0, xmm0
 844         expand  xmm0, xmm10, xmm11
 845         mov     rbx, AV                 // -> av[0]
 846         add     DV, 16
 847         add     BV, 16
 848         call    mla4zc
 849         add     rbx, 16
 850         add     rdi, 16
 851         cmp     rbx, AVL                // done yet?
 852         jae     8f
 853
 854         .p2align 4
 855         // Continue...
 856 0:      call    mla4
 857         add     rbx, 16
 858         add     rdi, 16
 859         cmp     rbx, AVL
 860         jb      0b
 861
 862         // Finish off this pass.  There was no tail on the previous pass, and
 863         // there can be none on this pass.
 864 8:      call    carryprop
 865         cmp     BV, BVL
 866         jb      1b
 867
 868         // All over.
 869 9:
 870
 871 #if ABI_SYSV
 872         popreg  rbx
 873 #endif
 874
 875 #if ABI_WIN
 876
 877         rstrxmm xmm6,    0
 878         rstrxmm xmm7,   16
 879         rstrxmm xmm8,   32
 880         rstrxmm xmm9,   48
 881         rstrxmm xmm10,  64
 882         rstrxmm xmm11,  80
 883         rstrxmm xmm12,  96
 884         rstrxmm xmm13, 112
 885         rstrxmm xmm14, 128
 886         rstrxmm xmm15, 144
 887
 888         stfree  160 + 8
 889         popreg  rdi
 890         popreg  rbx
 891
 892 #endif
 893
 894         ret
 895
 896 #undef DV
 897 #undef AV
 898 #undef AVL
 899 #undef BV
 900 #undef BVL
 901
 902 ENDFUNC
 903
 904 FUNC(mpxmont_mul4_amd64_sse2)
 905         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
 906         //                           const mpw *nv, size_t n, const mpw *mi);
 907
 908         // Establish the arguments and do initial setup.
 909         //
 910         //                      sysv    win
 911         // inner loop dv        rdi     rdi*
 912         // inner loop av        rax     rax
 913         // inner loop nv        rbx*    rbx*
 914         // mi                   r9      r10
 915         // outer loop dv        r10     rcx
 916         // outer loop bv        rdx     r8
 917         // av base              rsi     rdx
 918         // av limit             r11     r11
 919         // bv limit             r8      r12*
 920         // nv base              rcx     r9
 921         // n                    r8      r12*
 922
 923 #if ABI_SYSV
 924 #  define DV r10
 925 #  define AV rsi
 926 #  define AVL r11
 927 #  define BV rdx
 928 #  define BVL r8
 929 #  define NV rcx
 930 #  define N r8
 931 #  define MI r9
 932
 933         pushreg rbx
 934   endprologue
 935
 936         mov     DV, rdi
 937
 938 #endif
 939
 940 #if ABI_WIN
 941 #  define DV rcx
 942 #  define AV rdx
 943 #  define AVL r11
 944 #  define BV r8
 945 #  define BVL r12
 946 #  define NV r9
 947 #  define N r12
 948 #  define MI r10
 949
 950         pushreg rbx
 951         pushreg rdi
 952         pushreg r12
 953         stalloc 160
 954
 955         savexmm xmm6,    0
 956         savexmm xmm7,   16
 957         savexmm xmm8,   32
 958         savexmm xmm9,   48
 959         savexmm xmm10,  64
 960         savexmm xmm11,  80
 961         savexmm xmm12,  96
 962         savexmm xmm13, 112
 963         savexmm xmm14, 128
 964         savexmm xmm15, 144
 965
 966   endprologue
 967
 968         mov     rdi, DV
 969         mov     N, [rsp + 224]
 970         mov     MI, [rsp + 232]
 971
 972 #endif
 973
 974         // Establish the expanded operands.
 975         pxor    xmm0, xmm0
 976         movdqu  xmm8, [BV]              // bv[0]
 977         movdqu  xmm10, [MI]             // mi
 978         expand  xmm0, xmm8, xmm9, xmm10, xmm11
 979
 980         // Set up the outer loop state and prepare for the first iteration.
 981         mov     rax, AV                 // -> U = av[0]
 982         mov     rbx, NV                 // -> X = nv[0]
 983         lea     AVL, [AV + 4*N]         // -> av[n/4] = av limit
 984         lea     BVL, [BV + 4*N]         // -> bv[n/4] = bv limit
 985         add     BV, 16
 986         add     DV, 16
 987         call    mmul4
 988         add     rdi, 16
 989         add     rax, 16
 990         add     rbx, 16
 991         cmp     rax, AVL                // done already?
 992         jae     8f
 993
 994         .p2align 4
 995         // Complete the first inner loop.
 996 0:      call    dmul4
 997         add     rdi, 16
 998         add     rax, 16
 999         add     rbx, 16
1000         cmp     rax, AVL                // done yet?
1001         jb      0b
1002
1003         // Still have carries left to propagate.
1004         call    carryprop
1005         movd    [rdi + 16], xmm12
1006
1007         .p2align 4
1008         // Embark on the next iteration.  (There must be one.  If n = 1, then
1009         // we would have bailed above, to label 8.  Similarly, the subsequent
1010         // iterations can fall into the inner loop immediately.)
1011 1:      pxor    xmm0, xmm0
1012         movdqu  xmm8, [BV]              // bv[i]
1013         movdqu  xmm10, [MI]             // mi
1014         mov     rdi, DV                 // -> Z = dv[i]
1015         mov     rax, AV                 // -> U = av[0]
1016         mov     rbx, NV                 // -> X = nv[0]
1017         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1018         add     BV, 16
1019         add     DV, 16
1020         call    mmla4
1021         add     rdi, 16
1022         add     rax, 16
1023         add     rbx, 16
1024
1025         .p2align 4
1026         // Complete the next inner loop.
1027 0:      call    dmla4
1028         add     rdi, 16
1029         add     rax, 16
1030         add     rbx, 16
1031         cmp     rax, AVL
1032         jb      0b
1033
1034         // Still have carries left to propagate, and they overlap the
1035         // previous iteration's final tail, so read that in and add it.
1036         movd    xmm0, [rdi]
1037         paddq   xmm12, xmm0
1038         call    carryprop
1039         movd    [rdi + 16], xmm12
1040
1041         // Back again, maybe.
1042         cmp     BV, BVL
1043         jb      1b
1044
1045         // All done.
1046 9:
1047
1048 #if ABI_SYSV
1049         popreg  rbx
1050 #endif
1051
1052 #if ABI_WIN
1053
1054         rstrxmm xmm6,    0
1055         rstrxmm xmm7,   16
1056         rstrxmm xmm8,   32
1057         rstrxmm xmm9,   48
1058         rstrxmm xmm10,  64
1059         rstrxmm xmm11,  80
1060         rstrxmm xmm12,  96
1061         rstrxmm xmm13, 112
1062         rstrxmm xmm14, 128
1063         rstrxmm xmm15, 144
1064
1065         stfree  160
1066         popreg  r12
1067         popreg  rdi
1068         popreg  rbx
1069
1070 #endif
1071
1072         ret
1073
1074         // First iteration was short.  Write out the carries and we're done.
1075         // (This could be folded into the main loop structure, but that would
1076         // penalize small numbers more.)
1077 8:      call    carryprop
1078         movd    [rdi + 16], xmm12
1079 #if ABI_SYSV
1080         popreg  rbx
1081         ret
1082 #endif
1083 #if ABI_WIN
1084         jmp     9b
1085 #endif
1086
1087 #undef DV
1088 #undef AV
1089 #undef AVL
1090 #undef BV
1091 #undef BVL
1092 #undef NV
1093 #undef N
1094 #undef MI
1095
1096 ENDFUNC
1097
1098 FUNC(mpxmont_redc4_amd64_sse2)
1099         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1100         //                             size_t n, const mpw *mi);
1101
1102         // Establish the arguments and do initial setup.
1103         //
1104         //                      sysv    win
1105         // inner loop dv        rdi     rdi*
1106         // dv limit             rax     rax
1107         // blocks-of-4 dv limit rsi     rdx
1108         // inner loop nv        rbx*    rbx*
1109         // mi                   r8      r10
1110         // outer loop dv        r10     rcx
1111         // outer loop dv limit  r11     r11
1112         // nv base              rdx     r8
1113         // nv limit             r9      r12*
1114         // n                    rcx     r9
1115         // c                    rcx     r9
1116
1117 #if ABI_SYSV
1118
1119 #  define DVL rax
1120 #  define DVL4 rsi
1121 #  define MI r8
1122 #  define DV r10
1123 #  define DVLO r11
1124 #  define NV rdx
1125 #  define NVL r9
1126 #  define N rcx
1127 #  define C ecx
1128
1129         pushreg rbx
1130   endprologue
1131
1132         mov     DV, rdi
1133
1134 #endif
1135
1136 #if ABI_WIN
1137
1138 #  define DVL rax
1139 #  define DVL4 rdx
1140 #  define MI r10
1141 #  define DV rcx
1142 #  define DVLO r11
1143 #  define NV r8
1144 #  define NVL r12
1145 #  define N r9
1146 #  define C r9d
1147
1148         pushreg rbx
1149         pushreg rdi
1150         pushreg r12
1151         stalloc 160
1152
1153         savexmm xmm6,    0
1154         savexmm xmm7,   16
1155         savexmm xmm8,   32
1156         savexmm xmm9,   48
1157         savexmm xmm10,  64
1158         savexmm xmm11,  80
1159         savexmm xmm12,  96
1160         savexmm xmm13, 112
1161         savexmm xmm14, 128
1162         savexmm xmm15, 144
1163
1164   endprologue
1165
1166         mov     rdi, DV
1167         mov     MI, [rsp + 224]
1168
1169 #endif
1170
1171         // Establish the expanded operands and the blocks-of-4 dv limit.
1172         pxor    xmm0, xmm0
1173         mov     DVL, DVL4               // -> dv[n] = dv limit
1174         sub     DVL4, DV                // length of dv in bytes
1175         movdqu  xmm8, [MI]              // mi
1176         and     DVL4, ~15               // mask off the tail end
1177         expand  xmm0, xmm8, xmm9
1178         add     DVL4, DV                // find limit
1179
1180         // Set up the outer loop state and prepare for the first iteration.
1181         mov     rbx, NV                 // -> X = nv[0]
1182         lea     DVLO, [DV + 4*N]        // -> dv[n/4] = outer dv limit
1183         lea     NVL, [NV + 4*N]         // -> nv[n/4] = nv limit
1184         add     DV, 16
1185         call    mont4
1186         add     rbx, 16
1187         add     rdi, 16
1188         cmp     rbx, NVL                // done already?
1189         jae     8f
1190
1191         .p2align 4
1192         // Complete the first inner loop.
1193 5:      call    mla4
1194         add     rbx, 16
1195         add     rdi, 16
1196         cmp     rbx, NVL                // done yet?
1197         jb      5b
1198
1199         // Still have carries left to propagate.
1200 8:      carryadd
1201         psllq   xmm15, 16
1202         pslldq  xmm15, 8
1203         paddq   xmm14, xmm15
1204         call    carryprop
1205         movd    C, xmm12
1206         add     rdi, 16
1207         cmp     rdi, DVL4
1208         jae     7f
1209
1210         .p2align 4
1211         // Continue carry propagation until the end of the buffer.
1212 0:      add     [rdi], C
1213         mov     C, 0                    // preserves flags
1214         adcd    [rdi + 4], 0
1215         adcd    [rdi + 8], 0
1216         adcd    [rdi + 12], 0
1217         adc     C, 0
1218         add     rdi, 16
1219         cmp     rdi, DVL4
1220         jb      0b
1221
1222         // Deal with the tail end.
1223 7:      add     [rdi], C
1224         mov     C, 0                    // preserves flags
1225         add     rdi, 4
1226         adc     C, 0
1227         cmp     rdi, DVL
1228         jb      7b
1229
1230         // All done for this iteration.  Start the next.  (This must have at
1231         // least one follow-on iteration, or we'd not have started this outer
1232         // loop.)
1233 8:      mov     rdi, DV                 // -> Z = dv[i]
1234         mov     rbx, NV                 // -> X = nv[0]
1235         cmp     rdi, DVLO               // all done yet?
1236         jae     9f
1237         add     DV, 16
1238         call    mont4
1239         add     rdi, 16
1240         add     rbx, 16
1241         jmp     5b
1242
1243         // All over.
1244 9:
1245
1246 #if ABI_SYSV
1247         popreg  rbx
1248 #endif
1249
1250 #if ABI_WIN
1251
1252         rstrxmm xmm6,    0
1253         rstrxmm xmm7,   16
1254         rstrxmm xmm8,   32
1255         rstrxmm xmm9,   48
1256         rstrxmm xmm10,  64
1257         rstrxmm xmm11,  80
1258         rstrxmm xmm12,  96
1259         rstrxmm xmm13, 112
1260         rstrxmm xmm14, 128
1261         rstrxmm xmm15, 144
1262
1263         stfree  160
1264         popreg  r12
1265         popreg  rdi
1266         popreg  rbx
1267
1268 #endif
1269
1270         ret
1271
1272 #undef DVL
1273 #undef DVL4
1274 #undef MI
1275 #undef DV
1276 #undef DVLO
1277 #undef NV
1278 #undef NVL
1279 #undef N
1280 #undef C
1281
1282 ENDFUNC
1283
1284 ///--------------------------------------------------------------------------
1285 /// Testing and performance measurement.
1286
1287 #ifdef TEST_MUL4
1288
1289 #if ABI_SYSV
1290 #  define ARG0 rdi
1291 #  define ARG1 rsi
1292 #  define ARG2 rdx
1293 #  define ARG3 rcx
1294 #  define ARG4 r8
1295 #  define ARG5 r9
1296 #  define ARG6 STKARG(0)
1297 #  define ARG7 STKARG(1)
1298 #  define ARG8 STKARG(2)
1299 #  define STKARG_OFFSET 16
1300 #endif
1301 #if ABI_WIN
1302 #  define ARG0 rcx
1303 #  define ARG1 rdx
1304 #  define ARG2 r8
1305 #  define ARG3 r9
1306 #  define ARG4 STKARG(0)
1307 #  define ARG5 STKARG(1)
1308 #  define ARG6 STKARG(2)
1309 #  define ARG7 STKARG(3)
1310 #  define ARG8 STKARG(4)
1311 #  define STKARG_OFFSET 40
1312 #endif
1313 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1314
1315 //                sysv                          win
1316 //                dmul  smul  mmul  mont        dmul  smul  mmul  mont
1317 // A    rax
1318 // D    rdx
1319 // z    rdi       rdi   rdi   rdi    rdi        rcx   rcx   rcx    rcx
1320 // c    rcx       rsi   rsi   rsi    rsi        rdx   rdx   rdx    rdx
1321 // y    r10       --    --    rdx    rdx        --    --    r8     r8
1322 // u    r11       rdx   --    rcx    --         r8    --    r9     --
1323 // x    rbx       rcx   rdx   r8     rcx        r9    r8    stk0   r9
1324 // vv   xmm8/9    r8    --    r9     r8         stk0  --    stk1   stk0
1325 // yy   xmm10/11  r9    rcx   stk0   --         stk1  r9    stk2   --
1326 // n    r8        stk0  r8    stk1   r9         stk2  stk0  stk3   stk1
1327 // cyv  r9        stk1  r9    stk2   stk0       stk3  stk1  stk4   stk2
1328
1329 .macro  cysetup v, n
1330         rdtsc
1331         shl     rdx, 32
1332         or      rax, rdx
1333         mov     [\v + 8*\n - 8], rax
1334 .endm
1335
1336 .macro  cystore v, n
1337         rdtsc
1338         shl     rdx, 32
1339         or      rax, rdx
1340         sub     rax, [\v + 8*\n - 8]
1341         mov     [\v + 8*\n - 8], rax
1342         dec     \n
1343 .endm
1344
1345 .macro  testprologue mode
1346         pushreg rbx
1347 #if ABI_SYSV
1348   endprologue
1349   .ifeqs "\mode", "dmul"
1350         mov     rbx, rcx
1351         movdqu  xmm8, [r8]
1352         movdqu  xmm10, [r9]
1353         mov     r8d, STKARG(0)
1354         mov     r9, STKARG(1)
1355         mov     r11, rdx
1356         mov     rcx, rsi
1357   .endif
1358   .ifeqs "\mode", "smul"
1359         mov     rbx, rdx
1360         movdqu  xmm10, [rcx]
1361         mov     rcx, rsi
1362   .endif
1363   .ifeqs "\mode", "mmul"
1364         mov     rax, STKARG(0)
1365         mov     rbx, r8
1366         movdqu  xmm8, [r9]
1367         movdqu  xmm10, [rax]
1368         mov     r8, STKARG(1)
1369         mov     r9, STKARG(2)
1370         mov     r10, rdx
1371         mov     r11, rcx
1372         mov     rcx, rsi
1373   .endif
1374   .ifeqs "\mode", "mont"
1375         mov     rbx, rcx
1376         movdqu  xmm8, [r8]
1377         mov     r8, r9
1378         mov     r9, STKARG(0)
1379         mov     r10, rdx
1380         mov     rcx, rsi
1381   .endif
1382 #endif
1383 #if ABI_WIN
1384         pushreg rdi
1385         stalloc 168
1386         savexmm xmm6,    0
1387         savexmm xmm7,   16
1388         savexmm xmm8,   32
1389         savexmm xmm9,   48
1390         savexmm xmm10,  64
1391         savexmm xmm11,  80
1392         savexmm xmm12,  96
1393         savexmm xmm13, 112
1394         savexmm xmm14, 128
1395         savexmm xmm15, 144
1396   endprologue
1397   .ifeqs "\mode", "dmul"
1398         mov     r10, STKARG(0)
1399         mov     r11, STKARG(1)
1400         mov     rdi, rcx
1401         mov     rcx, rdx
1402         mov     rbx, r9
1403         movdqu  xmm8, [r10]
1404         movdqu  xmm10, [r11]
1405         mov     r8, STKARG(2)
1406         mov     r9, STKARG(3)
1407         mov     r11, r8
1408   .endif
1409   .ifeqs "\mode", "smul"
1410         mov     rdi, rcx
1411         mov     rcx, rdx
1412         mov     rbx, r8
1413         movdqu  xmm10, [r9]
1414         mov     r8, STKARG(0)
1415         mov     r9, STKARG(1)
1416   .endif
1417   .ifeqs "\mode", "mmul"
1418         mov     r10, STKARG(1)
1419         mov     r11, STKARG(2)
1420         mov     rdi, rcx
1421         mov     rcx, rdx
1422         mov     rbx, STKARG(0)
1423         movdqu  xmm8, [r10]
1424         movdqu  xmm10, [r11]
1425         mov     r8, STKARG(3)
1426         mov     r9, STKARG(4)
1427         mov     r10, r8
1428         mov     r11, r9
1429   .endif
1430   .ifeqs "\mode", "mont"
1431         mov     r10, STKARG(0)
1432         mov     rdi, rcx
1433         mov     rcx, rdx
1434         mov     rbx, r9
1435         movdqu  xmm8, [r10]
1436         mov     r8, STKARG(1)
1437         mov     r9, STKARG(2)
1438         mov     r10, r8
1439   .endif
1440 #endif
1441
1442         pxor    xmm0, xmm0
1443   .ifeqs "\mode", "dmul"
1444         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1445   .endif
1446   .ifeqs "\mode", "smul"
1447         expand  xmm0, xmm10, xmm11
1448   .endif
1449   .ifeqs "\mode", "mmul"
1450         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1451   .endif
1452   .ifeqs "\mode", "mont"
1453         expand  xmm0, xmm8, xmm9
1454   .endif
1455 .endm
1456
1457 .macro  testepilogue
1458 #if ABI_WIN
1459         rstrxmm xmm6,    0
1460         rstrxmm xmm7,   16
1461         rstrxmm xmm8,   32
1462         rstrxmm xmm9,   48
1463         rstrxmm xmm10,  64
1464         rstrxmm xmm11,  80
1465         rstrxmm xmm12,  96
1466         rstrxmm xmm13, 112
1467         rstrxmm xmm14, 128
1468         rstrxmm xmm15, 144
1469         stfree  168
1470         popreg  rdi
1471 #endif
1472         popreg  rbx
1473         ret
1474 .endm
1475
1476 .macro  testldcarry
1477         movdqu  xmm12, [rcx +  0]       // (c'_0, c''_0)
1478         movdqu  xmm13, [rcx + 16]       // (c'_1, c''_1)
1479         movdqu  xmm14, [rcx + 32]       // (c'_2, c''_2)
1480 .endm
1481
1482 .macro  testtop u=nil
1483         .p2align 4
1484 0:
1485         cysetup r9, r8
1486   .ifnes "\u", "nil"
1487         mov     rax, \u
1488   .endif
1489 .endm
1490
1491 .macro  testtail
1492         cystore r9, r8
1493         jnz     0b
1494 .endm
1495
1496 .macro  testcarryout
1497         movdqu  [rcx +  0], xmm12
1498         movdqu  [rcx + 16], xmm13
1499         movdqu  [rcx + 32], xmm14
1500 .endm
1501
1502 FUNC(test_dmul4)
1503         testprologue dmul
1504         testldcarry
1505         testtop r11
1506         call    dmul4
1507         testtail
1508         testcarryout
1509         testepilogue
1510 ENDFUNC
1511
1512 FUNC(test_dmla4)
1513         testprologue dmul
1514         testldcarry
1515         testtop r11
1516         call    dmla4
1517         testtail
1518         testcarryout
1519         testepilogue
1520 ENDFUNC
1521
1522 FUNC(test_mul4)
1523         testprologue smul
1524         testldcarry
1525         testtop nil
1526         call    mul4
1527         testtail
1528         testcarryout
1529         testepilogue
1530 ENDFUNC
1531
1532 FUNC(test_mla4)
1533         testprologue smul
1534         testldcarry
1535         testtop nil
1536         call    mla4
1537         testtail
1538         testcarryout
1539         testepilogue
1540 ENDFUNC
1541
1542 FUNC(test_mmul4)
1543         testprologue mmul
1544         testtop r11
1545         call    mmul4
1546         testtail
1547         movdqu  [r10 +  0], xmm10
1548         movdqu  [r10 + 16], xmm11
1549         testcarryout
1550         testepilogue
1551 ENDFUNC
1552
1553 FUNC(test_mmla4)
1554         testprologue mmul
1555         testtop r11
1556         call    mmla4
1557         testtail
1558         movdqu  [r10 +  0], xmm10
1559         movdqu  [r10 + 16], xmm11
1560         testcarryout
1561         testepilogue
1562 ENDFUNC
1563
1564 FUNC(test_mont4)
1565         testprologue mont
1566         testtop
1567         call    mont4
1568         testtail
1569         movdqu  [r10 +  0], xmm10
1570         movdqu  [r10 + 16], xmm11
1571         testcarryout
1572         testepilogue
1573 ENDFUNC
1574
1575 #endif
1576
1577 ///----- That's all, folks --------------------------------------------------