chiark - git - mdw - catacomb/blob - math/mpx-mul4-arm64-simd.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2019 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .text
  34
  35 ///--------------------------------------------------------------------------
  36 /// Theory.
  37 ///
  38 /// We define a number of primitive fixed-size multipliers from which we can
  39 /// construct more general variable-length multipliers.
  40 ///
  41 /// The basic trick is the same throughout.  In an operand-scanning
  42 /// multiplication, the inner multiplication loop multiplies a multiple-
  43 /// precision operand by a single precision factor, and adds the result,
  44 /// appropriately shifted, to the result.  A `finely integrated operand
  45 /// scanning' implementation of Montgomery multiplication also adds the
  46 /// product of a single-precision `Montgomery factor' and the modulus,
  47 /// calculated in the same pass.  The more common `coarsely integrated
  48 /// operand scanning' alternates main multiplication and Montgomery passes,
  49 /// which requires additional carry propagation.
  50 ///
  51 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  52 /// the factors remains constant throughout the operation, so we can afford
  53 /// to take a little time to preprocess it.  The transformation we perform is
  54 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  55 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  56 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  57 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
  58 /// operands, as follows.
  59 ///
  60 ///     Offset     0       4        8      12
  61 ///        0    v'_0   v''_0     v'_1   v''_1
  62 ///       16    v'_2   v''_2     v'_3   v''_3
  63 ///
  64 /// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
  65 /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
  66 /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
  67 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  68 /// many products together before we must deal with carrying; it also allows
  69 /// for some calculations to be performed on the above expanded form.
  70 ///
  71 /// We maintain three `carry' registers, v28--v30, accumulating intermediate
  72 /// results; we name them `c0', `c1', and `c2'.  Each carry register holds
  73 /// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
  74 /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
  75 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
  76 /// The `umull' or `umlal' instruction acting on a scalar operand and an
  77 /// operand in the expanded form above produces a result which can be added
  78 /// directly to the appropriate carry register.
  79 ///
  80 /// An unusual feature of this code, as compared to the other `mul4'
  81 /// implementations, is that it makes extensive use of the ARM64
  82 /// general-purpose registers for carry resolution and output construction.
  83 /// As a result, an additional general-purpose register (typically x15) is
  84 /// used as an additional carry, with the carry value in bits 16--63.
  85 ///
  86 /// Multiplication is performed in product-scanning order, since ARM
  87 /// processors commonly implement result forwarding for consecutive multiply-
  88 /// and-accumulate instructions specifying the same destination.
  89 /// Experimentally, this runs faster than operand-scanning in an attempt to
  90 /// hide instruction latencies.
  91 ///
  92 /// On 64-bit ARM, we have a vast supply number of registers: the expanded
  93 /// operands are kept in registers.  The packed operands are read from memory
  94 /// into working registers v0 and v1.  The following conventional argument
  95 /// names and locations are used throughout.
  96 ///
  97 /// Arg Format      Location    Notes
  98 ///
  99 /// U   packed      [x1]
 100 /// X   packed      [x2]        In Montgomery multiplication, X = N
 101 /// V   expanded    v2/v3
 102 /// Y   expanded    v4/v5       In Montgomery multiplication, Y = (A + U V) M
 103 /// M   expanded    v6/v7       -N^{-1} (mod B^4)
 104 /// N                           Modulus, for Montgomery multiplication
 105 /// A   packed      [x0]        Destination/accumulator
 106 /// C   carry       v28--v30
 107 /// 0               v31         128-bit zero
 108 ///
 109 /// The calculation is some variant of
 110 ///
 111 ///     A' + C' B^4 <- U V + X Y + A + C
 112 ///
 113 /// The low-level functions fit into a fairly traditional (finely-integrated)
 114 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
 115 /// (indexed by i).
 116 ///
 117 /// The variants are as follows.
 118 ///
 119 /// Function    Variant                 Use             i       j
 120 ///
 121 /// mmul4       A = C = 0               Montgomery      0       0
 122 /// dmul4       A = 0                   Montgomery      0       +
 123 /// mmla4       C = 0                   Montgomery      +       0
 124 /// dmla4       exactly as shown        Montgomery      +       +
 125 ///
 126 /// mul4zc      U = V = A = C = 0       Plain           0       0
 127 /// mul4        U = V = A = 0           Plain           0       +
 128 /// mla4zc      U = V = C = 0           Plain           +       0
 129 /// mla4        U = V = 0               Plain           +       +
 130 ///
 131 /// The `mmul4' and `mmla4' functions are also responsible for calculating
 132 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
 133 /// inner loop.
 134
 135 ///--------------------------------------------------------------------------
 136 /// Macro definitions.
 137
 138 .macro  mulacc  z, u, v, x=nil, y=nil
 139         // Set Z = Z + U V + X Y, using the low halves of V and Y.  Y may be
 140         // `nil' to omit the second operand.  Z, V, and Y should be 128-bit
 141         // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
 142         // the multiplications produce two 64-bit elementwise products, which
 143         // are added elementwise to Z.
 144
 145         umlal   \z\().2d, \v\().2s, \u
 146     .ifnes "\y", "nil"
 147         umlal   \z\().2d, \y\().2s, \x
 148     .endif
 149 .endm
 150
 151 .macro  mulacc2 z, u, v, x=nil, y=nil
 152         // Set Z = Z + U V + X Y, using the high halves of V and Y; see
 153         // `mulacc'.
 154
 155         umlal2  \z\().2d, \v\().4s, \u
 156     .ifnes "\y", "nil"
 157         umlal2  \z\().2d, \y\().4s, \x
 158     .endif
 159 .endm
 160
 161 .macro  mulinit z, zinitp, u, v=nil, x=nil, y=nil
 162         // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
 163         // set Z = U V + X Y.  Operand requirements and detailed operation
 164         // are as for `mulacc'.
 165
 166   .ifeqs "\zinitp", "t"
 167         mulacc  \z, \u, \v, \x, \y
 168   .else
 169         umull   \z\().2d, \v\().2s, \u
 170     .ifnes "\y", "nil"
 171         umlal   \z\().2d, \y\().2s, \x
 172     .endif
 173   .endif
 174 .endm
 175
 176 .macro  mulini2 z, zinitp, u, v=nil, x=nil, y=nil
 177         // As `mulinit', but with the high halves of V and Y.
 178
 179   .ifeqs "\zinitp", "t"
 180         mulacc2 \z, \u, \v, \x, \y
 181   .else
 182         umull2  \z\().2d, \v\().4s, \u
 183     .ifnes "\y", "nil"
 184         umlal2  \z\().2d, \y\().4s, \x
 185     .endif
 186   .endif
 187 .endm
 188
 189 // `mulI': accumulate the B^I and b B^i terms of the polynomial product sum U
 190 // V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
 191 // x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
 192 // B^3 (v'_3 + b v''_3) (and similarly for Y).  The 64-bit coefficients are
 193 // added into the low and high halves of the 128-bit register Z (if ZINIT is
 194 // `nil' then simply set Z, as if it were initially zero).
 195 .macro  mul0    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 196         mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
 197 .endm
 198 .macro  mul1    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 199         mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
 200         mulacc  \z,          \u\().s[1], \v0, \x\().s[1], \y0
 201 .endm
 202 .macro  mul2    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 203         mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
 204         mulacc2 \z,          \u\().s[1], \v0, \x\().s[1], \y0
 205         mulacc  \z,          \u\().s[2], \v0, \x\().s[2], \y0
 206 .endm
 207 .macro  mul3    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 208         mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
 209         mulacc  \z,          \u\().s[1], \v1, \x\().s[1], \y1
 210         mulacc2 \z,          \u\().s[2], \v0, \x\().s[2], \y0
 211         mulacc  \z,          \u\().s[3], \v0, \x\().s[3], \y0
 212 .endm
 213 .macro  mul4    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 214         mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
 215         mulacc  \z,          \u\().s[2], \v1, \x\().s[2], \y1
 216         mulacc2 \z,          \u\().s[3], \v0, \x\().s[3], \y0
 217 .endm
 218 .macro  mul5    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 219         mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
 220         mulacc  \z,          \u\().s[3], \v1, \x\().s[3], \y1
 221 .endm
 222 .macro  mul6    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
 223         mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
 224 .endm
 225
 226 // Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
 227 // `vN' register).  Here, T0, T1, and CG are 64-bit `xN' general-purpose
 228 // registers clobbered in the process.  Set the low 32 bits of the 64-bit
 229 // `xN' general-purpose register ZOUT to the completed coefficient z_1,
 230 // leaving a carry in CG.
 231 //
 232 // In detail, what happens is as follows.  Suppose initially that ZLO =
 233 // (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}).  Let t = z'_i + b z''_i;
 234 // observe that floor(t/b) = floor(z'_i/b) + z''_i.  Let z_i = t mod B, and
 235 // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}.  This has
 236 // a circuit depth of 3; I don't know how to do better.
 237 //
 238 // Output words are left in the low half of a 64-bit register, with rubbish
 239 // in the high half.  Two such results can be combined using the `bfi'
 240 // instruction.
 241 .macro  carry0  zlo, cg=x15, t0=x16, t1=x17
 242         // Capture the values of carry-register ZLO and CG (if not `nil') in
 243         // general-purpose registers T0 and T1, suitable for use in `carry1'.
 244         mov     \t0, \zlo\().d[0]
 245         mov     \t1, \zlo\().d[1]
 246   .ifnes "\cg", "nil"
 247         add     \t0, \t0, \cg, lsr #16
 248   .endif
 249 .endm
 250 .macro  carry1  zout, cg=x15, t0=x16, t1=x17
 251         // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
 252         // rubbish in the high 32 bits), and update CG suitably to continue
 253         // processing with the next carry register.
 254   .ifnes "\zout", "nil"
 255         add     \zout, \t0, \t1, lsl #16
 256   .endif
 257   .ifnes "\cg", "nil"
 258         add     \cg, \t1, \t0, lsr #16
 259   .endif
 260 .endm
 261
 262 .macro  expand  vlo, vhi, vz=v31
 263         // Expand the packed 128-bit operand in VLO to an expanded operand in
 264         // VLO and VHI, assuming that VZ is all-bits-zero.  All three are
 265         // `vN' 128-bit SIMD registers.
 266         zip2    \vhi\().8h, \vlo\().8h, \vz\().8h
 267         zip1    \vlo\().8h, \vlo\().8h, \vz\().8h
 268 .endm
 269
 270 .macro  sprdacc a0, a1, a2, a3=nil
 271         // Spread the packed 128-bit operand in A0 into carry-format values
 272         // in A0, A1, A2, A3.  If A3 is `nil', then spread the same value
 273         // into A0, A1, A2 only, clobbering x16.
 274   .ifeqs "\a3", "nil"
 275         mov     w16, \a0\().s[3]
 276   .endif
 277         trn2    \a2\().2d, \a0\().2d, v31.2d
 278         trn2    \a1\().2s, \a0\().2s, v31.2s
 279   .ifeqs "\a3", "nil"
 280         lsl     x16, x16, #16
 281   .endif
 282         trn1    \a0\().2s, \a0\().2s, v31.2s
 283   .ifeqs "\a3", "nil"
 284         mov     \a2\().d[1], x16
 285   .else
 286         trn2    \a3\().2s, \a2\().2s, v31.2s
 287   .endif
 288         mov     \a2\().s[1], wzr
 289 .endm
 290
 291 .macro  crryacc a0, a1, a2, a3, c0, c1, c2
 292         // Add the carry-format values A0, A1, A2 into the existing carries
 293         // C0, C1, C2 (leaving A3 where it is).
 294         add     \c0\().2d, \c0\().2d, \a0\().2d
 295         add     \c1\().2d, \c1\().2d, \a1\().2d
 296         add     \c2\().2d, \c2\().2d, \a2\().2d
 297 .endm
 298
 299 ///--------------------------------------------------------------------------
 300 /// Primitive multipliers and related utilities.
 301
 302 INTFUNC(carryprop)
 303         // On entry, x0 points to a destination, and v28--v30 and x15 hold
 304         // incoming carries c0--c2 and cg.  On exit, the low 128 bits of the
 305         // carry value are stored at [x0]; the remaining 16 bits of carry are
 306         // left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
 307   endprologue
 308
 309         carry0  v28
 310         carry1  x11
 311         carry0  v29
 312         carry1  x13
 313         carry0  v30
 314         carry1  x12
 315         bfi     x11, x13, #32, #32
 316         lsr     x14, x15, #16
 317         lsr     x10, x15, #48
 318         bfi     x12, x14, #32, #32
 319         stp     x11, x12, [x0], #16
 320         ret
 321 ENDFUNC
 322
 323 INTFUNC(dmul4)
 324         // On entry, x0 points to the destination; x1 and x2 point to packed
 325         // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
 326         // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
 327         // zero.  On exit, the destination and carries are updated; x0, x1,
 328         // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
 329         // x11--x14, x16, x17 and the other SIMD registers are clobbered.
 330   endprologue
 331
 332         // Start by loading the operand words from memory.
 333         ldr     q0, [x1], #16
 334         ldr     q1, [x2], #16
 335
 336         // Do the multiplication.
 337         mul0    v28, t,      v0,  v2, v3,    v1,  v4, v5
 338         mul1    v29, t,      v0,  v2, v3,    v1,  v4, v5
 339          carry0 v28
 340         mul2    v30, t,      v0,  v2, v3,    v1,  v4, v5
 341          carry1 x11
 342          carry0 v29
 343         mul3    v27, nil,    v0,  v2, v3,    v1,  v4, v5
 344          carry1 x13
 345          carry0 v30
 346         mul4    v28, nil,    v0,  v2, v3,    v1,  v4, v5
 347          carry1 x12
 348          carry0 v27
 349         mul5    v29, nil,    v0,  v2, v3,    v1,  v4, v5
 350          carry1 x14
 351         mul6    v30, nil,    v0,  v2, v3,    v1,  v4, v5
 352
 353         // Finish up and store the result.
 354         bfi     x11, x13, #32, #32
 355         bfi     x12, x14, #32, #32
 356         stp     x11, x12, [x0], #16
 357
 358         // All done.
 359         ret
 360 ENDFUNC
 361
 362 INTFUNC(dmla4)
 363         // On entry, x0 points to the destination/accumulator A; x1 and x2
 364         // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
 365         // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
 366         // and cg; and v31 is zero.  On exit, the accumulator and carries are
 367         // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
 368         // are preserved; and x11--x14, x16, x17 and the other SIMD registers
 369         // are clobbered.
 370   endprologue
 371
 372         // Start by loading the operand words from memory.
 373         ldr     q24, [x0]
 374         ldr     q0, [x1], #16
 375         ldr     q1, [x2], #16
 376         sprdacc v24, v25, v26, v27
 377         crryacc v24, v25, v26, v27, v28, v29, v30
 378
 379         // Do the multiplication.
 380         mul0    v28, t,      v0,  v2, v3,    v1,  v4, v5
 381         mul1    v29, t,      v0,  v2, v3,    v1,  v4, v5
 382          carry0 v28
 383         mul2    v30, t,      v0,  v2, v3,    v1,  v4, v5
 384          carry1 x11
 385          carry0 v29
 386         mul3    v27, t,      v0,  v2, v3,    v1,  v4, v5
 387          carry1 x13
 388          carry0 v30
 389         mul4    v28, nil,    v0,  v2, v3,    v1,  v4, v5
 390          carry1 x12
 391          carry0 v27
 392         mul5    v29, nil,    v0,  v2, v3,    v1,  v4, v5
 393          carry1 x14
 394         mul6    v30, nil,    v0,  v2, v3,    v1,  v4, v5
 395
 396         // Finish up and store the result.
 397         bfi     x11, x13, #32, #32
 398         bfi     x12, x14, #32, #32
 399         stp     x11, x12, [x0], #16
 400
 401         // All done.
 402         ret
 403 ENDFUNC
 404
 405 INTFUNC(mul4)
 406         // On entry, x0 points to the destination; x2 points to a packed
 407         // operand X; v4/v5 holds an expanded operand Y; v13--v15 and x15
 408         // hold incoming carries c0--c2 and cg; and v31 is zero.  On exit,
 409         // the destination and carries are updated; x0 and x2 are each
 410         // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
 411         // x16, x17 and the other SIMD registers are clobbered.
 412   endprologue
 413
 414         // Start by loading the operand words from memory.
 415         ldr     q1, [x2], #16
 416
 417         // Do the multiplication.
 418         mul0    v28, t,      v1,  v4, v5
 419         mul1    v29, t,      v1,  v4, v5
 420          carry0 v28
 421         mul2    v30, t,      v1,  v4, v5
 422          carry1 x11
 423          carry0 v29
 424         mul3    v27, nil,    v1,  v4, v5
 425          carry1 x13
 426          carry0 v30
 427         mul4    v28, nil,    v1,  v4, v5
 428          carry1 x12
 429          carry0 v27
 430         mul5    v29, nil,    v1,  v4, v5
 431          carry1 x14
 432         mul6    v30, nil,    v1,  v4, v5
 433
 434         // Finish up and store the result.
 435         bfi     x11, x13, #32, #32
 436         bfi     x12, x14, #32, #32
 437         stp     x11, x12, [x0], #16
 438
 439         // All done.
 440         ret
 441 ENDFUNC
 442
 443 INTFUNC(mul4zc)
 444         // On entry, x0 points to the destination; x2 points to a packed
 445         // operand X; v4/v5 holds an expanded operand Y; and v31 is zero.  On
 446         // exit, the destination is updated; v28--v30 and x15 hold outgoing
 447         // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and
 448         // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other
 449         // SIMD registers are clobbered.
 450   endprologue
 451
 452         // Start by loading the operand words from memory.
 453         ldr     q1, [x2], #16
 454
 455         // Do the multiplication.
 456         mul0    v28, nil,    v1,  v4, v5
 457         mul1    v29, nil,    v1,  v4, v5
 458          carry0 v28, nil
 459         mul2    v30, nil,    v1,  v4, v5
 460          carry1 x11
 461          carry0 v29
 462         mul3    v27, nil,    v1,  v4, v5
 463          carry1 x13
 464          carry0 v30
 465         mul4    v28, nil,    v1,  v4, v5
 466          carry1 x12
 467          carry0 v27
 468         mul5    v29, nil,    v1,  v4, v5
 469          carry1 x14
 470         mul6    v30, nil,    v1,  v4, v5
 471
 472         // Finish up and store the result.
 473         bfi     x11, x13, #32, #32
 474         bfi     x12, x14, #32, #32
 475         stp     x11, x12, [x0], #16
 476
 477         // All done.
 478         ret
 479 ENDFUNC
 480
 481 INTFUNC(mla4)
 482         // On entry, x0 points to the destination/accumulator A; x2 points to
 483         // a packed operand X; v4/v5 holds an expanded operand Y; v13--v15
 484         // and x15 hold incoming carries c0--c2 and cg; and v31 is zero.  On
 485         // exit, the accumulator and carries are updated; x0 and x2 are each
 486         // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
 487         // x16, x17 and the other SIMD registers are clobbered.
 488   endprologue
 489
 490         // Start by loading the operand words from memory.
 491         ldr     q24, [x0]
 492         ldr     q1, [x2], #16
 493         sprdacc v24, v25, v26, v27
 494         crryacc v24, v25, v26, v27, v28, v29, v30
 495
 496         // Do the multiplication.
 497         mul0    v28, t,      v1,  v4, v5
 498         mul1    v29, t,      v1,  v4, v5
 499          carry0 v28
 500         mul2    v30, t,      v1,  v4, v5
 501          carry1 x11
 502          carry0 v29
 503         mul3    v27, t,      v1,  v4, v5
 504          carry1 x13
 505          carry0 v30
 506         mul4    v28, nil,    v1,  v4, v5
 507          carry1 x12
 508          carry0 v27
 509         mul5    v29, nil,    v1,  v4, v5
 510          carry1 x14
 511         mul6    v30, nil,    v1,  v4, v5
 512
 513         // Finish up and store the result.
 514         bfi     x11, x13, #32, #32
 515         bfi     x12, x14, #32, #32
 516         stp     x11, x12, [x0], #16
 517
 518         // All done.
 519         ret
 520 ENDFUNC
 521
 522 INTFUNC(mla4zc)
 523         // On entry, x0 points to the destination/accumulator A; x2 points to
 524         // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is
 525         // zero.  On exit, the accumulator is updated; v28--v30 and x15 hold
 526         // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16;
 527         // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the
 528         // other SIMD registers are clobbered.
 529   endprologue
 530
 531         // Start by loading the operand words from memory.
 532         ldr     q28, [x0]
 533         ldr     q1, [x2], #16
 534         sprdacc v28, v29, v30, v27
 535
 536         // Do the multiplication.
 537         mul0    v28, t,      v1,  v4, v5
 538         mul1    v29, t,      v1,  v4, v5
 539          carry0 v28, nil
 540         mul2    v30, t,      v1,  v4, v5
 541          carry1 x11
 542          carry0 v29
 543         mul3    v27, t,      v1,  v4, v5
 544          carry1 x13
 545          carry0 v30
 546         mul4    v28, nil,    v1,  v4, v5
 547          carry1 x12
 548          carry0 v27
 549         mul5    v29, nil,    v1,  v4, v5
 550          carry1 x14
 551         mul6    v30, nil,    v1,  v4, v5
 552
 553         // Finish up and store the result.
 554         bfi     x11, x13, #32, #32
 555         bfi     x12, x14, #32, #32
 556         stp     x11, x12, [x0], #16
 557
 558         // All done.
 559         ret
 560 ENDFUNC
 561
 562 INTFUNC(mmul4)
 563         // On entry, x0 points to the destination; x1 points to a packed
 564         // operand U; x2 points to a packed operand X (the modulus); v2/v3
 565         // holds an expanded operand V; and v6/v7 holds an expanded operand M
 566         // (the Montgomery factor -N^{-1} (mod B)).  On exit, the destination
 567         // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod
 568         // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1,
 569         // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved;
 570         // and x11--x14, x16, x17 and the other SIMD registers are clobbered.
 571   endprologue
 572
 573         // Start by loading the operand words from memory.
 574         ldr     q0, [x1], #16
 575         ldr     q1, [x2], #16
 576
 577         // Calculate the low half of W = A + U V, being careful to leave the
 578         // carries in place.
 579         mul0    v28, nil,  v0,  v2, v3
 580         mul1    v29, nil,  v0,  v2, v3
 581          carry0 v28, nil
 582         mul2    v30, nil,  v0,  v2, v3
 583          carry1 x11
 584          carry0 v29
 585         mul3    v27, nil,  v0,  v2, v3
 586         b       mmla4_common
 587 ENDFUNC
 588
 589 INTFUNC(mmla4)
 590         // On entry, x0 points to the destination/accumulator A; x1 points to
 591         // a packed operand U; x2 points to a packed operand X (the modulus);
 592         // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded
 593         // operand M (the Montgomery factor -N^{-1} (mod B)).  On exit, the
 594         // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
 595         // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries
 596         // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6,
 597         // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the
 598         // other SIMD registers are clobbered.
 599   endprologue
 600
 601         // Start by loading the operand words from memory.
 602         ldr     q28, [x0]
 603         ldr     q0, [x1], #16
 604         ldr     q1, [x2], #16
 605         sprdacc v28, v29, v30, v27
 606
 607         // Calculate the low half of W = A + U V, being careful to leave the
 608         // carries in place.
 609         mul0    v28, t,    v0,  v2, v3
 610         mul1    v29, t,    v0,  v2, v3
 611          carry0 v28, nil
 612         mul2    v30, t,    v0,  v2, v3
 613          carry1 x11
 614          carry0 v29
 615         mul3    v27, t,    v0,  v2, v3
 616 mmla4_common:
 617          carry1 x13
 618          carry0 v30
 619          carry1 x12
 620          carry0 v27
 621          carry1 x14, nil
 622
 623         // Piece the result together and ship it back.
 624         bfi     x11, x13, #32, #32
 625         bfi     x12, x14, #32, #32
 626         mov     v16.d[0], x11
 627         mov     v16.d[1], x12
 628
 629         // Calculate the low half of the Montgomery factor Y = W M.
 630         mul0    v18, nil,    v16,  v6, v7
 631         mul1    v19, nil,    v16,  v6, v7
 632          carry0 v18, nil
 633         mul2    v20, nil,    v16,  v6, v7
 634          carry1 x11
 635          carry0 v19
 636         mul3    v21, nil,    v16,  v6, v7
 637          carry1 x13
 638          carry0 v20
 639          carry1 x12
 640          carry0 v21
 641          carry1 x14, nil
 642
 643         // Piece the result together, ship it back, and expand.
 644         bfi     x11, x13, #32, #32
 645         bfi     x12, x14, #32, #32
 646         mov     v4.d[0], x11
 647         mov     v4.d[1], x12
 648         expand  v4, v5
 649
 650         // Build up the product X Y in the carry slots.
 651         mul0    v28, t,                      v1,  v4, v5
 652         mul1    v29, t,                      v1,  v4, v5
 653          carry0 v28, nil
 654         mul2    v30, t,                      v1,  v4, v5
 655          carry1 nil
 656          carry0 v29
 657         mul3    v27, t,                      v1,  v4, v5
 658          carry1 nil
 659          carry0 v30
 660
 661         // And complete the calculation.
 662         mul4    v28, nil,    v0,  v2, v3,    v1,  v4, v5
 663          carry1 nil
 664          carry0 v27
 665         mul5    v29, nil,    v0,  v2, v3,    v1,  v4, v5
 666          carry1 nil
 667         mul6    v30, nil,    v0,  v2, v3,    v1,  v4, v5
 668
 669         // Finish up and store the result.
 670         stp     xzr, xzr, [x0], #16
 671
 672         // All done.
 673         ret
 674 ENDFUNC
 675
 676 INTFUNC(mont4)
 677         // On entry, x0 points to the destination/accumulator A; x2 points to
 678         // a packed operand X (the modulus); and v6/v7 holds an expanded
 679         // operand M (the Montgomery factor -N^{-1} (mod B)).  On exit, the
 680         // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
 681         // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and
 682         // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are
 683         // preserved; and x11--x14, x16, x17 and the other SIMD registers are
 684         // clobbered.
 685   endprologue
 686
 687         // Start by loading the operand words from memory.
 688         ldr     q28, [x0]
 689         ldr     q1, [x2], #16
 690
 691         // Calculate Y = A M (mod B).
 692         mul0    v18, nil,    v28,  v6, v7
 693         mul1    v19, nil,    v28,  v6, v7
 694          carry0 v18, nil
 695         mul2    v20, nil,    v28,  v6, v7
 696          carry1 x11
 697          carry0 v19
 698         mul3    v21, nil,    v28,  v6, v7
 699          carry1 x13
 700          carry0 v20
 701           sprdacc v28, v29, v30, v27
 702          carry1 x12
 703          carry0 v21
 704          carry1 x14, nil
 705
 706         // Piece the result together, ship it back, and expand.
 707         bfi     x11, x13, #32, #32
 708         bfi     x12, x14, #32, #32
 709         mov     v4.d[0], x11
 710         mov     v4.d[1], x12
 711         expand  v4, v5
 712
 713         // Calculate the actual result.  Well, the carries, at least.
 714         mul0    v28, t,      v1,  v4, v5
 715         mul1    v29, t,      v1,  v4, v5
 716          carry0 v28, nil
 717         mul2    v30, t,      v1,  v4, v5
 718          carry1 nil
 719          carry0 v29
 720         mul3    v27, t,      v1,  v4, v5
 721          carry1 nil
 722          carry0 v30
 723
 724         // And complete the calculation.
 725         mul4    v28, nil,    v1,  v4, v5
 726          carry1 nil
 727          carry0 v27
 728         mul5    v29, nil,    v1,  v4, v5
 729          carry1 nil
 730         mul6    v30, nil,    v1,  v4, v5
 731
 732         // Finish up and store the result.
 733         stp     xzr, xzr, [x0], #16
 734
 735         // All done.
 736         ret
 737 ENDFUNC
 738
 739 ///--------------------------------------------------------------------------
 740 /// Bulk multipliers.
 741
 742 FUNC(mpx_umul4_arm64_simd)
 743         // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
 744         //                           const mpw *bv, const mpw *bvl);
 745
 746         // Establish the arguments and do initial setup.
 747         //
 748         // inner loop dv        x0
 749         // inner loop av        x2
 750         // outer loop dv        x5
 751         // outer loop bv        x3
 752         // av base              x1
 753         // inner n              x6
 754         // n base               x7
 755         // outer n              x4
 756         pushreg x29, x30
 757         setfp
 758   endprologue
 759
 760         // Prepare for the first iteration.
 761         ldr     q4, [x3], #16           // Y = bv[0]
 762         movi    v31.4s, #0
 763         sub     x7, x2, x1              // = inner loop count base
 764         // x0                           // = dv for inner loop
 765         // x1                           // = av base
 766         // x3                           // = bv for outer loop
 767         sub     x4, x4, x3              // = outer loop count (decremented)
 768         sub     x6, x7, #16             // = inner loop count (decremented)
 769         mov     x2, x1                  // = av for inner loop
 770         add     x5, x0, #16             // = dv for outer loop
 771         expand  v4, v5                  // expand Y
 772         bl      mul4zc
 773         cbz     x6, 8f                  // all done?
 774
 775         // Continue with the first iteration.
 776 0:      sub     x6, x6, #16
 777         bl      mul4
 778         cbnz    x6, 0b
 779
 780         // Write out the leftover carry.  There can be no tail here.
 781 8:      bl      carryprop
 782         cbz     x4, 9f
 783
 784         // Set up for the next pass.
 785 1:      ldr     q4, [x3], #16           // Y = bv[i]
 786         mov     x0, x5                  // -> dv[i]
 787         mov     x2, x1                  // -> av[0]
 788         add     x5, x5, #16
 789         sub     x6, x7, #16             // = inner loop count (decremented)
 790         sub     x4, x4, #16             // outer loop count
 791         expand  v4, v5                  // expand Y
 792         bl      mla4zc
 793         cbz     x6, 8f
 794
 795         // Continue...
 796 0:      sub     x6, x6, #16
 797         bl      mla4
 798         cbnz    x6, 0b
 799
 800         // Finish off this pass.  There was no tail on the previous pass, and
 801         // there can be done on this pass.
 802 8:      bl      carryprop
 803         cbnz    x4, 1b
 804
 805         // All over.
 806 9:      popreg  x29, x30
 807         ret
 808 ENDFUNC
 809
 810 FUNC(mpxmont_mul4_arm64_simd)
 811         // void mpxmont_mul4_arm64_simd(mpw *dv,
 812         //                              const mpw *av, const mpw *bv,
 813         //                              const mpw *nv, size_t n,
 814         //                              const mpw *mi);
 815
 816         // Establish the arguments and do initial setup.
 817         //
 818         // inner loop dv        x0
 819         // inner loop av        x1
 820         // inner loop nv        x2
 821         // nv base              x3
 822         // base n               x4
 823         // mi                   (x5)
 824         // outer loop dv        x5
 825         // outer loop bv        x6
 826         // av base              x7
 827         // inner n              x8
 828         // outer n              x9
 829         // c                    x10
 830         pushreg x29, x30
 831         setfp
 832   endprologue
 833
 834         // Set up the outer loop state and prepare for the first iteration.
 835         ldr     q2, [x2]                // = V = bv[0]
 836         ldr     q6, [x5]                // = M
 837         movi    v31.4s, #0
 838         // x0                           // -> dv for inner loop
 839         // x1                           // -> av for inner loop
 840         // x3                           // -> nv base
 841         // x4                           // = n base
 842         add     x5, x0, #16             // -> dv
 843         add     x6, x2, #16             // -> bv
 844         mov     x2, x3                  // -> nv[0]
 845         mov     x7, x1                  // -> av base
 846         sub     x8, x4, #4              // = inner n (decremented)
 847         sub     x9, x4, #4              // = outer n (decremented)
 848         expand  v2, v3                  // expand V
 849         expand  v6, v7                  // expand M
 850         bl      mmul4
 851         cbz     x8, 8f                  // done already?
 852
 853         // Complete the first inner loop.
 854 0:      sub     x8, x8, #4
 855         bl      dmul4
 856         cbnz    x8, 0b                  // done yet?
 857
 858         // Still have carries left to propagate.  Rather than store the tail
 859         // end in memory, keep it in x10 for later.
 860         bl      carryprop
 861
 862         // Embark on the next iteration.  (There must be one.  If n = 1 then
 863         // we would have bailed above, to label 8.  Similarly, the subsequent
 864         // iterations can fall into the inner loop immediately.)
 865 1:      ldr     q2, [x6], #16           // = Y = bv[i]
 866         mov     x0, x5                  // -> dv[i]
 867         mov     x1, x7                  // -> av[0]
 868         mov     x2, x3                  // -> nv[0]
 869         add     x5, x5, #16
 870         sub     x8, x4, #4
 871         sub     x9, x9, #4
 872         expand  v2, v3
 873         bl      mmla4
 874
 875         // Complete the next inner loop.
 876 0:      sub     x8, x8, #4
 877         bl      dmla4
 878         cbnz    x8, 0b
 879
 880         // Still have carries left to propagate, and they overlap the
 881         // previous iteration's final tail, so read that and add it.
 882         add     x15, x15, x10, lsl #16
 883         bl      carryprop
 884
 885         // Back again, maybe.
 886         cbnz    x9, 1b
 887
 888         // All done, almost.
 889         str     w10, [x0], #4
 890         popreg  x29, x30
 891         ret
 892
 893         // First iteration was short.  Write out the carries and we're done.
 894         // (This could be folded into the main loop structure, but that would
 895         // penalize small numbers more.)
 896 8:      bl      carryprop
 897         str     w10, [x0], #4
 898         popreg  x29, x30
 899         ret
 900 ENDFUNC
 901
 902 FUNC(mpxmont_redc4_arm64_simd)
 903         // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
 904         //                               size_t n, const mpw *mi);
 905
 906         // Establish the arguments and do initial setup.
 907         //
 908         // inner loop dv        x0
 909         // inner loop nv        x2
 910         // blocks-of-4 count    x6
 911         // tail count           x7
 912         // mi                   (x4)
 913         // outer loop dv        x4
 914         // outer loop count     x8
 915         // nv base              x5
 916         // inner loop count     x1
 917         // n                    x3
 918         // c                    x10
 919         // t0, t1               x11, x12
 920
 921         pushreg x29, x30
 922         setfp
 923   endprologue
 924
 925         // Set up the outer loop state and prepare for the first iteration.
 926         ldr     q6, [x4]                // = M
 927         movi    v31.4s, #0
 928         // x0                           // -> dv for inner loop
 929          sub    x6, x1, x0              // total dv bytes
 930         sub     x1, x3, #4              // inner loop counter
 931         // x2                           // -> nv for inner loop
 932         // x3                           // = n
 933         add     x4, x0, #16             // -> dv for outer loop
 934         mov     x5, x2                  // -> nv base
 935          sub    x6, x6, x3, lsl #2      // dv carry range bytes
 936         sub     x8, x3, #4              // outer loop counter
 937          sub    x6, x6, #16             // dv steam-powered carry bytes
 938         expand  v6, v7                  // expand M
 939         and     x7, x6, #15             // dv tail length in bytes
 940         bic     x6, x6, #15             // dv blocks-of-four length in bytes
 941
 942         bl      mont4
 943         cbz     x1, 8f                  // done already?
 944
 945 5:      sub     x1, x1, #4
 946         bl      mla4
 947         cbnz    x1, 5b                  // done yet?
 948
 949         // Still have carries left to propagate.  Adding the accumulator
 950         // block into the carries is a little different this time, because
 951         // all four accumulator limbs have to be squished into the three
 952         // carry registers for `carryprop' to do its thing.
 953 8:      ldr     q24, [x0]
 954         sprdacc v24, v25, v26
 955         add     v28.2d, v28.2d, v24.2d
 956         add     v29.2d, v29.2d, v25.2d
 957         add     v30.2d, v30.2d, v26.2d
 958         bl      carryprop
 959         cbz     x6, 7f
 960
 961         // Propagate the first group of carries.
 962         ldp     x16, x17, [x0]
 963         sub     x1, x6, #16
 964         adds    x16, x16, x10
 965         adcs    x17, x17, xzr
 966         stp     x16, x17, [x0], #16
 967         cbz     x1, 6f
 968
 969         // Continue carry propagation until the end of the buffer.
 970 0:      ldp     x16, x17, [x0]
 971         sub     x1, x1, #16
 972         adcs    x16, x16, xzr
 973         adcs    x17, x17, xzr
 974         stp     x16, x17, [x0], #16
 975         cbnz    x1, 0b
 976
 977         // Deal with the tail end.  Note that the actual destination length
 978         // won't be an exacty number of blocks of four, so it's safe to just
 979         // drop through here.
 980 6:      adc     w10, wzr, wzr
 981 7:      ldr     w16, [x0]
 982         sub     x1, x7, #4
 983         adds    w16, w16, w10
 984         str     w16, [x0], #4
 985         cbz     x1, 8f
 986 0:      ldr     w16, [x0]
 987         sub     x1, x1, #4
 988         adcs    w16, w16, wzr
 989         str     w16, [x0], #4
 990         cbnz    x1, 0b
 991
 992         // All done for this iteration.  Start the next.
 993 8:      cbz     x8, 9f
 994         mov     x0, x4
 995         add     x4, x4, #16
 996         sub     x1, x3, #4
 997         mov     x2, x5
 998         sub     x8, x8, #4
 999         sub     x6, x6, #16
1000         bl      mont4
1001         b       5b
1002
1003         // All over.
1004 9:      popreg  x29, x30
1005         ret
1006 ENDFUNC
1007
1008 ///--------------------------------------------------------------------------
1009 /// Testing and performance measurement.
1010
1011 #ifdef TEST_MUL4
1012
1013 //                dmul  smul  mmul  mont
1014 // z    x0        x0    x0    x0     x0
1015 // c    x3        x1    x1    x1     x1
1016 // y    x4        --    --    x2     x2
1017 // u    x1        x2    --    x3     --
1018 // x    x2        x3    x2    x4     x3
1019 // vv   v2/v3     x4    --    x5     --
1020 // yy   v4/v5     x5    x3    x6     --
1021 // mm   v6/v7     --    --    --     x4
1022 // n    x5        x6    x4    x7     x5
1023 // cyv  x6        x7    x5    stk0   x6
1024
1025 #define STKARG(i) sp, #16 + i
1026
1027 .macro  testprologue mode
1028         pushreg x29, x30
1029         setfp
1030   endprologue
1031         movi    v31.4s, #0
1032
1033   .ifeqs "\mode", "dmul"
1034         ldr     q2, [x4]
1035         zip2    v3.8h, v2.8h, v31.8h    // (v'_2, v''_2; v'_3, v''_3)
1036         zip1    v2.8h, v2.8h, v31.8h    // (v'_0, v''_0; v'_1, v''_1)
1037
1038         ldr     q4, [x5]
1039         zip2    v5.8h, v4.8h, v31.8h    // (y'_2, y''_2; y'_3, y''_3)
1040         zip1    v4.8h, v4.8h, v31.8h    // (y'_0, y''_0; y'_1, y''_1)
1041
1042         mov     x16, x1
1043         mov     x1, x2                  // -> u
1044         mov     x2, x3                  // -> x
1045         mov     x3, x16                 // -> c
1046
1047         mov     x5, x6                  // = n
1048         mov     x6, x7                  // -> cyv
1049   .endif
1050
1051   .ifeqs "\mode", "smul"
1052         ldr     q4, [x3]
1053         zip2    v5.8h, v4.8h, v31.8h    // (y'_2, y''_2; y'_3, y''_3)
1054         zip1    v4.8h, v4.8h, v31.8h    // (y'_0, y''_0; y'_1, y''_1)
1055
1056         // x2                           // -> x
1057         mov     x3, x1                  // -> c
1058         mov     x6, x5                  // -> cyv
1059         mov     x5, x4                  // = n
1060   .endif
1061
1062   .ifeqs "\mode", "mmul"
1063         ldr     q2, [x5]
1064         zip2    v3.8h, v2.8h, v31.8h    // (v'_2, v''_2; v'_3, v''_3)
1065         zip1    v2.8h, v2.8h, v31.8h    // (v'_0, v''_0; v'_1, v''_1)
1066
1067         ldr     q6, [x6]
1068         zip2    v7.8h, v6.8h, v31.8h    // (y'_2, y''_2; y'_3, y''_3)
1069         zip1    v6.8h, v6.8h, v31.8h    // (y'_0, y''_0; y'_1, y''_1)
1070
1071         mov     x16, x1
1072         mov     x1, x3                  // -> u
1073         mov     x3, x16                 // -> c
1074
1075         mov     x16, x2
1076         mov     x2, x4                  // -> x
1077         mov     x4, x16                 // -> y
1078
1079         mov     x5, x7                  // = n
1080         ldr     x6, [STKARG(0)]         // -> cyv
1081   .endif
1082
1083   .ifeqs "\mode", "mont"
1084         ldr     q6, [x4]
1085         zip2    v7.8h, v6.8h, v31.8h    // (m'_2, m''_2; m'_3, m''_3)
1086         zip1    v6.8h, v6.8h, v31.8h    // (m'_0, m''_0; m'_1, m''_1)
1087
1088         mov     x4, x2                  // -> y
1089         mov     x2, x3                  // -> x
1090         mov     x3, x1                  // -> c
1091
1092         // x5                           // = n
1093         // x6                           // -> cyv
1094   .endif
1095 .endm
1096
1097 .macro  testldcarry
1098         ld1     {v28.2d-v30.2d}, [x3]
1099         mov     x15, #0
1100 .endm
1101
1102 .macro  testtop
1103 0:      sub     x5, x5, #1
1104 .endm
1105
1106 .macro  testtail
1107         cbnz    x5, 0b
1108 .endm
1109
1110 .macro  testcarryout
1111         // More complicated than usual because we must mix the general-
1112         // purpose carry back in.
1113         lsr     x15, x15, #16
1114         mov     v0.d[0], x15
1115         mov     v0.d[1], xzr
1116         add     v28.2d, v28.2d, v0.2d
1117         st1     {v28.2d-v30.2d}, [x3]
1118 .endm
1119
1120 .macro  testepilogue
1121         popreg  x29, x30
1122         ret
1123 .endm
1124
1125 FUNC(test_dmul4)
1126         testprologue dmul
1127         testldcarry
1128         testtop
1129         bl      dmul4
1130         testtail
1131         testcarryout
1132         testepilogue
1133 ENDFUNC
1134
1135 FUNC(test_dmla4)
1136         testprologue dmul
1137         testldcarry
1138         testtop
1139         bl      dmla4
1140         testtail
1141         testcarryout
1142         testepilogue
1143 ENDFUNC
1144
1145 FUNC(test_mul4)
1146         testprologue smul
1147         testldcarry
1148         testtop
1149         bl      mul4
1150         testtail
1151         testcarryout
1152         testepilogue
1153 ENDFUNC
1154
1155 FUNC(test_mul4zc)
1156         testprologue smul
1157         testldcarry
1158         testtop
1159         bl      mul4zc
1160         testtail
1161         testcarryout
1162         testepilogue
1163 ENDFUNC
1164
1165 FUNC(test_mla4)
1166         testprologue smul
1167         testldcarry
1168         testtop
1169         bl      mla4
1170         testtail
1171         testcarryout
1172         testepilogue
1173 ENDFUNC
1174
1175 FUNC(test_mla4zc)
1176         testprologue smul
1177         testldcarry
1178         testtop
1179         bl      mla4zc
1180         testtail
1181         testcarryout
1182         testepilogue
1183 ENDFUNC
1184
1185 FUNC(test_mmul4)
1186         testprologue mmul
1187         testtop
1188         bl      mmul4
1189         testtail
1190         stp     q4, q5, [x4]
1191         testcarryout
1192         testepilogue
1193 ENDFUNC
1194
1195 FUNC(test_mmla4)
1196         testprologue mmul
1197         testtop
1198         bl      mmla4
1199         testtail
1200         stp     q4, q5, [x4]
1201         testcarryout
1202         testepilogue
1203 ENDFUNC
1204
1205 FUNC(test_mont4)
1206         testprologue mont
1207         testtop
1208         bl      mont4
1209         testtail
1210         stp     q4, q5, [x4]
1211         testcarryout
1212         testepilogue
1213 ENDFUNC
1214
1215 #endif
1216
1217 ///----- That's all, folks --------------------------------------------------