chiark - git - mdw - catacomb/blob - symm/gcm-x86ish-pclmul.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// GCM acceleration for x86 processors
   4 ///
   5 /// (c) 2018 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   .pclmul
  34
  35         .text
  36
  37 ///--------------------------------------------------------------------------
  38 /// Common register allocation.
  39
  40 #if CPUFAM_X86
     /// 32-bit x86: A is eax and K is edx.
  41 #  define A eax
  42 #  define K edx
  43 #elif CPUFAM_AMD64 && ABI_SYSV
     /// AMD64, System V ABI: the first two integer-argument registers.
  44 #  define A rdi
  45 #  define K rsi
  46 #elif CPUFAM_AMD64 && ABI_WIN
     /// AMD64, Windows ABI: the first two integer-argument registers.
  47 #  define A rcx
  48 #  define K rdx
  49 #endif
     /// In any other configuration A and K are left undefined.
     /// NOTE(review): nothing in this section uses A or K, so their roles
     /// cannot be confirmed here -- presumably operand pointers; verify
     /// against the function bodies that use them.
  50
  51 ///--------------------------------------------------------------------------
  52 /// Multiplication macros.
  53
  54         // The good news is that we have a fancy instruction to do the
  55         // multiplications.  The bad news is that it's not particularly well-
  56         // suited to the job.
  57         //
  58         // For one thing, it only does a 64-bit multiplication, so in general
  59         // we'll need to synthesize the full-width multiply by hand.  For
  60         // another thing, it doesn't help with the reduction, so we have to
  61         // do that by hand too.  And, finally, GCM has crazy bit ordering,
  62         // and the instruction does nothing useful for that at all.
  63         //
  64         // Focusing on that last problem first: the bits aren't in monotonic
  65         // significance order unless we permute them.  If we reverse the byte
  66         // order, then we'll have the bits in monotonic order, but backwards,
  67         // so the degree-0 coefficient will be in the most-significant bit.
  68         //
  69         // This is less of a difficulty than it seems at first, because
  70         // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
  71         // SUM_{0<=j<n} v_j t^j; then
  72         //
  73         //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
  74         //
  75         // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
  76         // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
  77         // Then
  78         //
  79         //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
  80         //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
  81         //
  82         // which is almost the bit-reversal of u v, only it's shifted right
  83         // by one place.  Oh, well: we'll have to shift it back later.
  84         //
  85         // That was important to think about, but there's not a great deal to
  86         // do about it yet other than to convert what we've got from the
  87         // blockcipher's byte-ordering convention to our big-endian
  88         // convention.  Since this depends on the blockcipher convention,
  89         // we'll leave the caller to cope with this: the macros here will
  90         // assume that the operands are in `register' format, which is the
  91         // byte-reversal of the external representation, padded at the
  92         // most-significant end except for 96-bit blocks, which are
  93         // zero-padded at the least-significant end (see `mul96' for the
  94         // details).  In the commentary, pieces of polynomial are numbered
  95         // according to the degree of the coefficients, so the unit
  96         // coefficient of some polynomial a is in a_0.
  97         //
  98         // The commentary for `mul128' is the most detailed.  The other
  99         // macros assume that you've already read and understood that.
 100
 101 .macro  mul128
 102         // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
 103         // u v mod p(t) in xmm0 (p is given below).  Clobbers xmm1--xmm4.
 104
 105         // First for the double-precision multiplication.  It's tempting to
 106         // use Karatsuba's identity here, but I suspect that loses more in
 107         // the shifting, bit-twiddling, and dependency chains than it gains
 108         // in saving a multiplication which otherwise pipelines well.
 109         // xmm0 =                       // (u_1; u_0)
 110         // xmm1 =                       // (v_1; v_0)
 111         movdqa  xmm2, xmm1              // (v_1; v_0) again
 112         movdqa  xmm3, xmm0              // (u_1; u_0) again
 113         movdqa  xmm4, xmm0              // (u_1; u_0) yet again
 114         pclmulhqlqdq xmm2, xmm0         // u_1 v_0
 115         pclmullqlqdq xmm0, xmm1         // u_1 v_1
 116         pclmulhqlqdq xmm3, xmm1         // u_0 v_1
 117         pclmulhqhqdq xmm4, xmm1         // u_0 v_0
 118
 119         // Arrange the pieces to form a double-precision polynomial.
 120         pxor    xmm2, xmm3              // (m_1; m_0) = u_1 v_0 + u_0 v_1
 121         movdqa  xmm1, xmm2              // (m_1; m_0) again
 122         pslldq  xmm2, 8                 // (0; m_1)
 123         psrldq  xmm1, 8                 // (m_0; 0)
 124         pxor    xmm0, xmm2              // x_1 = u_1 v_1 + m_1
 125         pxor    xmm1, xmm4              // x_0 = u_0 v_0 + t^64 m_0
 126
 127         // Two problems remain.  The first is that this product is shifted
 128         // left (from GCM's backwards perspective) by one place, which is
 129         // annoying.  Let's take care of that now.  Once this is done, we'll
 130         // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
 131         // low half of the product and xmm0 the high half.  (The following
 132         // diagrams show bit 0 consistently on the right.)
 133         //
 134         //                               xmm1
 135         //    ,-------------.-------------.-------------.-------------.
 136         //    | 0  x_0-x_30 |  x_31-x_62  |  x_63-x_94  |  x_95-x_126 |
 137         //    `-------------^-------------^-------------^-------------'
 138         //
 139         //                               xmm0
 140         //    ,-------------.-------------.-------------.-------------.
 141         //    | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
 142         //    `-------------^-------------^-------------^-------------'
 143         //
 144         // We start by shifting each 32-bit lane right (from GCM's point of
 145         // view -- physically, left) by one place, which gives us this:
 146         //
 147         //                           low (xmm3)
 148         //    ,-------------.-------------.-------------.-------------.
 149         //    | x_0-x_30  0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
 150         //    `-------------^-------------^-------------^-------------'
 151         //
 152         //                           high (xmm2)
 153         //    ,-------------.-------------.-------------.-------------.
 154         //    |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
 155         //    `-------------^-------------^-------------^-------------'
 156         //
 157         // but we've lost a bunch of bits.  We separately shift each lane
 158         // left by 31 places to give us the bits we lost.
 159         //
 160         //                           low (xmm1)
 161         //    ,-------------.-------------.-------------.-------------.
 162         //    |    0...0    | 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |
 163         //    `-------------^-------------^-------------^-------------'
 164         //
 165         //                           high (xmm0)
 166         //    ,-------------.-------------.-------------.-------------.
 167         //    | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
 168         //    `-------------^-------------^-------------^-------------'
 169         //
 170         // Which is close, but we don't get a cigar yet.  To get the missing
 171         // bits into position, we shift each of these right by a lane, but,
 172         // alas, the x_127 falls off, so, separately, we shift the high
 173         // register left by three lanes, so that everything is lined up
 174         // properly when we OR them all together:
 175         //
 176         //                           low (xmm1)
 177         //    ,-------------.-------------.-------------.-------------.
 178         //    | 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |    0...0    |
 179         //    `-------------^-------------^-------------^-------------'
 180         //
 181         //                           wrap (xmm4)
 182         //    ,-------------.-------------.-------------.-------------.
 183         //    |    0...0    |    0...0    |    0...0    | 0...0 x_127 |
 184         //    `-------------^-------------^-------------^-------------'
 185         //
 186         //                           high (xmm0)
 187         //    ,-------------.-------------.-------------.-------------.
 188         //    | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |    0...0    |
 189         //    `-------------^-------------^-------------^-------------'
 190         //
 191         // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
 192         // low 128 coefficients, while the `high' registers (xmm0, xmm2)
 193         // collect the high 127 coefficients, leaving a zero bit at the most
 194         // significant end as we expect.
 195
 196         // xmm0 =                       // (x_7, x_6; x_5, x_4)
 197         // xmm1 =                       // (x_3, x_2; x_1, x_0)
 198         movdqa  xmm3, xmm1              // (x_3, x_2; x_1, x_0) again
 199         movdqa  xmm2, xmm0              // (x_7, x_6; x_5, x_4) again
 200         psrld   xmm1, 31                // shifted left; just the carries
 201         psrld   xmm0, 31                // likewise for the high half
 202         pslld   xmm3, 1                 // shifted right, but dropped carries
 203         pslld   xmm2, 1                 // likewise for the high half
 204         movdqa  xmm4, xmm0              // another copy for the carry around
 205         pslldq  xmm1, 4                 // move carries over
 206         pslldq  xmm0, 4                 // likewise for the high half
 207         psrldq  xmm4, 12                // the big carry wraps around
 208         por     xmm1, xmm3              // low half: shifted lanes + carries
 209         por     xmm0, xmm2              // (y_7, y_6; y_5, y_4)
 210         por     xmm1, xmm4              // (y_3, y_2; y_1, y_0)
 211
 212         // And the other problem is that the result needs to be reduced
 213         // modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
 214         // t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
 215         // such that z_0 + z_1 R = u v using the identity R = t^128: now we
 216         // must collapse the two halves of z together using the other
 217         // identity R = t^7 + t^2 + t + 1.
 218         //
 219         // We do this by working on each 32-bit word of the high half of z
 220         // separately, so consider y_i, for some 4 <= i < 8.  Certainly, y_i
 221         // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
 222         // but we can't use that directly without breaking up the 32-bit word
 223         // structure.  Instead, we start by considering just y_i t^7
 224         // t^{32(i-4)}, which again looks tricky.  Now, split y_i = a_i +
 225         // t^25 b_i, with deg a_i < 25; then
 226         //
 227         //      y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
 228         //
 229         // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
 230         // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
 231         // splits are different.  This is lovely, with one small snag: when
 232         // we do this to y_7, we end up with a contribution back into the
 233         // t^128 coefficient word.  But notice that only the low seven bits
 234         // of this word are affected, so there's no knock-on contribution
 235         // into the t^32 word.  Therefore, if we handle the high bits of each
 236         // word together, and then the low bits, everything will be fine.
 237
 238         // First, shift the high bits down.
 239         movdqa  xmm2, xmm0              // (y_7, y_6; y_5, y_4) again
 240         movdqa  xmm3, xmm0              // (y_7, y_6; y_5, y_4) yet again
 241         movdqa  xmm4, xmm0              // (y_7, y_6; y_5, y_4) again again
 242         pslld   xmm2, 31                // the b_i for t
 243         pslld   xmm3, 30                // the b_i for t^2
 244         pslld   xmm4, 25                // the b_i for t^7
 245         pxor    xmm2, xmm3              // add them all together
 246         pxor    xmm2, xmm4
 247         movdqa  xmm3, xmm2              // and a copy for later
 248         psrldq  xmm2, 4                 // contribution into low half
 249         pslldq  xmm3, 12                // and high half
 250         pxor    xmm1, xmm2              // fold into the low half
 251         pxor    xmm0, xmm3              // fold into the high half
 252
 253         // And then shift the low bits up.
 254         movdqa  xmm2, xmm0              // copy of the high half for t^2
 255         movdqa  xmm3, xmm0              // copy of the high half for t^7
 256         pxor    xmm1, xmm0              // mix in the unit contribution
 257         psrld   xmm0, 1                 // the t contribution
 258         psrld   xmm2, 2                 // the t^2 contribution
 259         psrld   xmm3, 7                 // the t^7 contribution
 260         pxor    xmm1, xmm2              // low half, unit, and t^2 contribs
 261         pxor    xmm0, xmm3              // t and t^7 contribs
 262         pxor    xmm0, xmm1              // mix them together and we're done
 263 .endm
 264
 265 .macro  mul64
 266         // Enter with u and v in the low halves of xmm0 and xmm1
 267         // respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.
 268
 269         // The multiplication is thankfully easy.
 270         pclmullqlqdq xmm0, xmm1         // u v
 271
 272         // Shift the product up by one place.  After this, we're in GCM
 273         // bizarro-world.
 274         movdqa  xmm1, xmm0              // u v again
 275         psrld   xmm0, 31                // shifted left; just the carries
 276         pslld   xmm1, 1                 // shifted right, but dropped carries
 277         pslldq  xmm0, 4                 // move carries over
 278         por     xmm1, xmm0              // (y_3, y_2; y_1, y_0)
 279
 280         // Now we must reduce.  This is essentially the same as the 128-bit
 281         // case above, but mostly simpler because everything is smaller.  The
 282         // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
 283
 284         // First, we must detach the top (`low'!) half of the result.
 285         movdqa  xmm0, xmm1              // (y_3, y_2; y_1, y_0) again
 286         psrldq  xmm1, 8                 // (y_1, y_0; 0, 0)
 287
 288         // Next, shift the high bits down.
 289         movdqa  xmm2, xmm0              // (y_3, y_2; ?, ?) again
 290         movdqa  xmm3, xmm0              // (y_3, y_2; ?, ?) yet again
 291         movdqa  xmm4, xmm0              // (y_3, y_2; ?, ?) again again
 292         pslld   xmm2, 31                // b_i for t
 293         pslld   xmm3, 29                // b_i for t^3
 294         pslld   xmm4, 28                // b_i for t^4
 295         pxor    xmm2, xmm3              // add them all together
 296         pxor    xmm2, xmm4
 297         movdqa  xmm3, xmm2              // and a copy for later
 298         movq    xmm2, xmm2              // zap high half
 299         pslldq  xmm3, 4                 // contribution into high half
 300         psrldq  xmm2, 4                 // and low half
 301         pxor    xmm0, xmm3              // fold into the high half
 302         pxor    xmm1, xmm2              // fold into the low half
 303
 304         // And then shift the low bits up.
 305         movdqa  xmm2, xmm0              // copy of the high half for t^3
 306         movdqa  xmm3, xmm0              // copy of the high half for t^4
 307         pxor    xmm1, xmm0              // mix in the unit contribution
 308         psrld   xmm0, 1                 // the t contribution
 309         psrld   xmm2, 3                 // the t^3 contribution
 310         psrld   xmm3, 4                 // the t^4 contribution
 311         pxor    xmm1, xmm2              // low half, unit, and t^3 contribs
 312         pxor    xmm0, xmm3              // t and t^4 contribs
 313         pxor    xmm0, xmm1              // mix them together and we're done
 314 .endm
 315
 316 .macro  mul96
 317         // Enter with u and v in the /high/ three words of xmm0 and xmm1
 318         // respectively (and zero in the low word); leave with z = u v in the
 319         // high three words of xmm0, and /junk/ in the low word.  Clobbers
 320         // xmm1--xmm4.
 321
 322         // This is an inconvenient size.  There's nothing for it but to do
 323         // four multiplications, as if for the 128-bit case.  The operands
 324         // arrive already shifted up by four bytes, with a zero low word
 325         // (see the entry conditions above; this macro does no shifting of
 326         // its own), which means that the high 64 bits of the result (from
 327         // GCM's viewpoint) will be zero.
 328         // xmm0 =                       // (0, u_2; u_1, u_0)
 329         // xmm1 =                       // (0, v_2; v_1, v_0)
 330         movdqa  xmm2, xmm1              // (0, v_2; v_1, v_0) again
 331         movdqa  xmm3, xmm0              // (0, u_2; u_1, u_0) again
 332         movdqa  xmm4, xmm0              // (0, u_2; u_1, u_0) yet again
 333         pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
 334         pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
 335         pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
 336         pclmulhqhqdq xmm4, xmm1         // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
 337                                         //   + u_1 v_1 t^64 = f
 338
 339         // Extract the high and low halves of the 192-bit result.  We don't
 340         // need be too picky about the unused high words of the result
 341         // registers.  The answer we want is d t^128 + e t^64 + f, where e =
 342         // e_0 + e_1.
 343         //
 344         // The place values for the two halves are (t^160, t^128; t^96, ?)
 345         // and (?, t^64; t^32, 1).
 346         psrldq  xmm0, 8                 // (d; 0) = d t^128
 347         pxor    xmm2, xmm3              // e = (e_0 + e_1)
 348         movdqa  xmm1, xmm4              // f again
 349         pxor    xmm0, xmm2              // d t^128 + e t^64
 350         psrldq  xmm2, 12                // e[31..0] t^64
 351         psrldq  xmm1, 4                 // f[95..0]
 352         pslldq  xmm4, 8                 // f[127..96]
 353         pxor    xmm1, xmm2              // low 96 bits of result
 354         pxor    xmm0, xmm4              // high 96 bits of result
 355
 356         // Next, shift everything one bit to the left to compensate for GCM's
 357         // strange ordering.  This will be easier if we shift up the high
 358         // half by a word before we start.  After this we're in GCM bizarro-
 359         // world.
 360         movdqa  xmm3, xmm1              // low half again
 361         pslldq  xmm0, 4                 // shift high half
 362         psrld   xmm1, 31                // shift low half down: just carries
 363         movdqa  xmm2, xmm0              // copy high half
 364         pslld   xmm3, 1                 // shift low half down: drop carries
 365         psrld   xmm0, 31                // shift high half up: just carries
 366         pslld   xmm2, 1                 // shift high half down: drop carries
 367         movdqa  xmm4, xmm0              // copy high carries for carry-around
 368         pslldq  xmm0, 4                 // shift carries down
 369         pslldq  xmm1, 4                 // likewise for the low half
 370         psrldq  xmm4, 12                // the big carry wraps around
 371         por     xmm1, xmm3              // low half: shifted lanes + carries
 372         por     xmm0, xmm2              // high half likewise
 373         por     xmm1, xmm4              // plus the wrapped-around carry
 374
 375         // Finally, the reduction.  This is essentially the same as the
 376         // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
 377         // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
 378         // trouble for the general approach.
 379
 380         // First, shift the high bits down.
 381         movdqa  xmm2, xmm0              // copies of the high part
 382         movdqa  xmm3, xmm0
 383         movdqa  xmm4, xmm0
 384         pslld   xmm2, 26                // b_i for t^6
 385         pslld   xmm3, 23                // b_i for t^9
 386         pslld   xmm4, 22                // b_i for t^10
 387         pxor    xmm2, xmm3              // add them all together
 388         pslldq  xmm1, 4                 // shift low part up to match
 389         pxor    xmm2, xmm4
 390         movdqa  xmm3, xmm2              // and a copy for later
 391         pslldq  xmm2, 8                 // contribution to high half
 392         psrldq  xmm3, 4                 // contribution to low half
 393         pxor    xmm1, xmm3              // fold into the low half
 394         pxor    xmm0, xmm2              // fold into the high half
 395
 396         // And then shift the low bits up.
 397         movdqa  xmm2, xmm0              // copies of the high part
 398         movdqa  xmm3, xmm0
 399         pxor    xmm1, xmm0              // mix in the unit contribution
 400         psrld   xmm0, 6                 // the t^6 contribution
 401         psrld   xmm2, 9                 // the t^9 contribution
 402         psrld   xmm3, 10                // the t^10 contribution
 403         pxor    xmm1, xmm2              // low half, unit, and t^9 contribs
 404         pxor    xmm0, xmm3              // t^6 and t^10 contribs
 405         pxor    xmm0, xmm1              // mix them together and we're done
 406 .endm
 407
 408 .macro  mul192
 409         // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
 410         // with z = u v in xmm0/xmm1 -- the top halves of the high registers
 411         // are unimportant.  Clobbers xmm2--xmm7.
 412
 413         // Start multiplying and accumulating pieces of product.
 414         // xmm0 =                       // (u_2; u_1)
 415         // xmm1 =                       // (u_0; ?)
 416         // xmm2 =                       // (v_2; v_1)
 417         // xmm3 =                       // (v_0; ?)
 418         movdqa  xmm4, xmm0              // (u_2; u_1) again
 419         movdqa  xmm5, xmm0              // (u_2; u_1) yet again
 420         movdqa  xmm6, xmm0              // (u_2; u_1) again again
 421         movdqa  xmm7, xmm1              // (u_0; ?) again
 422         punpcklqdq xmm1, xmm3           // (u_0; v_0)
 423         pclmulhqhqdq xmm4, xmm2         // u_1 v_1
 424         pclmullqlqdq xmm3, xmm0         // u_2 v_0
 425         pclmullqhqdq xmm5, xmm2         // u_2 v_1
 426         pclmulhqlqdq xmm6, xmm2         // u_1 v_2
 427         pxor    xmm4, xmm3              // u_2 v_0 + u_1 v_1
 428         pclmullqlqdq xmm7, xmm2         // u_0 v_2
 429         pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
 430         movdqa  xmm6, xmm0              // (u_2; u_1) like a bad penny
 431         pxor    xmm4, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
 432         pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
 433         pclmulhqhqdq xmm6, xmm1         // u_1 v_0
 434         pclmulhqlqdq xmm2, xmm1         // u_0 v_1
 435         pclmullqhqdq xmm1, xmm1         // e = u_0 v_0
 436         pxor    xmm2, xmm6              // d = u_1 v_0 + u_0 v_1
 437
 438         // Next, the piecing together of the product.
 439         // xmm0 =                       // (a_1; a_0) = a = u_2 v_2
 440         // xmm5 =                       // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
 441         // xmm4 =                       // (c_1; c_0) = c = u_0 v_2 +
 442                                         //      u_1 v_1 + u_2 v_0
 443         // xmm2 =                       // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
 444         // xmm1 =                       // (e_1; e_0) = e = u_0 v_0
 445         // xmm3, xmm6, xmm7 spare
 446         movdqa  xmm3, xmm2              // (d_1; d_0) again
 447         movdqa  xmm6, xmm5              // (b_1; b_0) again
 448         pslldq  xmm2, 8                 // (0; d_1)
 449         psrldq  xmm5, 8                 // (b_0; 0)
 450         psrldq  xmm3, 8                 // (d_0; 0)
 451         pslldq  xmm6, 8                 // (0; b_1)
 452         pxor    xmm5, xmm2              // (b_0; d_1)
 453         pxor    xmm0, xmm6              // x_2 = (a_1; a_0 + b_1)
 454         pxor    xmm3, xmm1              // x_0 = (e_1 + d_0; e_0)
 455         pxor    xmm4, xmm5              // x_1 = (b_0 + c_1; c_0 + d_1)
 456
 457         // Now, shift it right (from GCM's point of view) by one bit, and try
 458         // to leave the result in less random registers.  After this, we'll
 459         // be in GCM bizarro-world.
 460         // xmm1, xmm2, xmm5, xmm6, xmm7 spare
 461         movdqa  xmm5, xmm0              // copy x_2
 462         movdqa  xmm1, xmm4              // copy x_1
 463         movdqa  xmm2, xmm3              // copy x_0
 464         psrld   xmm0, 31                // x_2 carries
 465         psrld   xmm4, 31                // x_1 carries
 466         psrld   xmm3, 31                // x_0 carries
 467         pslld   xmm5, 1                 // x_2 shifted
 468         pslld   xmm1, 1                 // x_1 shifted
 469         pslld   xmm2, 1                 // x_0 shifted
 470         movdqa  xmm6, xmm0              // x_2 carry copy
 471         movdqa  xmm7, xmm4              // x_1 carry copy
 472         pslldq  xmm0, 4                 // x_2 carry shifted
 473         pslldq  xmm4, 4                 // x_1 carry shifted
 474         pslldq  xmm3, 4                 // x_0 carry shifted
 475         psrldq  xmm6, 12                // x_2 carry out
 476         psrldq  xmm7, 12                // x_1 carry out
 477         por     xmm0, xmm5              // (y_5; y_4)
 478         por     xmm1, xmm4              // x_1 shifted + its own carries
 479         por     xmm2, xmm3              // x_0 shifted + its own carries
 480         por     xmm1, xmm6              // (y_3; y_2)
 481         por     xmm2, xmm7              // (y_1; y_0)
 482
 483         // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
 484         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 485         // 128-bit case.  I don't know why.
 486
 487         // First, shift the high bits down.
 488         // xmm0 =                       // (y_5; y_4)
 489         // xmm1 =                       // (y_3; y_2)
 490         // xmm2 =                       // (y_1; y_0)
 491         // xmm3--xmm7 spare
 492         movdqa  xmm3, xmm0              // (y_5; y_4) copy
 493         movdqa  xmm4, xmm0              // (y_5; y_4) copy
 494         movdqa  xmm5, xmm0              // (y_5; y_4) copy
 495         pslld   xmm3, 31                // (y_5; y_4) b_i for t
 496         pslld   xmm4, 30                // (y_5; y_4) b_i for t^2
 497         pslld   xmm5, 25                // (y_5; y_4) b_i for t^7
 498          movq   xmm6, xmm1              // (y_3; 0) copy
 499         pxor    xmm3, xmm4
 500          movq   xmm7, xmm1              // (y_3; 0) copy
 501         pxor    xmm3, xmm5
 502          movq   xmm5, xmm1              // (y_3; 0) copy
 503         movdqa  xmm4, xmm3              // (y_5; y_4) b_i combined
 504          pslld  xmm6, 31                // (y_3; 0) b_i for t
 505          pslld  xmm7, 30                // (y_3; 0) b_i for t^2
 506          pslld  xmm5, 25                // (y_3; 0) b_i for t^7
 507         psrldq  xmm3, 12                // (y_5; y_4) low contrib
 508         pslldq  xmm4, 4                 // (y_5; y_4) high contrib
 509          pxor   xmm6, xmm7
 510         pxor    xmm2, xmm3              // fold into (y_1; y_0)
 511          pxor   xmm6, xmm5              // (y_3; 0) b_i combined
 512         pxor    xmm1, xmm4              // fold into (y_3; y_2)
 513          pslldq xmm6, 4                 // (y_3; 0) contrib, up one word
 514          pxor   xmm2, xmm6              // fold into (y_1; y_0)
 515
 516         // And finally shift the low bits up.  Unfortunately, we also have to
 517         // split the low bits out.
 518         // xmm0 =                       // (y'_5; y'_4)
 519         // xmm1 =                       // (y'_3; y'_2)
 520         // xmm2 =                       // (y'_1; y'_0)
 521          movdqa xmm5, xmm1              // copies of (y'_3; y'_2)
 522          movdqa xmm6, xmm1
 523          movdqa xmm7, xmm1
 524           psrldq xmm1, 8                // bring down (y'_2; ?)
 525         movdqa  xmm3, xmm0              // copies of (y'_5; y'_4)
 526         movdqa  xmm4, xmm0
 527           punpcklqdq  xmm1, xmm2        // (y'_2; y'_1)
 528           psrldq xmm2, 8                // (y'_0; ?)
 529          pxor   xmm2, xmm5              // low half and unit contrib
 530         pxor    xmm1, xmm0              // middle part and unit contrib
 531          psrld  xmm5, 1                 // (y'_3; y'_2) t contrib
 532         psrld   xmm0, 1                 // (y'_5; y'_4) t contrib
 533          psrld  xmm6, 2                 // (y'_3; y'_2) t^2 contrib
 534         psrld   xmm3, 2                 // (y'_5; y'_4) t^2 contrib
 535          psrld  xmm7, 7                 // (y'_3; y'_2) t^7 contrib
 536         psrld   xmm4, 7                 // (y'_5; y'_4) t^7 contrib
 537          pxor   xmm2, xmm6              // low half, unit, t^2 contribs
 538         pxor    xmm1, xmm3              // middle part, unit, t^2 contribs
 539          pxor   xmm5, xmm7              // t and t^7 contribs
 540         pxor    xmm0, xmm4              // t and t^7 contribs
 541          pxor   xmm5, xmm2              // mix everything together
 542         pxor    xmm0, xmm1              // mix everything together
 543          movq   xmm1, xmm5              // shunt (z_0; ?) into proper place
 544 .endm
 545
 546 .macro  mul256
 547         // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
 548         // with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
 549         // requires 16 bytes aligned space at SP; on amd64, also clobbers
 550         // xmm8.
 551
 552         // Now it's starting to look worthwhile to do Karatsuba.  Suppose
 553         // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
 554         //
 555         //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
 556         //
 557         // Let these coefficients of B^i be a, b, and c, respectively, and
 558         // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
 559         //
 560         //      q = r s = (u_0 + u_1) (v_0 + v_1)
 561         //        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
 562         //        = a + c + b
 563         //
 564         // The first two terms we've already calculated; the last is the
 565         // remaining one we want.  We'll set B = t^128.  We know how to do
 566         // 128-bit multiplications already, and Karatsuba is too annoying
 567         // there, so there'll be 12 multiplications altogether, rather than
 568         // the 16 we'd have if we did this the naïve way.
 569         //
 570         // On x86, there aren't quite enough registers, so spill one for a
 571         // bit.  On AMD64, we can keep on going, so it's all good.
 572
 573         // xmm0 =                       // u_1 = (u_11; u_10)
 574         // xmm1 =                       // u_0 = (u_01; u_00)
 575         // xmm2 =                       // v_1 = (v_11; v_10)
 576         // xmm3 =                       // v_0 = (v_01; v_00)
 577         movdqa  xmm4, xmm0              // u_1 again
 578 #if CPUFAM_X86
 579         movdqa  [esp + 0], xmm3
 580 #elif CPUFAM_AMD64
 581         movdqa  xmm8, xmm3
 582 #  define V0 xmm8
 583 #endif
 584         pxor    xmm4, xmm1              // u_* = (u_01 + u_11; u_00 + u_10)
 585         pxor    xmm3, xmm2              // v_* = (v_01 + v_11; v_00 + v_10)
 586
 587         // Start by building the cross product, q = u_* v_*.
 588         movdqa  xmm7, xmm4              // more copies of u_*
 589         movdqa  xmm5, xmm4
 590         movdqa  xmm6, xmm4
 591         pclmullqhqdq xmm4, xmm3         // u_*1 v_*0
 592         pclmulhqlqdq xmm7, xmm3         // u_*0 v_*1
 593         pclmullqlqdq xmm5, xmm3         // u_*1 v_*1
 594         pclmulhqhqdq xmm6, xmm3         // u_*0 v_*0
 595         pxor    xmm4, xmm7              // u_*1 v_*0 + u_*0 v_*1
 596         movdqa  xmm7, xmm4
 597         pslldq  xmm4, 8
 598         psrldq  xmm7, 8
 599         pxor    xmm5, xmm4              // q_1
 600         pxor    xmm6, xmm7              // q_0
 601
 602         // Next, work on the high half, a = u_1 v_1.
 603         movdqa  xmm3, xmm0              // more copies of u_1
 604         movdqa  xmm4, xmm0
 605         movdqa  xmm7, xmm0
 606         pclmullqhqdq xmm0, xmm2         // u_11 v_10
 607         pclmulhqlqdq xmm3, xmm2         // u_10 v_11
 608         pclmullqlqdq xmm4, xmm2         // u_11 v_11
 609         pclmulhqhqdq xmm7, xmm2         // u_10 v_10
 610 #if CPUFAM_X86
 611         movdqa  xmm2, [esp + 0]
 612 #  define V0 xmm2
 613 #endif
 614         pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
 615         movdqa  xmm3, xmm0
 616         pslldq  xmm0, 8
 617         psrldq  xmm3, 8
 618         pxor    xmm4, xmm0              // x_3 = a_1
 619         pxor    xmm7, xmm3              // a_0
 620
 621         // Mix that into the product now forming in xmm4--xmm7.
 622         pxor    xmm5, xmm4              // a_1 + q_1
 623         pxor    xmm6, xmm7              // a_0 + q_0
 624         pxor    xmm5, xmm7              // a_0 + (a_1 + q_1)
 625
 626         // Finally, the low half, c = u_0 v_0.
 627         movdqa  xmm0, xmm1              // more copies of u_0
 628         movdqa  xmm3, xmm1
 629         movdqa  xmm7, xmm1
 630         pclmullqhqdq xmm1, V0           // u_01 v_00
 631         pclmulhqlqdq xmm0, V0           // u_00 v_01
 632         pclmullqlqdq xmm3, V0           // u_01 v_01
 633         pclmulhqhqdq xmm7, V0           // u_00 v_00
 634         pxor    xmm0, xmm1              // u_00 v_01 + u_01 v_00
 635         movdqa  xmm1, xmm0
 636         pslldq  xmm0, 8
 637         psrldq  xmm1, 8
 638         pxor    xmm3, xmm0              // c_1
 639         pxor    xmm7, xmm1              // x_0 = c_0
 640
 641         // And mix that in to complete the product.
 642         pxor    xmm6, xmm3              // (a_0 + q_0) + c_1
 643         pxor    xmm5, xmm3       // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
 644         pxor    xmm6, xmm7       // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
 645
 646 #undef V0
 647
 648         // Now we need to shift that whole lot one bit to the left.  This
 649         // will also give us an opportunity to put the product back in
 650         // xmm0--xmm3.  This is a slightly merry dance because it's nearly
 651         // pipelined but we don't have enough registers.
 652         //
 653         // After this, we'll be in GCM bizarro-world.
 654         movdqa  xmm0, xmm4              // x_3 again
 655         psrld   xmm4, 31                // x_3 carries
 656         pslld   xmm0, 1                 // x_3 shifted left
 657         movdqa  xmm3, xmm4              // x_3 copy carries
 658          movdqa xmm1, xmm5              // x_2 again
 659         pslldq  xmm4, 4                 // x_3 carries shifted up
 660          psrld  xmm5, 31                // x_2 carries
 661         psrldq  xmm3, 12                // x_3 big carry out
 662          pslld  xmm1, 1                 // x_2 shifted left
 663         por     xmm0, xmm4              // x_3 mixed together
 664          movdqa xmm4, xmm5              // x_2 copy carries
 665           movdqa xmm2, xmm6             // x_1 again
 666          pslldq xmm5, 4                 // x_2 carries shifted up
 667           psrld xmm6, 31                // x_1 carries
 668          psrldq xmm4, 12                // x_2 big carry out
 669           pslld xmm2, 1                 // x_1 shifted
 670          por    xmm1, xmm5              // x_2 mixed together
 671           movdqa xmm5, xmm6             // x_1 copy carries
 672          por    xmm1, xmm3              // x_2 with carry from x_3
 673            movdqa xmm3, xmm7            // x_0 again
 674           pslldq xmm6, 4                // x_1 carries shifted up
 675            psrld xmm7, 31               // x_0 carries
 676           psrldq xmm5, 12               // x_1 big carry out
 677            pslld xmm3, 1                // x_0 shifted
 678           por   xmm2, xmm6              // x_1 mixed together
 679            pslldq xmm7, 4               // x_0 carries shifted up
 680           por   xmm2, xmm4              // x_1 with carry from x_2
 681            por  xmm3, xmm7              // x_0 mixed together
 682            por  xmm3, xmm5              // x_0 with carry from x_1
 683
 684         // Now we must reduce.  This is essentially the same as the 128-bit
 685         // case above, but more complicated because everything is bigger.
 686         // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
 687
 688         // First, shift the high bits down.
 689         movdqa  xmm4, xmm0              // y_3 again
 690         movdqa  xmm5, xmm0              // y_3 yet again
 691         movdqa  xmm6, xmm0              // y_3 again again
 692         pslld   xmm4, 30                // y_3: b_i for t^2
 693         pslld   xmm5, 27                // y_3: b_i for t^5
 694         pslld   xmm6, 22                // y_3: b_i for t^10
 695          movdqa xmm7, xmm1              // y_2 again
 696         pxor    xmm4, xmm5
 697          movdqa xmm5, xmm1              // y_2 again
 698         pxor    xmm4, xmm6
 699          movdqa xmm6, xmm1              // y_2 again
 700          pslld  xmm7, 30                // y_2: b_i for t^2
 701          pslld  xmm5, 27                // y_2: b_i for t^5
 702          pslld  xmm6, 22                // y_2: b_i for t^10
 703          pxor   xmm7, xmm5
 704         movdqa  xmm5, xmm4
 705          pxor   xmm7, xmm6
 706         psrldq  xmm4, 4
 707          movdqa xmm6, xmm7
 708         pslldq  xmm5, 12
 709          psrldq xmm7, 4
 710         pxor    xmm2, xmm4
 711          pslldq xmm6, 12
 712          pxor   xmm3, xmm7
 713         pxor    xmm1, xmm5
 714          pxor   xmm2, xmm6
 715
 716         // And then shift the low bits up.
 717         movdqa  xmm4, xmm0              // y_3 again
 718          movdqa xmm5, xmm1              // y_2 again
 719         movdqa  xmm6, xmm0              // y_3 yet again
 720          movdqa xmm7, xmm1              // y_2 yet again
 721         pxor    xmm2, xmm0              // y_1 and unit contrib from y_3
 722          pxor   xmm3, xmm1              // y_0 and unit contrib from y_2
 723         psrld   xmm0, 2
 724          psrld  xmm1, 2
 725         psrld   xmm4, 5
 726          psrld  xmm5, 5
 727         psrld   xmm6, 10
 728          psrld  xmm7, 10
 729         pxor    xmm0, xmm2              // y_1, with y_3 units and t^2
 730          pxor   xmm1, xmm3              // y_0, with y_2 units and t^2
 731         pxor    xmm4, xmm6              // y_3 t^5 and t^10 contribs
 732          pxor   xmm5, xmm7              // y_2 t^5 and t^10 contribs
 733         pxor    xmm0, xmm4              // high half of reduced result
 734         pxor    xmm1, xmm5              // low half; all done
 735 .endm
 736
 737 ///--------------------------------------------------------------------------
 738 /// Main code.
 739
 740 // There are a number of representations of field elements in this code and
 741 // it can be confusing.
 742 //
 743 //   * The `external format' consists of a sequence of contiguous bytes in
 744 //     memory called a `block'.  The GCM spec explains how to interpret this
 745 //     block as an element of a finite field.  As discussed extensively, this
 746 //     representation is very annoying for a number of reasons.  On the other
 747 //     hand, this code never actually deals with it directly.
 748 //
 749 //   * The `register format' consists of one or more XMM registers, depending
 750 //     on the block size.  The bytes in these registers are in reverse order
 751 //     -- so the least-significant byte of the lowest-numbered register holds
 752 //     the /last/ byte in the block.  If the block size is not a multiple of
 753 //     16 bytes, then there must be padding.  96-bit blocks are weird: the
 754 //     padding is inserted at the /least/ significant end, so the register
 755 //     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
 756 //     significant end.
 757 //
 758 //   * The `words' format consists of a sequence of bytes, as in the
 759 //     `external format', but, according to the blockcipher in use, the bytes
 760 //     within each 32-bit word may be reversed (`big-endian') or not
 761 //     (`little-endian').  Accordingly, there are separate entry points for
 762 //     each variant, identified with `b' or `l'.
 763
 // Begin the definition of an SSE-based function `f'.  This first emits a
 // complete function `f_avx' whose only instruction is `vzeroupper', and
 // then opens `f' itself; the user writes the body and closes it with
 // `ENDFUNC'.  The `f_avx' stub contains no return instruction, so
 // execution presumably falls through into `f' -- NOTE(review): confirm
 // that `ENDFUNC' and `FUNC' emit nothing executable in between.
 #define SSEFUNC(f)                                                     \
         FUNC(f##_avx);                                                 \
           vzeroupper;                                                  \
           endprologue;                                                 \
         ENDFUNC;                                                       \
         FUNC(f)
 767
 SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
         // Multiply a 128-bit field element by a key, in place.
         //
         // A is the address of the element, in big-endian words format; K
         // is the address of the key, in register format.  The product A K
         // is written back over the element at A.

 #if CPUFAM_X86
         // On 32-bit x86, both arguments arrive on the stack.
         mov     K, [esp + 8]
         mov     A, [esp + 4]
 #endif
   endprologue
         movdqu  xmm1, [K]               // v = the key
         movdqu  xmm0, [A]               // u = the element
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0) // words -> register format
         mul128                          // the product is left in xmm0
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0) // register -> words format
         movdqu  [A], xmm0
         ret
 ENDFUNC
 786
 SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
         // On entry, A points to a 128-bit field element in little-endian
         // words format; K points to a field-element in register format.  On
         // exit, A is updated with the product A K.
         //
         // The byte-swapping table is kept in xmm7 across the
         // multiplication.  That register is callee-saved in the Windows
         // amd64 ABI, so it is saved and restored there, in the same way as
         // the 192- and 256-bit entry points do.

 #if CPUFAM_X86
         mov     A, [esp + 4]
         mov     K, [esp + 8]
         ldgot   ecx                     // needed to address `swaptab_128l'
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         stalloc 1*16 + 8                // one aligned 16-byte save slot
         savexmm xmm7, 0
 #endif
   endprologue
         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
         movdqu  xmm0, [A]
         movdqu  xmm1, [K]
         pshufb  xmm0, xmm7              // words -> register format
         mul128                          // relies on xmm7 being preserved
         pshufb  xmm0, xmm7              // register -> words format
         movdqu  [A], xmm0
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm7, 0
         stfree  1*16 + 8
 #endif
         ret
 ENDFUNC
 807
 SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
         // Multiply a 64-bit field element by a key, in place.
         //
         // A is the address of the element, in big-endian words format; K
         // is the address of the key, in register format.  The product A K
         // replaces the element at A.

 #if CPUFAM_X86
         // On 32-bit x86, both arguments arrive on the stack.
         mov     K, [esp + 8]
         mov     A, [esp + 4]
 #endif
   endprologue
         movq    xmm1, [K]               // v = the key
         movq    xmm0, [A]               // u = the element
         pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3) // words -> register format
         mul64                           // the product is left in xmm0
         pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3) // register -> words format
         movq    [A], xmm0               // only the low 64 bits are stored
         ret
 ENDFUNC
 826
 SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
         // On entry, A points to a 64-bit field element in little-endian
         // words format; K points to a field-element in register format.  On
         // exit, A is updated with the product A K.
         //
         // The byte-swapping table is kept in xmm7 across the
         // multiplication.  That register is callee-saved in the Windows
         // amd64 ABI, so it is saved and restored there, in the same way as
         // the 192- and 256-bit entry points do.

 #if CPUFAM_X86
         mov     A, [esp + 4]
         mov     K, [esp + 8]
         ldgot   ecx                     // needed to address `swaptab_64l'
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         stalloc 1*16 + 8                // one aligned 16-byte save slot
         savexmm xmm7, 0
 #endif
   endprologue
         movdqa  xmm7, [INTADDR(swaptab_64l, ecx)]
         movq    xmm0, [A]
         movq    xmm1, [K]
         pshufb  xmm0, xmm7              // words -> register format
         mul64                           // relies on xmm7 being preserved
         pshufb  xmm0, xmm7              // register -> words format
         movq    [A], xmm0
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm7, 0
         stfree  1*16 + 8
 #endif
         ret
 ENDFUNC
 847
 SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
         // Multiply a 96-bit field element by a key, in place.
         //
         // A is the address of the element, in big-endian words format; K
         // is the address of the key, in register format -- which, for this
         // size, means 16 bytes of which the first four are zero.  The
         // product A K replaces the element at A.

 #if CPUFAM_X86
         // On 32-bit x86, both arguments arrive on the stack.
         mov     K, [esp + 8]
         mov     A, [esp + 4]
 #endif
   endprologue
         movdqu  xmm1, [K]               // v = the key
         movq    xmm0, [A + 0]           // first two words of u
         movd    xmm2, [A + 8]           // third word of u
         punpcklqdq xmm0, xmm2           // gather u into a single register
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0) // words -> register format
         mul96                           // the product is left in xmm0

         // The 12 interesting bytes are at the top of xmm0.  Make a copy
         // with the shuffle applied again, for the first two words, and
         // shift the original down to expose the third.
         pshufd  xmm1, xmm0, SHUF(3, 2, 1, 0)
         psrldq  xmm0, 4
         movd    [A + 8], xmm0
         movq    [A + 0], xmm1
         ret
 ENDFUNC
 871
 SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
         // On entry, A points to a 96-bit field element in little-endian
         // words format; K points to a field-element in register format
         // (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
         // updated with the product A K.
         //
         // The byte-swapping table is kept in xmm7 across the
         // multiplication.  That register is callee-saved in the Windows
         // amd64 ABI, so it is saved and restored there, in the same way as
         // the 192- and 256-bit entry points do.

 #if CPUFAM_X86
         mov     A, [esp + 4]
         mov     K, [esp + 8]
         ldgot   ecx                     // needed to address `swaptab_128l'
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         stalloc 1*16 + 8                // one aligned 16-byte save slot
         savexmm xmm7, 0
 #endif
   endprologue
         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
         movq    xmm0, [A + 0]           // first eight bytes of the element
         movd    xmm2, [A + 8]           // last four bytes of the element
         movdqu  xmm1, [K]
         punpcklqdq xmm0, xmm2           // gather the element in xmm0
         pshufb  xmm0, xmm7              // words -> register format
         mul96                           // relies on xmm7 being preserved
         pshufb  xmm0, xmm7              // register -> words format
         movq    [A + 0], xmm0           // first eight bytes of the result
         psrldq  xmm0, 8                 // bring the remaining bytes down
         movd    [A + 8], xmm0           // last four bytes of the result
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm7, 0
         stfree  1*16 + 8
 #endif
         ret
 ENDFUNC
 897
 SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
         // Multiply a 192-bit field element by a key, in place.
         //
         // A is the address of the element, in big-endian words format; K
         // is the address of the key, in register format.  The product A K
         // replaces the element at A.

 #if CPUFAM_X86
         // On 32-bit x86, both arguments arrive on the stack.
         mov     K, [esp + 8]
         mov     A, [esp + 4]
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         // xmm6 and xmm7 are callee-saved on Windows; presumably `mul192'
         // clobbers them.
         stalloc 2*16 + 8
         savexmm xmm6, 0
         savexmm xmm7, 16
 #endif
   endprologue
         // The key is in xmm2/xmm3 and the element in xmm0/xmm1.  The last
         // 16 bytes of the element go in xmm0 and the first eight in xmm1.
         movdqu  xmm2, [K + 0]
         movq    xmm3, [K + 16]
         movq    xmm1, [A + 0]
         movdqu  xmm0, [A + 8]
         pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3) // words -> register format
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
         mul192                          // product is left in xmm0/xmm1
         pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3) // register -> words format
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
         movq    [A + 0], xmm1
         movdqu  [A + 8], xmm0
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
         rstrxmm xmm7, 16
         stfree  2*16 + 8
 #endif
         ret
 ENDFUNC
 931
 SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
         // Multiply a 192-bit field element by a key, in place.
         //
         // A is the address of the element, in little-endian words format;
         // K is the address of the key, in register format.  The product
         // A K replaces the element at A.

 #if CPUFAM_X86
         // On 32-bit x86, both arguments arrive on the stack, and ecx is
         // needed to address the byte-swapping tables.
         mov     K, [esp + 8]
         mov     A, [esp + 4]
         ldgot   ecx
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         // xmm6 and xmm7 are callee-saved on Windows; presumably `mul192'
         // clobbers them.
         stalloc 2*16 + 8
         savexmm xmm6, 0
         savexmm xmm7, 16
 #endif
   endprologue
         // The key is in xmm2/xmm3 and the element in xmm0/xmm1.  The
         // swapping tables are used directly from memory, so no register
         // has to hold them across the multiplication.
         movdqu  xmm2, [K + 0]
         movq    xmm3, [K + 16]
         movq    xmm1, [A + 0]
         movdqu  xmm0, [A + 8]
         pshufb  xmm1, [INTADDR(swaptab_64l, ecx)] // words -> register
         pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
         mul192                          // product is left in xmm0/xmm1
         pshufb  xmm1, [INTADDR(swaptab_64l, ecx)] // register -> words
         pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
         movq    [A + 0], xmm1
         movdqu  [A + 8], xmm0
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
         rstrxmm xmm7, 16
         stfree  2*16 + 8
 #endif
         ret
 ENDFUNC
 966
 SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
         // Multiply a 256-bit field element by a key, in place.
         //
         // A is the address of the element, in big-endian words format; K
         // is the address of the key, in register format.  The product A K
         // replaces the element at A.

 #if CPUFAM_X86
         // `mul256' wants 16 bytes of aligned scratch space at the stack
         // pointer on 32-bit x86, so set up a frame, fetch the arguments
         // (which are one word further up now that ebp has been pushed),
         // and then align the stack and carve out the space.
         pushreg ebp
         setfp
         mov     K, [esp + 12]
         mov     A, [esp + 8]
         and     esp, ~15
         sub     esp, 16
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         // `mul256' clobbers xmm6--xmm8, which are callee-saved on Windows.
         stalloc 3*16 + 8
         savexmm xmm6, 0
         savexmm xmm7, 16
         savexmm xmm8, 32
 #endif
   endprologue
         // The key is in xmm2/xmm3 and the element in xmm0/xmm1, with the
         // last 16 bytes of the element in xmm0.
         movdqu  xmm2, [K + 0]
         movdqu  xmm3, [K + 16]
         movdqu  xmm1, [A + 0]
         movdqu  xmm0, [A + 16]
         pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0) // words -> register format
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
         mul256                          // product is left in xmm0/xmm1
         pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0) // register -> words format
         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
         movdqu  [A + 0], xmm1
         movdqu  [A + 16], xmm0
 #if CPUFAM_X86
         dropfp
         popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
         rstrxmm xmm7, 16
         rstrxmm xmm8, 32
         stfree  3*16 + 8
 #endif
         ret
 ENDFUNC
1010
SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
        // Multiply a 256-bit field element by a key, in place.
        //
        // A is the address of the element, in little-endian words format;
        // K is the address of the key, in register format.  The product
        // A K replaces the element at A.

#if CPUFAM_X86
        // `mul256' wants 16 bytes of aligned scratch space at the stack
        // pointer on 32-bit x86, so set up a frame, fetch the arguments
        // (which are one word further up now that ebp has been pushed),
        // and then align the stack and carve out the space.  ecx is needed
        // to address the byte-swapping table.
        pushreg ebp
        setfp
        mov     K, [esp + 12]
        mov     A, [esp + 8]
        and     esp, ~15
        ldgot   ecx
        sub     esp, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
        // `mul256' clobbers xmm6--xmm8, which are callee-saved on Windows.
        stalloc 3*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
        savexmm xmm8, 32
#endif
  endprologue
        // The key is in xmm2/xmm3 and the element in xmm0/xmm1, with the
        // last 16 bytes of the element in xmm0.
        movdqu  xmm2, [K + 0]
        movdqu  xmm3, [K + 16]
        movdqu  xmm1, [A + 0]
        movdqu  xmm0, [A + 16]
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, xmm7              // words -> register format
        pshufb  xmm0, xmm7
        mul256                          // product is left in xmm0/xmm1

        // The multiplication clobbered xmm7, so fetch the table again.
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, xmm7              // register -> words format
        pshufb  xmm0, xmm7
        movdqu  [A + 0], xmm1
        movdqu  [A + 16], xmm0
#if CPUFAM_X86
        dropfp
        popreg  ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        rstrxmm xmm8, 32
        stfree  3*16 + 8
#endif
        ret
ENDFUNC
1057
1058         RODATA
1059
// Table for byte-swapping little-endian words-format blocks larger than 64
// bits.  Used as a `pshufb' control: entry i is 15 - i, so the 16 bytes of
// the register are reversed end-to-end.  Aligned because it is read with
// `movdqa' and as a `pshufb' memory operand.
        .balign 16
swaptab_128l:
        .byte   15, 14, 13, 12
        .byte   11, 10,  9,  8
        .byte    7,  6,  5,  4
        .byte    3,  2,  1,  0
1066
// Table for byte-swapping 64-bit little-endian words-format blocks.  Used
// as a `pshufb' control: the low eight bytes are reversed; the 255 entries
// have their top bit set, which makes `pshufb' write zero to the high
// eight bytes.  Aligned because it is read with `movdqa' and as a `pshufb'
// memory operand.
        .balign 16
swaptab_64l:
        .byte     7,   6,   5,   4
        .byte     3,   2,   1,   0
        .byte   255, 255, 255, 255
        .byte   255, 255, 255, 255
1072
1073 ///----- That's all, folks --------------------------------------------------