chiark - git - mdw - catacomb/blob - symm/gcm-x86ish-pclmul.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// GCM acceleration for x86 processors
   4 ///
   5 /// (c) 2018 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   .pclmul
  34
  35         .text
  36
  37 ///--------------------------------------------------------------------------
  38 /// Common register allocation.
  39
  40 #if CPUFAM_X86
  41 #  define A eax
  42 #  define K edx
  43 #elif CPUFAM_AMD64 && ABI_SYSV
  44 #  define A rdi
  45 #  define K rsi
  46 #elif CPUFAM_AMD64 && ABI_WIN
  47 #  define A rcx
  48 #  define K rdx
  49 #endif
  50
  51 ///--------------------------------------------------------------------------
  52 /// Multiplication macros.
  53
  54         // The good news is that we have a fancy instruction to do the
  55         // multiplications.  The bad news is that it's not particularly well-
  56         // suited to the job.
  57         //
  58         // For one thing, it only does a 64-bit multiplication, so in general
  59         // we'll need to synthesize the full-width multiply by hand.  For
  60         // another thing, it doesn't help with the reduction, so we have to
  61         // do that by hand too.  And, finally, GCM has crazy bit ordering,
  62         // and the instruction does nothing useful for that at all.
  63         //
  64         // Focusing on that last problem first: the bits aren't in monotonic
  65         // significance order unless we permute them.  If we reverse the byte
  66         // order, then we'll have the bits in monotonic order, but backwards,
  67         // so the degree-0 coefficient will be in the most-significant bit.
  68         //
  69         // This is less of a difficulty than it seems at first, because
  70         // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
  71         // SUM_{0<=j<n} v_j t^j; then
  72         //
  73         //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
  74         //
  75         // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
  76         // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
  77         // Then
  78         //
  79         //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
  80         //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
  81         //
  82         // which is almost the bit-reversal of u v, only it's shifted right
  83         // by one place.  Putting this another way, what we have is actually
  84         // the bit reversal of the product u v t.  We could get the correct
  85         // answer (modulo p(t)) if we'd sneakily divided one of the operands
  86         // by t before we started.  Conveniently, v is actually the secret
  87         // value k set up by the GCM `mktable' function, so we can arrange to
  88         // actually store k/t (mod p(t)) and then the product will come out
  89         // correct (modulo p(t)) and we won't have anything more to worry
  90         // about here.
  91         //
  92         // That was important to think about, but there's not a great deal to
  93         // do about it yet other than to convert what we've got from the
  94         // blockcipher's byte-ordering convention to our big-endian
  95         // convention.  Since this depends on the blockcipher convention,
  96         // we'll leave the caller to cope with this: the macros here will
  97         // assume that the operands are in `register' format, which is the
  98         // byte-reversal of the external representation, padded at the
  99         // most-significant end except for 96-bit blocks, which are
 100         // zero-padded at the least-significant end (see `mul96' for the
 101         // details).  In the commentary, pieces of polynomial are numbered
 102         // according to the degree of the coefficients, so the unit
 103         // coefficient of some polynomial a is in a_0.
 104         //
 105         // The commentary for `mul128' is the most detailed.  The other
 106         // macros assume that you've already read and understood that.
 107
 108 .macro  mul128
 109         // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
 110         // u v in xmm0.  Clobbers xmm1--xmm4.
 111
 112         // First for the double-precision multiplication.  It's tempting to
 113         // use Karatsuba's identity here, but I suspect that loses more in
 114         // the shifting, bit-twiddling, and dependency chains that it gains
 115         // in saving a multiplication which otherwise pipelines well.
 116         // xmm0 =                       // (u_1; u_0)
 117         // xmm1 =                       // (v_1; v_0)
 118         movdqa  xmm2, xmm1              // (v_1; v_0) again
 119         movdqa  xmm3, xmm0              // (u_1; u_0) again
 120         movdqa  xmm4, xmm0              // (u_1; u_0) yet again
 121         pclmulhqlqdq xmm2, xmm0         // u_1 v_0
 122         pclmullqlqdq xmm0, xmm1         // u_1 v_1
 123         pclmulhqlqdq xmm3, xmm1         // u_0 v_1
 124         pclmulhqhqdq xmm4, xmm1         // u_0 v_0
 125
 126         // Arrange the pieces to form a double-precision polynomial.
 127         pxor    xmm2, xmm3              // (m_1; m_0) = u_1 v_0 + u_0 v_1
 128         movdqa  xmm1, xmm2              // (m_1; m_0) again
 129         pslldq  xmm2, 8                 // (0; m_1)
 130         psrldq  xmm1, 8                 // (m_0; 0)
 131         pxor    xmm0, xmm2              // z_1 = u_1 v_1 + m_1
 132         pxor    xmm1, xmm4              // z_0 = u_0 v_0 + t^64 m_0
 133
 134         // The remaining problem is that the result needs to be reduced
 135         // modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
 136         // t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
 137         // such that z_0 + z_1 R = u v using the identity R = t^128: now we
 138         // must collapse the two halves of z together using the other
 139         // identity R = t^7 + t^2 + t + 1.
 140         //
 141         // We do this by working on each 32-bit word of the high half of z
 142         // separately, so consider x_i, for some 4 <= i < 8.  Certainly, x_i
 143         // t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)},
 144         // but we can't use that directly without breaking up the 32-bit word
 145         // structure.  Instead, we start by considering just x_i t^7
 146         // t^{32(i-4)}, which again looks tricky.  Now, split x_i = a_i +
 147         // t^25 b_i, with deg a_i < 25; then
 148         //
 149         //      x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
 150         //
 151         // We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit
 152         // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
 153         // splits are different.  This is lovely, with one small snag: when
 154         // we do this to x_7, we end up with a contribution back into the
 155         // t^128 coefficient word.  But notice that only the low seven bits
 156         // of this word are affected, so there's no knock-on contribution
 157         // into the t^32 word.  Therefore, if we handle the high bits of each
 158         // word together, and then the low bits, everything will be fine.
 159
 160         // First, shift the high bits down.
 161         movdqa  xmm2, xmm0              // (x_7, x_6; x_5, x_4) again
 162         movdqa  xmm3, xmm0              // (x_7, x_6; x_5, x_4) yet again
 163         movdqa  xmm4, xmm0              // (x_7, x_6; x_5, x_4) again again
 164         pslld   xmm2, 31                // the b_i for t
 165         pslld   xmm3, 30                // the b_i for t^2
 166         pslld   xmm4, 25                // the b_i for t^7
 167         pxor    xmm2, xmm3              // add them all together
 168         pxor    xmm2, xmm4
 169         movdqa  xmm3, xmm2              // and a copy for later
 170         psrldq  xmm2, 4                 // contribution into low half
 171         pslldq  xmm3, 12                // and high half
 172         pxor    xmm1, xmm2
 173         pxor    xmm0, xmm3
 174
 175         // And then shift the low bits up.
 176         movdqa  xmm2, xmm0
 177         movdqa  xmm3, xmm0
 178         pxor    xmm1, xmm0              // mix in the unit contribution
 179         psrld   xmm0, 1
 180         psrld   xmm2, 2
 181         psrld   xmm3, 7
 182         pxor    xmm1, xmm2              // low half, unit, and t^2 contribs
 183         pxor    xmm0, xmm3              // t and t^7 contribs
 184         pxor    xmm0, xmm1              // mix them together and we're done
 185 .endm
 186
 187 .macro  mul64
 188         // Enter with u and v in the low halves of xmm0 and xmm1
 189         // respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.
 190
 191         // The multiplication is thankfully easy.
 192         pclmullqlqdq xmm1, xmm0         // u v
 193
 194         // Now we must reduce.  This is essentially the same as the 128-bit
 195         // case above, but mostly simpler because everything is smaller.  The
 196         // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
 197
 198         // First, we must detach the top (`low'!) half of the result.
 199         movdqa  xmm0, xmm1              // (x_3, x_2; x_1, x_0) again
 200         psrldq  xmm1, 8                 // (x_1, x_0; 0, 0)
 201
 202         // Next, shift the high bits down.
 203         movdqa  xmm2, xmm0              // (x_3, x_2; ?, ?) again
 204         movdqa  xmm3, xmm0              // (x_3, x_2; ?, ?) yet again
 205         movdqa  xmm4, xmm0              // (x_3, x_2; ?, ?) again again
 206         pslld   xmm2, 31                // b_i for t
 207         pslld   xmm3, 29                // b_i for t^3
 208         pslld   xmm4, 28                // b_i for t^4
 209         pxor    xmm2, xmm3              // add them all together
 210         pxor    xmm2, xmm4
 211         movdqa  xmm3, xmm2              // and a copy for later
 212         movq    xmm2, xmm2              // zap high half
 213         pslldq  xmm3, 4                 // contribution into high half
 214         psrldq  xmm2, 4                 // and low half
 215         pxor    xmm0, xmm3
 216         pxor    xmm1, xmm2
 217
 218         // And then shift the low bits up.
 219         movdqa  xmm2, xmm0
 220         movdqa  xmm3, xmm0
 221         pxor    xmm1, xmm0              // mix in the unit contribution
 222         psrld   xmm0, 1
 223         psrld   xmm2, 3
 224         psrld   xmm3, 4
 225         pxor    xmm1, xmm2              // low half, unit, and t^3 contribs
 226         pxor    xmm0, xmm3              // t and t^4 contribs
 227         pxor    xmm0, xmm1              // mix them together and we're done
 228 .endm
 229
 230 .macro  mul96
 231         // Enter with u and v in the /high/ three words of xmm0 and xmm1
 232         // respectively (and zero in the low word); leave with z = u v in the
 233         // high three words of xmm0, and /junk/ in the low word.  Clobbers
 234         // xmm1--xmm4.
 235
 236         // This is an inconvenient size.  There's nothing for it but to do
 237         // four multiplications, as if for the 128-bit case.  It's possible
 238         // that there's cruft in the top 32 bits of the input registers, so
 239         // shift both of them up by four bytes before we start.  This will
 240         // mean that the high 64 bits of the result (from GCM's viewpoint)
 241         // will be zero.
 242         // xmm0 =                       // (0, u_2; u_1, u_0)
 243         // xmm1 =                       // (0, v_2; v_1, v_0)
 244         movdqa  xmm2, xmm1              // (0, v_2; v_1, v_0) again
 245         movdqa  xmm3, xmm0              // (0, u_2; u_1, u_0) again
 246         movdqa  xmm4, xmm0              // (0, u_2; u_1, u_0) yet again
 247         pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
 248         pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
 249         pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
 250         pclmulhqhqdq xmm4, xmm1         // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
 251                                         //   + u_1 v_1 t^64 = f
 252
 253         // Extract the high and low halves of the 192-bit result.  We don't
 254         // need be too picky about the unused high words of the result
 255         // registers.  The answer we want is d t^128 + e t^64 + f, where e =
 256         // e_0 + e_1.
 257         //
 258         // The place values for the two halves are (t^160, t^128; t^96, ?)
 259         // and (?, t^64; t^32, 1).  But we also want to shift the high part
 260         // left by a word, for symmetry's sake.
 261         psrldq  xmm0, 8                 // (d; 0) = d t^128
 262         pxor    xmm2, xmm3              // e = (e_0 + e_1)
 263         movdqa  xmm1, xmm4              // f again
 264         pxor    xmm0, xmm2              // d t^128 + e t^64
 265         psrldq  xmm2, 12                // e[31..0] t^64
 266         psrldq  xmm1, 4                 // f[95..0]
 267         pslldq  xmm4, 12                // f[127..96], shifted
 268         pslldq  xmm0, 4                 // shift high 96 bits
 269         pxor    xmm1, xmm2              // low 96 bits of result
 270         pxor    xmm0, xmm4              // high 96 bits of result
 271
 272         // Finally, the reduction.  This is essentially the same as the
 273         // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
 274         // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
 275         // trouble for the general approach.
 276
 277         // First, shift the high bits down.
 278         movdqa  xmm2, xmm0              // copies of the high part
 279         movdqa  xmm3, xmm0
 280         movdqa  xmm4, xmm0
 281         pslld   xmm2, 26                // b_i for t^6
 282         pslld   xmm3, 23                // b_i for t^9
 283         pslld   xmm4, 22                // b_i for t^10
 284         pxor    xmm2, xmm3              // add them all together
 285         pslldq  xmm1, 4                 // shift low part up to match
 286         pxor    xmm2, xmm4
 287         movdqa  xmm3, xmm2              // and a copy for later
 288         pslldq  xmm2, 8                 // contribution to high half
 289         psrldq  xmm3, 4                 // contribution to low half
 290         pxor    xmm1, xmm3
 291         pxor    xmm0, xmm2
 292
 293         // And then shift the low bits up.
 294         movdqa  xmm2, xmm0              // copies of the high part
 295         movdqa  xmm3, xmm0
 296         pxor    xmm1, xmm0              // mix in the unit contribution
 297         psrld   xmm0, 6
 298         psrld   xmm2, 9
 299         psrld   xmm3, 10
 300         pxor    xmm1, xmm2              // low half, unit, and t^9 contribs
 301         pxor    xmm0, xmm3              // t^6 and t^10 contribs
 302         pxor    xmm0, xmm1              // mix them together and we're done
 303 .endm
 304
 305 .macro  mul192
 306         // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
 307         // with z = u v in xmm0/xmm1 -- the top halves of the high registers
 308         // are unimportant.  Clobbers xmm2--xmm7.
 309
 310         // Start multiplying and accumulating pieces of product.
 311         // xmm0 =                       // (u_2; u_1)
 312         // xmm1 =                       // (u_0; ?)
 313         // xmm2 =                       // (v_2; v_1)
 314         // xmm3 =                       // (v_0; ?)
 315         movdqa  xmm4, xmm0              // (u_2; u_1) again
 316         movdqa  xmm5, xmm0              // (u_2; u_1) yet again
 317         movdqa  xmm6, xmm0              // (u_2; u_1) again again
 318         movdqa  xmm7, xmm3              // (v_0; ?) again
 319         punpcklqdq xmm3, xmm1           // (v_0; u_0)
 320         pclmulhqhqdq xmm4, xmm2         // u_1 v_1
 321         pclmullqlqdq xmm1, xmm2         // u_0 v_2
 322         pclmullqhqdq xmm5, xmm2         // u_2 v_1
 323         pclmulhqlqdq xmm6, xmm2         // u_1 v_2
 324         pxor    xmm1, xmm4              // u_0 v_2 + u_1 v_1
 325         pclmullqlqdq xmm7, xmm0         // u_2 v_0
 326         pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
 327         movdqa  xmm6, xmm0              // (u_2; u_1) like a bad penny
 328         pxor    xmm1, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
 329         pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
 330         pclmulhqlqdq xmm6, xmm3         // u_1 v_0
 331         pclmulhqhqdq xmm2, xmm3         // u_0 v_1
 332         pclmullqhqdq xmm3, xmm3         // e = u_0 v_0
 333         pxor    xmm6, xmm2              // d = u_1 v_0 + u_0 v_1
 334
 335         // Next, the piecing together of the product.  There's significant
 336         // work here to leave the completed pieces in sensible registers.
 337         // xmm0 =                       // (a_1; a_0) = a = u_2 v_2
 338         // xmm5 =                       // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
 339         // xmm1 =                       // (c_1; c_0) = c = u_0 v_2 +
 340                                         //      u_1 v_1 + u_2 v_0
 341         // xmm6 =                       // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
 342         // xmm3 =                       // (e_1; e_0) = e = u_0 v_0
 343         // xmm2, xmm4, xmm7 spare
 344         movdqa  xmm2, xmm6              // (d_1; d_0) again
 345         movdqa  xmm4, xmm5              // (b_1; b_0) again
 346         pslldq  xmm6, 8                 // (0; d_1)
 347         psrldq  xmm5, 8                 // (b_0; 0)
 348         psrldq  xmm2, 8                 // (d_0; 0)
 349         pslldq  xmm4, 8                 // (0; b_1)
 350         pxor    xmm5, xmm6              // (b_0; d_1)
 351         pxor    xmm0, xmm4              // (x_5; x_4) = (a_1; a_0 + b_1)
 352         pxor    xmm2, xmm3              // (x_1; x_0) = (e_1 + d_0; e_0)
 353         pxor    xmm1, xmm5             // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
 354
 355         // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
 356         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 357         // 128-bit case.  I don't know why.
 358
 359         // First, shift the high bits down.
 360         // xmm0 =                       // (x_5; x_4)
 361         // xmm1 =                       // (x_3; x_2)
 362         // xmm2 =                       // (x_1; x_0)
 363         // xmm3--xmm7 spare
 364         movdqa  xmm3, xmm0              // (x_5; x_4) copy
 365         movdqa  xmm4, xmm0              // (x_5; x_4) copy
 366         movdqa  xmm5, xmm0              // (x_5; x_4) copy
 367         pslld   xmm3, 31                // (x_5; x_4) b_i for t
 368         pslld   xmm4, 30                // (x_5; x_4) b_i for t^2
 369         pslld   xmm5, 25                // (x_5; x_4) b_i for t^7
 370          movq   xmm6, xmm1              // (x_3; 0) copy
 371         pxor    xmm3, xmm4
 372          movq   xmm7, xmm1              // (x_3; 0) copy
 373         pxor    xmm3, xmm5
 374          movq   xmm5, xmm1              // (x_3; 0) copy
 375         movdqa  xmm4, xmm3              // (x_5; x_4) b_i combined
 376          pslld  xmm6, 31                // (x_3; 0) b_i for t
 377          pslld  xmm7, 30                // (x_3; 0) b_i for t^2
 378          pslld  xmm5, 25                // (x_3; 0) b_i for t^7
 379         psrldq  xmm3, 12                // (x_5; x_4) low contrib
 380         pslldq  xmm4, 4                 // (x_5; x_4) high contrib
 381          pxor   xmm6, xmm7
 382         pxor    xmm2, xmm3
 383          pxor   xmm6, xmm5
 384         pxor    xmm1, xmm4
 385          pslldq xmm6, 4
 386          pxor   xmm2, xmm6
 387
 388         // And finally shift the low bits up.  Unfortunately, we also have to
 389         // split the low bits out.
 390         // xmm0 =                       // (x'_5; x'_4)
 391         // xmm1 =                       // (x'_3; x'_2)
 392         // xmm2 =                       // (x'_1; x'_0)
 393          movdqa xmm5, xmm1              // copies of (x'_3; x'_2)
 394          movdqa xmm6, xmm1
 395          movdqa xmm7, xmm1
 396           psrldq xmm1, 8                // bring down (x'_2; ?)
 397         movdqa  xmm3, xmm0              // copies of (x'_5; x'_4)
 398         movdqa  xmm4, xmm0
 399           punpcklqdq  xmm1, xmm2        // (x'_2; x'_1)
 400           psrldq xmm2, 8                // (x'_0; ?)
 401          pxor   xmm2, xmm5              // low half and unit contrib
 402         pxor    xmm1, xmm0
 403          psrld  xmm5, 1
 404         psrld   xmm0, 1
 405          psrld  xmm6, 2
 406         psrld   xmm3, 2
 407          psrld  xmm7, 7
 408         psrld   xmm4, 7
 409          pxor   xmm2, xmm6              // low half, unit, t^2 contribs
 410         pxor    xmm1, xmm3
 411          pxor   xmm5, xmm7              // t and t^7 contribs
 412         pxor    xmm0, xmm4
 413          pxor   xmm5, xmm2              // mix everything together
 414         pxor    xmm0, xmm1
 415          movq   xmm1, xmm5              // shunt (z_0; ?) into proper place
 416 .endm
 417
 418 .macro  mul256
 419         // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
 420         // with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
 421         // requires 16 bytes aligned space at SP; on amd64, also clobbers
 422         // xmm8.
 423
 424         // Now it's starting to look worthwhile to do Karatsuba.  Suppose
 425         // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
 426         //
 427         //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
 428         //
 429         // Name these coefficients of B^i be a, b, and c, respectively, and
 430         // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
 431         //
 432         //      q = r s = (u_0 + u_1) (v_0 + v_1)
 433         //        = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
 434         //        = a + c + b
 435         //
 436         // The first two terms we've already calculated; the last is the
 437         // remaining one we want.  We'll set B = t^128.  We know how to do
 438         // 128-bit multiplications already, and Karatsuba is too annoying
 439         // there, so there'll be 12 multiplications altogether, rather than
 440         // the 16 we'd have if we did this the naïve way.
 441         //
 442         // On x86, there aren't quite enough registers, so spill one for a
 443         // bit.  On AMD64, we can keep on going, so it's all good.
 444
 445         // xmm0 =                       // u_1 = (u_11; u_10)
 446         // xmm1 =                       // u_0 = (u_01; u_00)
 447         // xmm2 =                       // v_1 = (v_11; v_10)
 448         // xmm3 =                       // v_0 = (v_01; v_00)
 449         movdqa  xmm4, xmm0              // u_1 again
 450 #if CPUFAM_X86
 451         movdqa  [SP + 0], xmm3
 452 #elif CPUFAM_AMD64
 453         movdqa  xmm8, xmm3
 454 #  define V0 xmm8
 455 #endif
 456         pxor    xmm4, xmm1              // u_* = (u_01 + u_11; u_00 + u_10)
 457         pxor    xmm3, xmm2              // v_* = (v_01 + v_11; v_00 + v_10)
 458
 459         // Start by building the cross product, q = u_* v_*.
 460         movdqa  xmm7, xmm4              // more copies of u_*
 461         movdqa  xmm5, xmm4
 462         movdqa  xmm6, xmm4
 463         pclmullqhqdq xmm4, xmm3         // u_*1 v_*0
 464         pclmulhqlqdq xmm7, xmm3         // u_*0 v_*1
 465         pclmullqlqdq xmm5, xmm3         // u_*1 v_*1
 466         pclmulhqhqdq xmm6, xmm3         // u_*0 v_*0
 467         pxor    xmm4, xmm7              // u_*1 v_*0 + u_*0 v_*1
 468         movdqa  xmm7, xmm4
 469         pslldq  xmm4, 8
 470         psrldq  xmm7, 8
 471         pxor    xmm5, xmm4              // q_1
 472         pxor    xmm6, xmm7              // q_0
 473
 474         // Next, work on the high half, a = u_1 v_1.
 475         movdqa  xmm3, xmm0              // more copies of u_1
 476         movdqa  xmm4, xmm0
 477         movdqa  xmm7, xmm0
 478         pclmullqhqdq xmm0, xmm2         // u_11 v_10
 479         pclmulhqlqdq xmm3, xmm2         // u_10 v_11
 480         pclmullqlqdq xmm4, xmm2         // u_11 v_11
 481         pclmulhqhqdq xmm7, xmm2         // u_10 v_10
 482 #if CPUFAM_X86
 483         movdqa  xmm2, [SP + 0]
 484 #  define V0 xmm2
 485 #endif
 486         pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
 487         movdqa  xmm3, xmm0
 488         pslldq  xmm0, 8
 489         psrldq  xmm3, 8
 490         pxor    xmm4, xmm0              // x_3 = a_1
 491         pxor    xmm7, xmm3              // a_0
 492
 493         // Mix that into the product now forming in xmm4--xmm7.
 494         pxor    xmm5, xmm4              // a_1 + q_1
 495         pxor    xmm6, xmm7              // a_0 + q_0
 496         pxor    xmm5, xmm7              // a_0 + (a_1 + q_1)
 497
 498         // Finally, the low half, c = u_0 v_0.
 499         movdqa  xmm0, xmm1              // more copies of u_0
 500         movdqa  xmm3, xmm1
 501         movdqa  xmm7, xmm1
 502         pclmullqhqdq xmm1, V0           // u_01 v_00
 503         pclmulhqlqdq xmm0, V0           // u_00 v_01
 504         pclmullqlqdq xmm3, V0           // u_01 v_01
 505         pclmulhqhqdq xmm7, V0           // u_00 v_00
 506         pxor    xmm0, xmm1              // u_10 v_11 + u_11 v_10
 507         movdqa  xmm1, xmm0
 508         pslldq  xmm0, 8
 509         psrldq  xmm1, 8
 510         pxor    xmm3, xmm0              // c_1
 511         pxor    xmm7, xmm1              // x_0 = c_0
 512
 513         // And mix that in to complete the product.
 514         pxor    xmm6, xmm3              // (a_0 + q_0) + c_1
 515         pxor    xmm5, xmm3       // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
 516         pxor    xmm6, xmm7       // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
 517
 518 #undef V0
 519
 520         // Now we must reduce.  This is essentially the same as the 128-bit
 521         // case above, but more complicated because everything is bigger.
 522         // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
 523
 524         // First, shift the high bits down.
 525         movdqa  xmm0, xmm4              // x_3 again
 526         movdqa  xmm1, xmm4              // x_3 yet again
 527         movdqa  xmm2, xmm4              // x_3 again again
 528         pslld   xmm0, 30                // x_3: b_i for t^2
 529         pslld   xmm1, 27                // x_3: b_i for t^5
 530         pslld   xmm2, 22                // x_3: b_i for t^10
 531          movdqa xmm3, xmm5              // x_2 again
 532         pxor    xmm0, xmm1
 533          movdqa xmm1, xmm5              // x_2 again
 534         pxor    xmm0, xmm2              // b_3
 535          movdqa xmm2, xmm5              // x_2 again
 536          pslld  xmm3, 30                // x_2: b_i for t^2
 537          pslld  xmm1, 27                // x_2: b_i for t^5
 538          pslld  xmm2, 22                // x_2: b_i for t^10
 539          pxor   xmm3, xmm1
 540         movdqa  xmm1, xmm0
 541          pxor   xmm3, xmm2              // b_2
 542         psrldq  xmm0, 4
 543          movdqa xmm2, xmm3
 544         pslldq  xmm1, 12
 545          psrldq xmm3, 4
 546         pxor    xmm6, xmm0
 547          pslldq xmm2, 12
 548          pxor   xmm7, xmm3
 549         pxor    xmm5, xmm1
 550          pxor   xmm6, xmm2
 551
 552         // And then shift the low bits up.
 553         movdqa  xmm0, xmm4              // x_3 again
 554          movdqa xmm1, xmm5              // x_2 again
 555         movdqa  xmm2, xmm4              // x_3 yet again
 556          movdqa xmm3, xmm5              // x_2 yet again
 557         pxor    xmm6, xmm4              // x_1 and unit contrib from x_3
 558          pxor   xmm7, xmm5              // x_0 and unit contrib from x_2
 559         psrld   xmm4, 2
 560          psrld  xmm5, 2
 561         psrld   xmm0, 5
 562          psrld  xmm1, 5
 563         psrld   xmm2, 10
 564          psrld  xmm3, 10
 565         pxor    xmm4, xmm6              // x_1, with x_3 units and t^2
 566          pxor   xmm5, xmm7              // x_0, with x_2 units and t^2
 567         pxor    xmm0, xmm2              // x_3 t^5 and t^10 contribs
 568          pxor   xmm1, xmm3              // x_2 t^5 and t^10 contribs
 569         pxor    xmm0, xmm4              // high half of reduced result
 570         pxor    xmm1, xmm5              // low half; all done
 571 .endm
 572
 573 ///--------------------------------------------------------------------------
 574 /// Main code.
 575
 576 // There are a number of representations of field elements in this code and
 577 // it can be confusing.
 578 //
 579 //   * The `external format' consists of a sequence of contiguous bytes in
 580 //     memory called a `block'.  The GCM spec explains how to interpret this
 581 //     block as an element of a finite field.  As discussed extensively, this
 582 //     representation is very annoying for a number of reasons.  On the other
 583 //     hand, this code never actually deals with it directly.
 584 //
 585 //   * The `register format' consists of one or more XMM registers, depending
 586 //     on the block size.  The bytes in these registers are in reverse order
 587 //     -- so the least-significant byte of the lowest-numbered register holds
 588 //     the /last/ byte in the block.  If the block size is not a multiple of
 589 //     16 bytes, then there must be padding.  96-bit blocks are weird: the
 590 //     padding is inserted at the /least/ significant end, so the register
 591 //     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
 592 //     significant end.
 593 //
 594 //   * The `words' format consists of a sequence of bytes, as in the
 595 //     `external format', but, according to the blockcipher in use, the bytes
 596 //     within each 32-bit word may be reversed (`big-endian') or not
 597 //     (`little-endian').  Accordingly, there are separate entry points for
 598 //     each variant, identified with `b' or `l'.
 599
 600 #define SSEFUNC(f)                                                      \
 601         FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;                \
 602         FUNC(f)
 603
 604 SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
 605         // On entry, A points to a 128-bit field element in big-endian words
 606         // format; K points to a field-element in register format.  On exit,
 607         // A is updated with the product A K.
 608
 609 #if CPUFAM_X86
 610         mov     A, [SP + 4]
 611         mov     K, [SP + 8]
 612 #endif
 613   endprologue
 614         movdqu  xmm0, [A]
 615         movdqu  xmm1, [K]
 616         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 617         mul128
 618         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 619         movdqu  [A], xmm0
 620         ret
 621 ENDFUNC
 622
 623 SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
 624         // On entry, A points to a 128-bit field element in little-endian
 625         // words format; K points to a field-element in register format.  On
 626         // exit, A is updated with the product A K.
 627
 628 #if CPUFAM_X86
 629         mov     A, [SP + 4]
 630         mov     K, [SP + 8]
 631         ldgot   ecx
 632 #endif
 633   endprologue
 634         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
 635         movdqu  xmm0, [A]
 636         movdqu  xmm1, [K]
 637         pshufb  xmm0, xmm7
 638         mul128
 639         pshufb  xmm0, xmm7
 640         movdqu  [A], xmm0
 641         ret
 642 ENDFUNC
 643
 644 SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
 645         // On entry, A points to a 64-bit field element in big-endian words
 646         // format; K points to a field-element in register format.  On exit,
 647         // A is updated with the product A K.
 648
 649 #if CPUFAM_X86
 650         mov     A, [SP + 4]
 651         mov     K, [SP + 8]
 652 #endif
 653   endprologue
 654         movq    xmm0, [A]
 655         movq    xmm1, [K]
 656         pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
 657         mul64
 658         pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
 659         movq    [A], xmm0
 660         ret
 661 ENDFUNC
 662
 663 SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
 664         // On entry, A points to a 64-bit field element in little-endian
 665         // words format; K points to a field-element in register format.  On
 666         // exit, A is updated with the product A K.
 667
 668 #if CPUFAM_X86
 669         mov     A, [SP + 4]
 670         mov     K, [SP + 8]
 671         ldgot   ecx
 672 #endif
 673   endprologue
 674         movdqa  xmm7, [INTADDR(swaptab_64l, ecx)]
 675         movq    xmm0, [A]
 676         movq    xmm1, [K]
 677         pshufb  xmm0, xmm7
 678         mul64
 679         pshufb  xmm0, xmm7
 680         movq    [A], xmm0
 681         ret
 682 ENDFUNC
 683
 684 SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
 685         // On entry, A points to a 96-bit field element in big-endian words
 686         // format; K points to a field-element in register format (i.e., 16
 687         // bytes, with the first four bytes zero).  On exit, A is updated
 688         // with the product A K.
 689
 690 #if CPUFAM_X86
 691         mov     A, [SP + 4]
 692         mov     K, [SP + 8]
 693 #endif
 694   endprologue
 695         movq    xmm0, [A + 0]
 696         movd    xmm2, [A + 8]
 697         movdqu  xmm1, [K]
 698         punpcklqdq xmm0, xmm2
 699         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 700         mul96
 701         pshufd  xmm1, xmm0, SHUF(3, 2, 1, 0)
 702         psrldq  xmm0, 4
 703         movq    [A + 0], xmm1
 704         movd    [A + 8], xmm0
 705         ret
 706 ENDFUNC
 707
 708 SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
 709         // On entry, A points to a 96-bit field element in little-endian
 710         // words format; K points to a field-element in register format
 711         // (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
 712         // updated with the product A K.
 713
 714 #if CPUFAM_X86
 715         mov     A, [SP + 4]
 716         mov     K, [SP + 8]
 717         ldgot   ecx
 718 #endif
 719   endprologue
 720         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
 721         movq    xmm0, [A + 0]
 722         movd    xmm2, [A + 8]
 723         movdqu  xmm1, [K]
 724         punpcklqdq xmm0, xmm2
 725         pshufb  xmm0, xmm7
 726         mul96
 727         pshufb  xmm0, xmm7
 728         movq    [A + 0], xmm0
 729         psrldq  xmm0, 8
 730         movd    [A + 8], xmm0
 731         ret
 732 ENDFUNC
 733
 734 SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
 735         // On entry, A points to a 192-bit field element in big-endian words
 736         // format; K points to a field-element in register format.  On exit,
 737         // A is updated with the product A K.
 738
 739 #if CPUFAM_X86
 740         mov     A, [SP + 4]
 741         mov     K, [SP + 8]
 742 #endif
 743 #if CPUFAM_AMD64 && ABI_WIN
 744         stalloc 2*16 + 8
 745         savexmm xmm6, 0
 746         savexmm xmm7, 16
 747 #endif
 748   endprologue
 749         movdqu  xmm0, [A + 8]
 750         movq    xmm1, [A + 0]
 751         movdqu  xmm2, [K + 0]
 752         movq    xmm3, [K + 16]
 753         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 754         pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
 755         mul192
 756         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 757         pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
 758         movdqu  [A + 8], xmm0
 759         movq    [A + 0], xmm1
 760 #if CPUFAM_AMD64 && ABI_WIN
 761         rstrxmm xmm6, 0
 762         rstrxmm xmm7, 16
 763         stfree  2*16 + 8
 764 #endif
 765         ret
 766 ENDFUNC
 767
 768 SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
 769         // On entry, A points to a 192-bit field element in little-endian
 770         // words format; K points to a field-element in register format.  On
 771         // exit, A is updated with the product A K.
 772
 773 #if CPUFAM_X86
 774         mov     A, [SP + 4]
 775         mov     K, [SP + 8]
 776         ldgot   ecx
 777 #endif
 778 #if CPUFAM_AMD64 && ABI_WIN
 779         stalloc 2*16 + 8
 780         savexmm xmm6, 0
 781         savexmm xmm7, 16
 782 #endif
 783   endprologue
 784         movdqu  xmm0, [A + 8]
 785         movq    xmm1, [A + 0]
 786         movdqu  xmm2, [K + 0]
 787         movq    xmm3, [K + 16]
 788         pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
 789         pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
 790         mul192
 791         pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
 792         pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
 793         movdqu  [A + 8], xmm0
 794         movq    [A + 0], xmm1
 795 #if CPUFAM_AMD64 && ABI_WIN
 796         rstrxmm xmm6, 0
 797         rstrxmm xmm7, 16
 798         stfree  2*16 + 8
 799 #endif
 800         ret
 801 ENDFUNC
 802
 803 SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
 804         // On entry, A points to a 256-bit field element in big-endian words
 805         // format; K points to a field-element in register format.  On exit,
 806         // A is updated with the product A K.
 807
 808 #if CPUFAM_X86
 809         pushreg BP
 810         setfp
 811         mov     A, [SP + 8]
 812         mov     K, [SP + 12]
 813         stalloc 16
 814         and     SP, ~15
 815 #endif
 816 #if CPUFAM_AMD64 && ABI_WIN
 817         stalloc 3*16 + 8
 818         savexmm xmm6, 0
 819         savexmm xmm7, 16
 820         savexmm xmm8, 32
 821 #endif
 822   endprologue
 823         movdqu  xmm0, [A + 16]
 824         movdqu  xmm1, [A + 0]
 825         movdqu  xmm2, [K + 0]
 826         movdqu  xmm3, [K + 16]
 827         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 828         pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
 829         mul256
 830         pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
 831         pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
 832         movdqu  [A + 16], xmm0
 833         movdqu  [A + 0], xmm1
 834 #if CPUFAM_X86
 835         dropfp
 836         popreg  BP
 837 #endif
 838 #if CPUFAM_AMD64 && ABI_WIN
 839         rstrxmm xmm6, 0
 840         rstrxmm xmm7, 16
 841         rstrxmm xmm8, 32
 842         stfree  3*16 + 8
 843 #endif
 844         ret
 845 ENDFUNC
 846
 847 SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
 848         // On entry, A points to a 256-bit field element in little-endian
 849         // words format; K points to a field-element in register format.  On
 850         // exit, A is updated with the product A K.
 851
 852 #if CPUFAM_X86
 853         pushreg BP
 854         setfp
 855         mov     A, [SP + 8]
 856         mov     K, [SP + 12]
 857         stalloc 16
 858         ldgot   ecx
 859         and     SP, ~15
 860 #endif
 861 #if CPUFAM_AMD64 && ABI_WIN
 862         stalloc 3*16 + 8
 863         savexmm xmm6, 0
 864         savexmm xmm7, 16
 865         savexmm xmm8, 32
 866 #endif
 867   endprologue
 868         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
 869         movdqu  xmm0, [A + 16]
 870         movdqu  xmm1, [A + 0]
 871         movdqu  xmm2, [K + 0]
 872         movdqu  xmm3, [K + 16]
 873         pshufb  xmm0, xmm7
 874         pshufb  xmm1, xmm7
 875         mul256
 876         movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
 877         pshufb  xmm0, xmm7
 878         pshufb  xmm1, xmm7
 879         movdqu  [A + 16], xmm0
 880         movdqu  [A + 0], xmm1
 881 #if CPUFAM_X86
 882         dropfp
 883         popreg  BP
 884 #endif
 885 #if CPUFAM_AMD64 && ABI_WIN
 886         rstrxmm xmm6, 0
 887         rstrxmm xmm7, 16
 888         rstrxmm xmm8, 32
 889         stfree  3*16 + 8
 890 #endif
 891         ret
 892 ENDFUNC
 893
 894         RODATA
 895
 896         .balign 16
 897 swaptab_128l:
 898         // Table for byte-swapping little-endian words-format blocks larger
 899         // than 64 bits.
 900         .byte    15,  14,  13,  12,   11,  10,   9,   8
 901         .byte     7,   6,   5,   4,    3,   2,   1,   0
 902
 903         .balign 16
 904 swaptab_64l:
 905         // Table for byte-swapping 64-bit little-endian words-format blocks.
 906         .byte     7,   6,   5,   4,    3,   2,   1,   0
 907         .byte   255, 255, 255, 255,  255, 255, 255, 255
 908
 909 ///----- That's all, folks --------------------------------------------------