chiark - git - mdw - catacomb/blob - symm/gcm-arm64-pmull.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// GCM acceleration for ARM64 processors
   4 ///
   5 /// (c) 2019 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   armv8-a+crypto
  34
  35         .text
  36
  37 ///--------------------------------------------------------------------------
  38 /// Multiplication macros.
  39
  40         // The good news is that we have a fancy instruction to do the
  41         // multiplications.  The bad news is that it's not particularly well-
  42         // suited to the job.
  43         //
  44         // For one thing, it only does a 64-bit multiplication, so in general
  45         // we'll need to synthesize the full-width multiply by hand.  For
  46         // another thing, it doesn't help with the reduction, so we have to
  47         // do that by hand too.  And, finally, GCM has crazy bit ordering,
  48         // and the instruction does nothing useful for that at all.
  49         //
  50         // Focusing on that last problem first: the bits aren't in monotonic
  51         // significance order unless we permute them.  Fortunately, ARM64 has
  52         // an instruction which will just permute the bits in each byte for
  53         // us, so we don't have to worry about this very much.
  54         //
  55         // Our main weapons, the `pmull' and `pmull2' instructions, work on
  56         // 64-bit operands, in half of a vector register, and produce 128-bit
  57         // results.  But neither of them will multiply the high half of one
  58         // vector by the low half of a second one, so we have a problem,
  59         // which we solve by representing one of the operands redundantly:
  60         // rather than packing the 64-bit pieces together, we duplicate each
  61         // 64-bit piece across both halves of a register.
  62         //
  63         // The commentary for `mul128' is the most detailed.  The other
  64         // macros assume that you've already read and understood that.
  65
  66 .macro  mul128
  67         // Enter with u in v0, v (halves duplicated) in v1/v2, and 0 in v31;
  68         // leave with z = u v in v0.  Clobbers v1--v6.
  69
  70         // First for the double-precision multiplication.  It's tempting to
  71         // use Karatsuba's identity here, but I suspect that loses more in
  72         // the shifting, bit-twiddling, and dependency chains than it gains
  73         // in saving a multiplication which otherwise pipelines well.
  74         // v0 =                         // (u_0; u_1)
  75         // v1/v2 =                      // (v_0; v_0) / (v_1; v_1)
  76         pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
  77         pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
  78         pmull2  v5.1q, v0.2d, v2.2d     // (t_1; x_3) = u_1 v_1
  79         pmull   v6.1q, v0.1d, v1.1d     // (x_0; t_0) = u_0 v_0
  80
  81         // Arrange the pieces to form a double-precision polynomial.
  82         eor     v3.16b, v3.16b, v4.16b  // (m_0; m_1) = u_0 v_1 + u_1 v_0
  83         vshr128 v4, v3, 64              // (m_1; 0)
  84         vshl128 v3, v3, 64              // (0; m_0)
  85         eor     v1.16b, v5.16b, v4.16b  // (x_2; x_3)
  86         eor     v0.16b, v6.16b, v3.16b  // (x_0; x_1)
  87
  88         // And now the only remaining difficulty is that the result needs to
  89         // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
  90         // = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
  91         // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
  92         // now we must collapse z_0 (v0) and z_1 (v1) together using the other
  93         // identity R = t^7 + t^2 + t + 1.
  94         //
  95         // Write z_1 = y_2 + y_3 t^64, and work on each word: consider y_i
  96         // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
  97         // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
  98         // directly without breaking up the 64-bit word structure.  Instead,
  99         // we start by considering just y_i t^7 t^{64(i-2)}, which again
 100         // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
 101         // then
 102         //
 103         //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
 104         //
 105         // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
 106         // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
 107         // splits are different.  This is lovely, with one small snag: when
 108         // we do this to y_3, we end up with a contribution back into the
 109         // t^128 coefficient word.  But notice that only the low seven bits
 110         // of this word are affected, so there's no knock-on contribution
 111         // into the t^64 word.  Therefore, if we handle the high bits of each
 112         // word together, and then the low bits, everything will be fine.
 113
 114         // First, shift the high bits down.
 115         ushr    v2.2d, v1.2d, #63       // the b_i for t
 116         ushr    v3.2d, v1.2d, #62       // the b_i for t^2
 117         ushr    v4.2d, v1.2d, #57       // the b_i for t^7
 118         eor     v2.16b, v2.16b, v3.16b  // add them all together
 119         eor     v2.16b, v2.16b, v4.16b
 120         vshr128 v3, v2, 64              // (b_3; 0)
 121         vshl128 v4, v2, 64              // (0; b_2)
 122         eor     v1.16b, v1.16b, v3.16b  // contribution into high half
 123         eor     v0.16b, v0.16b, v4.16b  // and low half
 124
 125         // And then shift the low bits up.
 126         shl     v2.2d, v1.2d, #1
 127         shl     v3.2d, v1.2d, #2
 128         shl     v4.2d, v1.2d, #7
 129         eor     v1.16b, v1.16b, v2.16b  // unit and t contribs
 130         eor     v3.16b, v3.16b, v4.16b  // t^2 and t^7 contribs
 131         eor     v0.16b, v0.16b, v1.16b  // mix everything together
 132         eor     v0.16b, v0.16b, v3.16b  // ... and we're done
 133 .endm
 134
 135 .macro  mul64
 136         // Enter with u and v in the low halves of v0 and v1, respectively;
 137         // leave with z = u v in x2.  Clobbers v0, x3, and x4.
 138
 139         // The multiplication is thankfully easy.
 140         // v0 =                                 // (u; ?)
 141         // v1 =                                 // (v; ?)
 142         pmull   v0.1q, v0.1d, v1.1d             // u v
 143
 144         // Now we must reduce.  This is essentially the same as the 128-bit
 145         // case above, but mostly simpler because everything is smaller.  The
 146         // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
 147
 148         // Before we get stuck in, transfer the product to general-purpose
 149         // registers.
 150         mov     x3, v0.d[1]             // high 64 bits of the product
 151         mov     x2, v0.d[0]             // low 64 bits of the product
 152
 153         // First, shift the high bits down.
 154         eor     x4, x3, x3, lsr #1      // pre-mix t^3 and t^4
 155         eor     x3, x3, x3, lsr #63     // mix in t contribution
 156         eor     x3, x3, x4, lsr #60     // shift and mix in t^3 and t^4
 157
 158         // And then shift the low bits up.
 159         eor     x3, x3, x3, lsl #1      // mix unit and t; pre-mix t^3, t^4
 160         eor     x2, x2, x3              // fold them in
 161         eor     x2, x2, x3, lsl #3      // and t^3 and t^4
 162 .endm
 163
 164 .macro  mul96
 165         // Enter with u in the least-significant 96 bits of v0, with zero in
 166         // the upper 32 bits, and with the least-significant 64 bits of v in
 167         // both halves of v1, and the upper 32 bits of v in the low 32 bits
 168         // of each half of v2, with zero in the upper 32 bits; and with zero
 169         // in v31.  Yes, that's a bit hairy.  Leave with the product u v in
 170         // the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
 171         // v1--v6.
 172
 173         // This is an inconvenient size.  There's nothing for it but to do
 174         // four multiplications, as if for the 128-bit case.  No masking
 175         // or shifting is done here: the entry conditions above already
 176         // require zero in the top 32 bits of u and of each copy of v_2,
 177         // so the full product d t^128 + e t^64 + f assembled below fits in
 178         // 192 bits.
 179         // v0 =                         // (u_0 + u_1 t^32; u_2)
 180         // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
 181         // v2 =                         // (v_2; v_2)
 182         pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) = e_0
 183         pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) = e_1
 184         pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (d; 0)
 185         pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
 186                                         //   + u_1 v_1 t^64 = f
 187
 188         // Extract the high and low halves of the 192-bit result.  The answer
 189         // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
 190         // bits of the answer will end up in v0, with junk in the top 32
 191         // bits; the high 96 bits will end up in v1, which must have zero in
 192         // its top 32 bits.
 193         //
 194         // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
 195         // in the low 96 bits of a SIMD register, with junk in the top 32
 196         // bits; and top(x) is the high 96 bits, also arranged in the low 96
 197         // bits of a register, with /zero/ in the top 32 bits.
 198         eor     v4.16b, v4.16b, v5.16b  // e_0 + e_1 = e
 199         vshl128 v6, v6, 32              // top(d t^128)
 200         vshr128 v5, v4, 32              // top(e t^64)
 201         vshl128 v4, v4, 64              // bot(e t^64)
 202         vshr128 v1, v3, 96              // top(f)
 203         eor     v6.16b, v6.16b, v5.16b  // top(d t^128 + e t^64)
 204         eor     v0.16b, v3.16b, v4.16b  // bot([d t^128] + e t^64 + f)
 205         eor     v1.16b, v1.16b, v6.16b  // top(e t^64 + d t^128 + f)
 206
 207         // Finally, the reduction.  This is essentially the same as the
 208         // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
 209         // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
 210         // trouble for the general approach.  Unfortunately, we have to do
 211         // this in 32-bit pieces rather than 64.
 212
 213         // First, shift the high bits down.
 214         ushr    v2.4s, v1.4s, #26       // the b_i for t^6
 215         ushr    v3.4s, v1.4s, #23       // the b_i for t^9
 216         ushr    v4.4s, v1.4s, #22       // the b_i for t^10
 217         eor     v2.16b, v2.16b, v3.16b  // add them all together
 218         eor     v2.16b, v2.16b, v4.16b
 219         vshr128 v3, v2, 64              // contribution for high half
 220         vshl128 v2, v2, 32              // contribution for low half
 221         eor     v1.16b, v1.16b, v3.16b  // apply to high half
 222         eor     v0.16b, v0.16b, v2.16b  // and low half
 223
 224         // And then shift the low bits up.
 225         shl     v2.4s, v1.4s, #6
 226         shl     v3.4s, v1.4s, #9
 227         shl     v4.4s, v1.4s, #10
 228         eor     v1.16b, v1.16b, v2.16b  // unit and t^6 contribs
 229         eor     v3.16b, v3.16b, v4.16b  // t^9 and t^10 contribs
 230         eor     v0.16b, v0.16b, v1.16b  // mix everything together
 231         eor     v0.16b, v0.16b, v3.16b  // ... and we're done
 232 .endm
 233
 234 .macro  mul192
 235         // Enter with u in v0 and the less-significant half of v1, with v
 236         // duplicated across both halves of v2/v3/v4, and with zero in v31.
 237         // Leave with the product u v in v0 and the bottom half of v1.
 238         // Clobbers v16--v25.
 239
 240         // Start multiplying and accumulating pieces of product.
 241         // v0 =                         // (u_0; u_1)
 242         // v1 =                         // (u_2; ?)
 243         // v2 =                         // (v_0; v_0)
 244         // v3 =                         // (v_1; v_1)
 245         // v4 =                         // (v_2; v_2)
 246         pmull   v16.1q, v0.1d, v2.1d    //   a = u_0 v_0
 247
 248         pmull   v19.1q, v0.1d, v3.1d    //       u_0 v_1
 249         pmull2  v21.1q, v0.2d, v2.2d    //       u_1 v_0
 250
 251         pmull   v17.1q, v0.1d, v4.1d    //       u_0 v_2
 252         pmull2  v22.1q, v0.2d, v3.2d    //       u_1 v_1
 253         pmull   v23.1q, v1.1d, v2.1d    //       u_2 v_0
 254          eor    v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0
 255
 256         pmull2  v20.1q, v0.2d, v4.2d    //       u_1 v_2
 257         pmull   v24.1q, v1.1d, v3.1d    //       u_2 v_1
 258          eor    v17.16b, v17.16b, v22.16b //     u_0 v_2 + u_1 v_1
 259
 260         pmull   v18.1q, v1.1d, v4.1d    //   e = u_2 v_2
 261          eor    v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
 262          eor    v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
 263
 264         // Piece the product together.
 265         // v16 =                        // (a_0; a_1)
 266         // v19 =                        // (b_0; b_1)
 267         // v17 =                        // (c_0; c_1)
 268         // v20 =                        // (d_0; d_1)
 269         // v18 =                        // (e_0; e_1)
 270         vshl128 v21, v19, 64            // (0; b_0)
 271         ext     v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
 272         vshr128 v23, v20, 64            // (d_1; 0)
 273         eor     v16.16b, v16.16b, v21.16b // (x_0; x_1)
 274         eor     v17.16b, v17.16b, v22.16b // (x_2; x_3)
 275         eor     v18.16b, v18.16b, v23.16b // (x_4; x_5)
 276
 277         // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
 278         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 279         // 128-bit case.  I don't know why.
 280
 281         // First, shift the high bits down.
 282         // v16 =                        // (y_0; y_1)
 283         // v17 =                        // (y_2; y_3)
 284         // v18 =                        // (y_4; y_5)
 285         mov     v19.d[0], v17.d[1]      // (y_3; ?)
 286
 287         ushr    v23.2d, v18.2d, #63     // hi b_i for t
 288         ushr    d20, d19, #63           // lo b_i for t
 289         ushr    v24.2d, v18.2d, #62     // hi b_i for t^2
 290         ushr    d21, d19, #62           // lo b_i for t^2
 291         ushr    v25.2d, v18.2d, #57     // hi b_i for t^7
 292         ushr    d22, d19, #57           // lo b_i for t^7
 293         eor     v23.16b, v23.16b, v24.16b // mix them all together
 294         eor     v20.8b, v20.8b, v21.8b
 295         eor     v23.16b, v23.16b, v25.16b
 296         eor     v20.8b, v20.8b, v22.8b
 297
 298         // Permute the high pieces while we fold in the b_i.
 299         eor     v17.16b, v17.16b, v23.16b // (b_4; b_5) into (y_2; y_3)
 300         vshl128 v20, v20, 64            // (0; b_3)
 301         mov     v19.d[0], v18.d[1]      // (y_5; ?)
 302         ext     v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
 303         eor     v16.16b, v16.16b, v20.16b // b_3 into y_1
 304
 305         // And finally shift the low bits up.
 306         // v16 =                        // (y'_0; y'_1)
 307         // v17 =                        // (y'_2; ?)
 308         // v18 =                        // (y'_3; y'_4)
 309         // v19 =                        // (y'_5; ?)
 310         shl     v20.2d, v18.2d, #1
 311         shl     d23, d19, #1
 312         shl     v21.2d, v18.2d, #2
 313         shl     d24, d19, #2
 314         shl     v22.2d, v18.2d, #7
 315         shl     d25, d19, #7
 316         eor     v18.16b, v18.16b, v20.16b // unit and t contribs
 317         eor     v19.8b, v19.8b, v23.8b
 318         eor     v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
 319         eor     v24.8b, v24.8b, v25.8b
 320         eor     v18.16b, v18.16b, v21.16b // all contribs
 321         eor     v19.8b, v19.8b, v24.8b
 322         eor     v0.16b, v16.16b, v18.16b // mix them into the low half
 323         eor     v1.8b, v17.8b, v19.8b
 324 .endm
 325
 326 .macro  mul256
 327         // Enter with u in v0/v1, with v duplicated across both halves of
 328         // v2--v5, and with zero in v31.  Leave with the product u v in
 329         // v0/v1.  Clobbers v16--v21 and v24--v30.
 330
 331         // Now it's starting to look worthwhile to do Karatsuba.  Suppose
 332         // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
 333         //
 334         //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
 335         //
 336         // Name these coefficients of B^i a, b, and c, respectively, and
 337         // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
 338         //
 339         //      q = r s = (u_0 + u_1) (v_0 + v_1)
 340         //        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
 341         //        = a + c + b
 342         //
 343         // The first two terms we calculate directly; the last is the one
 344         // we still want, so b = q + a + c.  We'll set B = t^128.  We know
 345         // how to do 128-bit multiplications already, and Karatsuba is too
 346         // annoying there, so there'll be 12 multiplications altogether,
 347         // rather than the 16 we'd have if we did this the naïve way.
 348         // v0 =                         // u_0 = (u_00; u_01)
 349         // v1 =                         // u_1 = (u_10; u_11)
 350         // v2 =                         // (v_00; v_00)
 351         // v3 =                         // (v_01; v_01)
 352         // v4 =                         // (v_10; v_10)
 353         // v5 =                         // (v_11; v_11)
 354
 355         eor     v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
 356         eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
 357         eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
 358
 359         // Start by building the cross product, q = u_* v_*.
 360         pmull   v24.1q, v28.1d, v30.1d  // u_*0 v_*1
 361         pmull2  v25.1q, v28.2d, v29.2d  // u_*1 v_*0
 362         pmull   v20.1q, v28.1d, v29.1d  // u_*0 v_*0
 363         pmull2  v21.1q, v28.2d, v30.2d  // u_*1 v_*1
 364         eor     v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
 365         vshr128 v25, v24, 64
 366         vshl128 v24, v24, 64
 367         eor     v20.16b, v20.16b, v24.16b // q_0
 368         eor     v21.16b, v21.16b, v25.16b // q_1
 369
 370         // Next, work on the low half, a = u_0 v_0
 371         pmull   v24.1q, v0.1d, v3.1d    // u_00 v_01
 372         pmull2  v25.1q, v0.2d, v2.2d    // u_01 v_00
 373         pmull   v16.1q, v0.1d, v2.1d    // u_00 v_00
 374         pmull2  v17.1q, v0.2d, v3.2d    // u_01 v_01
 375         eor     v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
 376         vshr128 v25, v24, 64
 377         vshl128 v24, v24, 64
 378         eor     v16.16b, v16.16b, v24.16b // a_0
 379         eor     v17.16b, v17.16b, v25.16b // a_1
 380
 381         // Mix the pieces we have so far.
 382         eor     v20.16b, v20.16b, v16.16b // q_0 + a_0
 383         eor     v21.16b, v21.16b, v17.16b // q_1 + a_1
 384
 385         // Finally, work on the high half, c = u_1 v_1
 386         pmull   v24.1q, v1.1d, v5.1d    // u_10 v_11
 387         pmull2  v25.1q, v1.2d, v4.2d    // u_11 v_10
 388         pmull   v18.1q, v1.1d, v4.1d    // u_10 v_10
 389         pmull2  v19.1q, v1.2d, v5.2d    // u_11 v_11
 390         eor     v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
 391         vshr128 v25, v24, 64
 392         vshl128 v24, v24, 64
 393         eor     v18.16b, v18.16b, v24.16b // c_0
 394         eor     v19.16b, v19.16b, v25.16b // c_1
 395
 396         // Finish mixing the product together.
 397         eor     v20.16b, v20.16b, v18.16b // b_0 = q_0 + a_0 + c_0
 398         eor     v21.16b, v21.16b, v19.16b // b_1 = q_1 + a_1 + c_1
 399         eor     v17.16b, v17.16b, v20.16b // a_1 + b_0
 400         eor     v18.16b, v18.16b, v21.16b // c_0 + b_1
 401
 402         // Now we must reduce.  This is essentially the same as the 192-bit
 403         // case above, but more complicated because everything is bigger.
 404         // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
 405         // v16 =                        // (y_0; y_1)
 406         // v17 =                        // (y_2; y_3)
 407         // v18 =                        // (y_4; y_5)
 408         // v19 =                        // (y_6; y_7)
 409         ushr    v24.2d, v18.2d, #62     // (y_4; y_5) b_i for t^2
 410         ushr    v25.2d, v19.2d, #62     // (y_6; y_7) b_i for t^2
 411         ushr    v26.2d, v18.2d, #59     // (y_4; y_5) b_i for t^5
 412         ushr    v27.2d, v19.2d, #59     // (y_6; y_7) b_i for t^5
 413         ushr    v28.2d, v18.2d, #54     // (y_4; y_5) b_i for t^10
 414         ushr    v29.2d, v19.2d, #54     // (y_6; y_7) b_i for t^10
 415         eor     v24.16b, v24.16b, v26.16b // mix the contributions together
 416         eor     v25.16b, v25.16b, v27.16b
 417         eor     v24.16b, v24.16b, v28.16b
 418         eor     v25.16b, v25.16b, v29.16b
 419         vshr128 v26, v25, 64            // slide contribs into position
 420         ext     v25.16b, v24.16b, v25.16b, #8
 421         vshl128 v24, v24, 64
 422         eor     v18.16b, v18.16b, v26.16b
 423         eor     v17.16b, v17.16b, v25.16b
 424         eor     v16.16b, v16.16b, v24.16b
 425
 426         // And then shift the low bits up.
 427         // v16 =                        // (y'_0; y'_1)
 428         // v17 =                        // (y'_2; y'_3)
 429         // v18 =                        // (y'_4; y'_5)
 430         // v19 =                        // (y'_6; y'_7)
 431         shl     v24.2d, v18.2d, #2      // (y'_4; y_5) a_i for t^2
 432         shl     v25.2d, v19.2d, #2      // (y_6; y_7) a_i for t^2
 433         shl     v26.2d, v18.2d, #5      // (y'_4; y_5) a_i for t^5
 434         shl     v27.2d, v19.2d, #5      // (y_6; y_7) a_i for t^5
 435         shl     v28.2d, v18.2d, #10     // (y'_4; y_5) a_i for t^10
 436         shl     v29.2d, v19.2d, #10     // (y_6; y_7) a_i for t^10
 437         eor     v18.16b, v18.16b, v24.16b // mix the contributions together
 438         eor     v19.16b, v19.16b, v25.16b
 439         eor     v26.16b, v26.16b, v28.16b
 440         eor     v27.16b, v27.16b, v29.16b
 441         eor     v18.16b, v18.16b, v26.16b
 442         eor     v19.16b, v19.16b, v27.16b
 443         eor     v0.16b, v16.16b, v18.16b
 444         eor     v1.16b, v17.16b, v19.16b
 445 .endm
 446
 447 ///--------------------------------------------------------------------------
 448 /// Main code.
 449
 450 // There are a number of representations of field elements in this code and
 451 // it can be confusing.
 452 //
 453 //   * The `external format' consists of a sequence of contiguous bytes in
 454 //     memory called a `block'.  The GCM spec explains how to interpret this
 455 //     block as an element of a finite field.  As discussed extensively, this
 456 //     representation is very annoying for a number of reasons.  On the other
 457 //     hand, this code never actually deals with it directly.
 458 //
 459 //   * The `register format' consists of one or more SIMD registers,
 460 //     depending on the block size.  The bits in each byte are reversed,
 461 //     compared to the external format, which makes the polynomials
 462 //     completely vanilla, unlike all of the other GCM implementations.
 463 //
 464 //   * The `table format' is just like the `register format', only the two
 465 //     halves of each 128-bit SIMD register are the same, so we need twice
 466 //     as many registers.
 467 //
 468 //   * The `words format' consists of a sequence of bytes, as in the
 469 //     `external format', but, according to the blockcipher in use, the bytes
 470 //     within each 32-bit word may be reversed (`big-endian') or not
 471 //     (`little-endian').  Accordingly, there are separate entry points for
 472 //     each variant, identified with `b' or `l'.
 473
 474 FUNC(gcm_mulk_128b_arm64_pmull)
 475         // Multiply by a fixed key.  x0 -> 128-bit field element A in
 476         // big-endian words format, replaced on exit by the product A K;
 477         // x1 -> field element K in table format.
 478
 479         ldp     q1, q2, [x1]            // K, both table registers
 480         ldr     q0, [x0]                // A
 481         vzero                           // mul128 wants zero in v31
 482         rbit    v0.16b, v0.16b          // flip the bits in each byte
 483         rev32   v0.16b, v0.16b          // and the bytes in each word
 484         mul128                          // v0 = A K
 485         rev32   v0.16b, v0.16b          // undo both permutations
 486         rbit    v0.16b, v0.16b
 487         str     q0, [x0]                // write the product over A
 488         ret
 489 ENDFUNC
 490
 491 FUNC(gcm_mulk_128l_arm64_pmull)
 492         // Multiply by a fixed key.  x0 -> 128-bit field element A in
 493         // little-endian words format, replaced on exit by the product A K;
 494         // x1 -> field element K in table format.
 495
 496         ldp     q1, q2, [x1]            // K, both table registers
 497         ldr     q0, [x0]                // A
 498         rbit    v0.16b, v0.16b          // flip the bits in each byte
 499         vzero                           // mul128 wants zero in v31
 500         mul128                          // v0 = A K
 501         rbit    v0.16b, v0.16b          // flip the bits back
 502         str     q0, [x0]                // write the product over A
 503         ret
 504 ENDFUNC
 505
 506 FUNC(gcm_mulk_64b_arm64_pmull)
 507         // Multiply by a fixed key.  x0 -> 64-bit field element A in
 508         // big-endian words format, replaced on exit by the product A K;
 509         // x1 -> field element K in table format.
 510
 511         ldr     q1, [x1]                // K
 512         ldr     d0, [x0]                // A
 513         rbit    v0.8b, v0.8b            // flip the bits in each byte
 514         rev32   v0.8b, v0.8b            // and the bytes in each word
 515         mul64                           // x2 = A K
 516         ror     x2, x2, #32             // swap the two 32-bit words
 517         rbit    x2, x2                  // then reverse all 64 bits
 518         str     x2, [x0]                // write the product over A
 519         ret
 520 ENDFUNC
 521
 522 FUNC(gcm_mulk_64l_arm64_pmull)
 523         // Multiply by a fixed key.  x0 -> 64-bit field element A in
 524         // little-endian words format, replaced on exit by the product A K;
 525         // x1 -> field element K in table format.
 526
 527         ldr     q1, [x1]                // K
 528         ldr     d0, [x0]                // A
 529         rbit    v0.8b, v0.8b            // flip the bits in each byte
 530         mul64                           // x2 = A K
 531         rev     x2, x2                  // reverse the byte order
 532         rbit    x2, x2                  // then reverse all 64 bits
 533         str     x2, [x0]                // write the product over A
 534         ret
 535 ENDFUNC
 536
 537 FUNC(gcm_mulk_96b_arm64_pmull)
 538         // Multiply by a fixed key.  x0 -> 96-bit field element A in
 539         // big-endian words format, replaced on exit by the product A K;
 540         // x1 -> field element K in table format.
 541
 542         ldp     q1, q2, [x1]            // K, both table registers
 543         ldr     d0, [x0]                // low 64 bits of A
 544         ldr     w2, [x0, #8]            // top 32 bits, zero-extended
 545         mov     v0.d[1], x2             // v0 = A with zero above bit 95
 546         vzero                           // mul96 wants zero in v31
 547         rbit    v0.16b, v0.16b          // flip the bits in each byte
 548         rev32   v0.16b, v0.16b          // and the bytes in each word
 549         mul96                           // low 96 bits of v0 = A K
 550         rev32   v0.16b, v0.16b          // undo both permutations
 551         rbit    v0.16b, v0.16b
 552         mov     w2, v0.s[2]             // pick out the third word
 553         str     d0, [x0]                // store only 96 bits: the junk
 554         str     w2, [x0, #8]            //   in the top word is dropped
 555         ret
 556 ENDFUNC
 557
 558 FUNC(gcm_mulk_96l_arm64_pmull)
 559         // Multiply by a fixed key.  x0 -> 96-bit field element A in
 560         // little-endian words format, replaced on exit by the product A K;
 561         // x1 -> field element K in table format.
 562
 563         ldp     q1, q2, [x1]            // K, both table registers
 564         ldr     w2, [x0, #8]            // top 32 bits, zero-extended
 565         ldr     d0, [x0]                // low 64 bits of A
 566         mov     v0.d[1], x2             // v0 = A with zero above bit 95
 567         vzero                           // mul96 wants zero in v31
 568         rbit    v0.16b, v0.16b          // flip the bits in each byte
 569         mul96                           // low 96 bits of v0 = A K
 570         rbit    v0.16b, v0.16b          // flip the bits back
 571         mov     w2, v0.s[2]             // pick out the third word
 572         str     d0, [x0]                // store only 96 bits: the junk
 573         str     w2, [x0, #8]            //   in the top word is dropped
 574         ret
 575 ENDFUNC
 576
 577 FUNC(gcm_mulk_192b_arm64_pmull)
 578         // Multiply by a fixed key.  x0 -> 192-bit field element A in
 579         // big-endian words format, replaced on exit by the product A K;
 580         // x1 -> field element K in table format.
 581
 582         ldp     q2, q3, [x1]            // K: first two table registers
 583         ldr     q4, [x1, #32]           //   and the third
 584         ldr     q0, [x0]                // low 128 bits of A
 585         ldr     d1, [x0, #16]           // high 64 bits of A
 586         vzero                           // mul192 wants zero in v31
 587         rbit    v0.16b, v0.16b          // low part: flip bits per byte
 588         rev32   v0.16b, v0.16b          //   and bytes per word
 589         rbit    v1.8b, v1.8b            // high part likewise
 590         rev32   v1.8b, v1.8b
 591         mul192                          // v0 and low half of v1 = A K
 592         rbit    v0.16b, v0.16b          // undo both permutations
 593         rev32   v0.16b, v0.16b
 594         rbit    v1.8b, v1.8b
 595         rev32   v1.8b, v1.8b
 596         str     q0, [x0]                // write the product over A
 597         str     d1, [x0, #16]
 598         ret
 599 ENDFUNC
 600
 601 FUNC(gcm_mulk_192l_arm64_pmull)
 602         // Multiply by a fixed key.  x0 -> 192-bit field element A in
 603         // little-endian words format, replaced on exit by the product A K;
 604         // x1 -> field element K in table format.
 605
 606         ldp     q2, q3, [x1]            // K: first two table registers
 607         ldr     q4, [x1, #32]           //   and the third
 608         ldr     q0, [x0]                // low 128 bits of A
 609         ldr     d1, [x0, #16]           // high 64 bits of A
 610         vzero                           // mul192 wants zero in v31
 611         rbit    v1.8b, v1.8b            // flip the bits in each byte
 612         rbit    v0.16b, v0.16b
 613         mul192                          // v0 and low half of v1 = A K
 614         rbit    v1.8b, v1.8b            // flip the bits back
 615         rbit    v0.16b, v0.16b
 616         str     q0, [x0]                // write the product over A
 617         str     d1, [x0, #16]
 618         ret
 619 ENDFUNC
 620
 621 FUNC(gcm_mulk_256b_arm64_pmull)
 622         // Multiply by a fixed key.  x0 -> 256-bit field element A in
 623         // big-endian words format, replaced on exit by the product A K;
 624         // x1 -> field element K in table format.
 625
 626         ldp     q2, q3, [x1]            // K: low two table registers
 627         ldp     q4, q5, [x1, #32]       //   and the high two
 628         ldp     q0, q1, [x0]            // A
 629         vzero                           // mul256 wants zero in v31
 630         rbit    v0.16b, v0.16b          // low half: flip bits per byte
 631         rev32   v0.16b, v0.16b          //   and bytes per word
 632         rbit    v1.16b, v1.16b          // high half likewise
 633         rev32   v1.16b, v1.16b
 634         mul256                          // v0/v1 = A K
 635         rbit    v0.16b, v0.16b          // undo both permutations
 636         rev32   v0.16b, v0.16b
 637         rbit    v1.16b, v1.16b
 638         rev32   v1.16b, v1.16b
 639         stp     q0, q1, [x0]            // write the product over A
 640         ret
 641 ENDFUNC
 642
 643 FUNC(gcm_mulk_256l_arm64_pmull)
 644         // Multiply by a fixed key.  x0 -> 256-bit field element A in
 645         // little-endian words format, replaced on exit by the product A K;
 646         // x1 -> field element K in table format.
 647
 648         ldp     q2, q3, [x1]            // K: low two table registers
 649         ldp     q4, q5, [x1, #32]       //   and the high two
 650         ldp     q0, q1, [x0]            // A
 651         vzero                           // mul256 wants zero in v31
 652         rbit    v1.16b, v1.16b          // flip the bits in each byte
 653         rbit    v0.16b, v0.16b
 654         mul256                          // v0/v1 = A K
 655         rbit    v1.16b, v1.16b          // flip the bits back
 656         rbit    v0.16b, v0.16b
 657         stp     q0, q1, [x0]            // write the product over A
 658         ret
 659 ENDFUNC
 660
 661 ///----- That's all, folks --------------------------------------------------