chiark - git - mdw - catacomb/blob - symm/gcm-arm-crypto.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// GCM acceleration for ARM processors
   4 ///
   5 /// (c) 2019 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   armv8-a
  34         .fpu    crypto-neon-fp-armv8
  35
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Multiplication macros.
  40
  41         // The good news is that we have a fancy instruction to do the
  42         // multiplications.  The bad news is that it's not particularly well-
  43         // suited to the job.
  44         //
  45         // For one thing, it only does a 64-bit multiplication, so in general
  46         // we'll need to synthesize the full-width multiply by hand.  For
  47         // another thing, it doesn't help with the reduction, so we have to
  48         // do that by hand too.  And, finally, GCM has crazy bit ordering,
  49         // and the instruction does nothing useful for that at all.
  50         //
  51         // Focusing on that last problem first: the bits aren't in monotonic
  52         // significance order unless we permute them.  If we reverse the byte
  53         // order, then we'll have the bits in monotonic order, but backwards,
  54         // so the degree-0 coefficient will be in the most-significant bit.
  55         //
  56         // This is less of a difficulty than it seems at first, because
  57         // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
  58         // SUM_{0<=j<n} v_j t^j; then
  59         //
  60         //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
  61         //
  62         // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
  63         // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
  64         // Then
  65         //
  66         //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
  67         //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
  68         //
  69         // which is almost the bit-reversal of u v, only it's shifted right
  70         // by one place.  Oh, well: we'll have to shift it back later.
  71         //
  72         // That was important to think about, but there's not a great deal to
  73         // do about it yet other than to convert what we've got from the
  74         // blockcipher's byte-ordering convention to our big-endian
  75         // convention.  Since this depends on the blockcipher convention,
  76         // we'll leave the caller to cope with this: the macros here will
  77         // assume that the operands are in `register' format, which is the
  78         // same as the external representation, except that the bytes within
  79         // each 64-bit piece are reversed.  In the commentary, pieces of
  80         // polynomial are numbered according to the degree of the
  81         // coefficients, so the unit coefficient of some polynomial a is in
  82         // a_0.
  83         //
  84         // The commentary for `mul128' is the most detailed.  The other
  85         // macros assume that you've already read and understood that.
  86
  87 .macro  mul128
  88         // Enter with u and v in q0 and q1 respectively; leave with z = u v
  89         // in q0.  Clobbers q1--q3, q8, q9.
  90
  91         // First for the double-precision multiplication.  It's tempting to
  92         // use Karatsuba's identity here, but I suspect that loses more in
  93         // the shifting, bit-twiddling, and dependency chains that it gains
  94         // in saving a multiplication which otherwise pipelines well.
  95         // q0 =                         // (u_0; u_1)
  96         // q1 =                         // (v_0; v_1)
  97         vmull.p64 q2, d1, d2            // u_1 v_0
  98         vmull.p64 q3, d0, d3            // u_0 v_1
  99         vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
 100         vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0
 101
 102         // Arrange the pieces to form a double-precision polynomial.
 103         veor    q2, q2, q3              // (m_1; m_0) = u_0 v_1 + u_1 v_0
 104         veor    d17, d17, d4            // x_2 = t_1 + m_1
 105         veor    d18, d18, d5            // x_1 = t_0 + m_0
 106         // q8 =                         // (x_3; x_2)
 107         // q9 =                         // (x_1; x_0)
 108
 109         // Two-and-a-half problems remain.  The first is that this product is
 110         // shifted left by one place, which is annoying.  Let's take care of
 111         // that now.  Once this is done, we'll be properly in GCM's backwards
 112         // bit-ordering.
 113         //
 114         // The half a problem is that the result wants to have its 64-bit
 115         // halves switched.  Here turns out to be the best place to arrange
 116         // for that.
 117         //
 118         //                   q9                             q8
 119         //      ,-------------.-------------.  ,-------------.-------------.
 120         //      | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
 121         //      `-------------^-------------'  `-------------^-------------'
 122         //            d19           d18              d17           d16
 123         //
 124         // We start by shifting each 64-bit lane right (from GCM's point of
 125         // view -- physically, left) by one place, which gives us this:
 126         //
 127         //                 low (q9)                      high (q8)
 128         //      ,-------------.-------------.  ,-------------.-------------.
 129         //      | x_0-x_62  0 |x_64-x_126 0 |  |x_128-x_190 0|x_192-x_254 0|
 130         //      `-------------^-------------'  `-------------^-------------'
 131         //            d19           d18              d17           d16
 132         //
 133         // but we've lost a bunch of bits.  We separately shift each lane
 134         // left by 63 places to give us the bits we lost.
 135         //
 136         //                 low (q3)                      high (q2)
 137         //      ,-------------.-------------.  ,-------------.-------------.
 138         //      |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
 139         //      `-------------^-------------'  `-------------^-------------'
 140         //                           d6              d5            d4
 141         //
 142         // Since we can address each of these pieces individually, putting
 143         // them together is relatively straightforward.
 144
 145
 146         vshr.u64 d6, d18, #63           // shifted left; just the carries
 147         vshl.u64 q9, q9, #1             // shifted right, but dropped carries
 148         vshr.u64 q2, q8, #63            // carries out of d16 and d17
 149         vshl.u64 q8, q8, #1             // high half shifted, carries dropped
 150         vorr    d0, d19, d6             // y_0
 151         vorr    d1, d18, d5             // y_1
 152         vorr    d2, d17, d4             // y_2
 153         vmov    d3, d16                 // y_3
 154
 155         // And the other one is that the result needs to be reduced modulo
 156         // p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 + t^2 + t +
 157         // 1 in our field.  So far, we've calculated halves z_0 = (y_0; y_1)
 158         // and z_1 = (y_2; y_3) with z_0 + z_1 R = u v, using the identity R =
 159         // t^128: now we must collapse the two halves of y together using the
 160         // other identity R = t^7 + t^2 + t + 1.
 161         //
 162         // We do this by working on y_2 and y_3 separately, so consider y_i
 163         // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
 164         // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
 165         // directly without breaking up the 64-bit word structure.  Instead,
 166         // we start by considering just y_i t^7 t^{64(i-2)}, which again
 167         // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
 168         // then
 169         //
 170         //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
 171         //
 172         // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
 173         // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
 174         // splits are different.  This is lovely, with one small snag: when
 175         // we do this to y_3, we end up with a contribution back into the
 176         // t^128 coefficient word.  But notice that only the low seven bits
 177         // of this word are affected, so there's no knock-on contribution
 178         // into the t^64 word.  Therefore, if we handle the high bits of each
 179         // word together, and then the low bits, everything will be fine.
 180
 181         // First, shift the high bits down.
 182         vshl.u64 q2, q1, #63            // the b_i for t
 183         vshl.u64 q3, q1, #62            // the b_i for t^2
 184         vshl.u64 q8, q1, #57            // the b_i for t^7
 185         veor    q2, q2, q3              // add them all together
 186         veor    q2, q2, q8
 187         veor    d2, d2, d5              // y_3's b_i fold into y_2
 188         veor    d1, d1, d4              // and y_2's b_i fold into y_1
 189
 190         // And then shift the low bits up.
 191         vshr.u64 q2, q1, #1             // the a_i for t
 192         vshr.u64 q3, q1, #2             // the a_i for t^2
 193         vshr.u64 q8, q1, #7             // the a_i for t^7
 194         veor    q0, q0, q1              // mix in the unit contribution
 195         veor    q2, q2, q3              // t and t^2 contribs
 196         veor    q0, q0, q8              // low, unit, and t^7 contribs
 197         veor    q0, q0, q2              // mix them together and we're done
 198 .endm
 199
 200 .macro  mul64
 201         // Multiply in the 64-bit field: takes u in d0 and v in d1, and
 202         // hands back z = u v in d0.  Writes d1--d4 as scratch along the way.
 203
 204         // One instruction gives the whole 128-bit carry-less product.
 205         vmull.p64 q0, d0, d1            // (d0; d1) = u v
 206
 207         // Move the product along by one bit, exchanging the two halves as
 208         // we go.  From here on the value is in GCM's bit order.
 209         vshl.u64 d3, d1, #1             // old d1, minus its top bit
 210         vshr.u64 d2, d0, #63            // the bit that falls out of old d0
 211         vshl.u64 d1, d0, #1             // old d0 moves across into d1
 212         vorr    d0, d3, d2              // old d1 plus carry lands in d0
 213
 214         // Reduce modulo p(t) = t^64 + t^4 + t^3 + t + 1, using the same
 215         // two-pass scheme as the 128-bit case, only with a single 64-bit
 216         // word (d1) to fold away.
 217
 218         // Pass one: the bits that spill over fold back into d1 itself.
 219         vshl.u64 d4, d1, #60            // spill for t^4
 220         vshl.u64 d3, d1, #61            // spill for t^3
 221         vshl.u64 d2, d1, #63            // spill for t
 222         veor    d2, d2, d4              // gather the three spills
 223         veor    d2, d2, d3
 224         veor    d1, d1, d2              // and fold them into d1
 225
 226         // Pass two: fold d1, and its shifted copies, into d0.
 227         vshr.u64 d4, d1, #4             // t^4 multiple
 228         vshr.u64 d3, d1, #3             // t^3 multiple
 229         vshr.u64 d2, d1, #1             // t multiple
 230         veor    d2, d2, d3              // t and t^3 together
 231         veor    d0, d0, d1              // unit multiple into the result
 232         veor    d0, d0, d2              // then t and t^3
 233         veor    d0, d0, d4              // then t^4: d0 is now z
 234 .endm
 235
 236 .macro  mul96
 237         // Enter with u and v in the most-significant three words of q0 and
 238         // q1 respectively, and zero in the low words, and zero in q15; leave
 239         // with z = u v in the high three words of q0, and /junk/ in the low
 240         // word.  Clobbers at least q1--q3, q8, q9 (vshr128/vshl128 unseen).
 241
 242         // This is an inconvenient size.  There's nothing for it but to do
 243         // four multiplications, as if for the 128-bit case.  The entry
 244         // conditions require the spare 32-bit word of each operand to be
 245         // zero, so there's no cruft to shift out before we start.  This
 246         // means that the high 64 bits of the result (from GCM's viewpoint)
 247         // will be zero.
 248         // q0 =                         // (u_0 + u_1 t^32; u_2)
 249         // q1 =                         // (v_0 + v_1 t^32; v_2)
 250         vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
 251         vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
 252         vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
 253         vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
 254                                         //   + u_1 v_1 t^64 = f
 255
 256         // Extract the high and low halves of the 192-bit result.  The answer
 257         // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
 258         // bits of the answer will end up in q0, and the high 96 bits will
 259         // end up in q1; we'll need both of these to have zero in their
 260         // bottom 32 bits.
 261         //
 262         // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
 263         // in the low 96 bits of a SIMD register, with junk in the top 32
 264         // bits; and top(x) is the high 96 bits, also arranged in the low 96
 265         // bits of a register, with /zero/ in the top 32 bits.
 266         veor    q8, q8, q9              // e_0 + e_1 = e
 267         vshr128 q1, q3, 32              // top(d t^128)
 268         vext.8  d19, d16, d17, #4       // top(e t^64)
 269         vshl.u64 d16, d0, #32           // top(f), sort of
 270         veor    d3, d3, d19             // q1 = top(d t^128 + e t^64)
 271         veor    d0, d0, d17             // q0 = bot([d t^128] + e t^64 + f)
 272         veor    d3, d3, d16             // q1 = top(d t^128 + e t^64 + f)
 273
 274         // Shift the product right by one place (from GCM's point of view),
 275         // but, unusually, don't swap the halves, because we need to work on
 276         // the 32-bit pieces later.  After this, we're in GCM bizarro-world.
 277         // q0 =                         // (?, x_2; x_1, x_0)
 278         // q1 =                         // (0, x_5; x_4, x_3)
 279         vshr.u64 d4, d0, #63            // carry from d0 to d1
 280         vshr.u64 d5, d2, #63            // carry from d2 to d3
 281         vshr.u32 d6, d3, #31            // carry from d3 to d0
 282         vshl.u64 q0, q0, #1             // shift low half
 283         vshl.u64 q1, q1, #1             // shift high half
 284         vorr    d1, d1, d4              // d0's carry arrives in d1
 285         vorr    d0, d0, d6              // d3's carry wraps round into d0
 286         vorr    d3, d3, d5              // d2's carry arrives in d3
 287
 288         // Finally, the reduction.  This is essentially the same as the
 289         // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
 290         // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
 291         // trouble for the general approach.
 292
 293         // First, shift the high bits down.
 294         vshl.u32 q2, q1, #26            // b_i for t^6
 295         vshl.u32 q3, q1, #23            // b_i for t^9
 296         vshl.u32 q8, q1, #22            // b_i for t^10
 297         veor    q2, q2, q3              // add them all together
 298         veor    q2, q2, q8
 299         vshl128 q3, q2, 64              // contribution into high half
 300         vshr128 q2, q2, 32              // and low half
 301         veor    q1, q1, q3              // mix them in
 302         veor    q0, q0, q2
 303
 304         // And then shift the low bits up.
 305         vshr.u32 q2, q1, #6             // a_i for t^6
 306         vshr.u32 q3, q1, #9             // a_i for t^9
 307         veor    q0, q0, q1              // mix in the unit contribution
 308         vshr.u32 q8, q1, #10            // a_i for t^10
 309         veor    q2, q2, q3              // mix together t^6 and t^9
 310         veor    q0, q0, q8              // mix in t^10
 311         veor    q0, q0, q2              // and the rest
 312
 313         // And finally swap the two halves.
 314         vswp    d0, d1
 315 .endm
 316
 317 .macro  mul192
 318         // Enter with u and v in d0--d2 and d3--d5 respectively; leave
 319         // with z = u v in d0--d2.  Clobbers q8--q15.
 320
 321         // Start multiplying and accumulating pieces of product.
 322         // (d0; d1; d2) =               // (u_0; u_1; u_2)
 323         // (d3; d4; d5) =               // (v_0; v_1; v_2)
 324         vmull.p64 q10, d0, d3           // e = u_0 v_0
 325
 326         vmull.p64 q12, d0, d4           //     u_0 v_1
 327         vmull.p64 q13, d1, d3           //     u_1 v_0
 328
 329         vmull.p64 q9, d0, d5            //     u_0 v_2
 330         vmull.p64 q14, d1, d4           //     u_1 v_1
 331         vmull.p64 q15, d2, d3           //     u_2 v_0
 332          veor   q12, q12, q13           // d = u_0 v_1 + u_1 v_0
 333
 334         vmull.p64 q11, d1, d5           //     u_1 v_2
 335         vmull.p64 q13, d2, d4           //     u_2 v_1
 336          veor   q9, q9, q14             //     u_0 v_2 + u_1 v_1
 337
 338         vmull.p64 q8, d2, d5            // a = u_2 v_2
 339          veor   q9, q9, q15             // c = u_0 v_2 + u_1 v_1 + u_2 v_0
 340          veor   q11, q11, q13           // b = u_1 v_2 + u_2 v_1
 341
 342         // Piece the product together.
 343         veor    d17, d17, d22  //  q8 = // (x_5; x_4)
 344         veor    d18, d18, d23
 345         veor    d19, d19, d24  //  q9 = // (x_3; x_2)
 346         veor    d20, d20, d25  // q10 = // (x_1; x_0)
 347
 348         // Shift the product right by one place (from GCM's point of view).
 349         vshr.u64 q11, q8, #63           // carry from d16/d17 to d17/d18
 350         vshr.u64 q12, q9, #63           // carry from d18/d19 to d19/d20
 351         vshr.u64 d26, d20, #63          // carry from d20 to d21
 352         vshl.u64 q8, q8, #1             // shift everything down
 353         vshl.u64 q9, q9, #1
 354         vshl.u64 q10, q10, #1
 355         vorr    d17, d17, d22           // and mix in the carries
 356         vorr    d18, d18, d23
 357         vorr    d19, d19, d24
 358         vorr    d20, d20, d25
 359         vorr    d21, d21, d26
 360
 361         // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
 362         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 363         // 128-bit case.  I don't know why.
 364
 365         // First, shift the high bits down.
 366         // q8 =                         // (y_5; y_4)
 367         // q9 =                         // (y_3; y_2)
 368         // q10 =                        // (y_1; y_0)
 369         vshl.u64 q11, q8, #63           // (y_5; y_4) b_i for t
 370         vshl.u64 d28, d18, #63          // y_3 b_i for t
 371         vshl.u64 q12, q8, #62           // (y_5; y_4) b_i for t^2
 372         vshl.u64 d29, d18, #62          // y_3 b_i for t^2
 373         vshl.u64 q13, q8, #57           // (y_5; y_4) b_i for t^7
 374         vshl.u64 d30, d18, #57          // y_3 b_i for t^7
 375         veor    q11, q11, q12           // mix them all together
 376         veor    d28, d28, d29
 377         veor    q11, q11, q13
 378         veor    d28, d28, d30
 379         veor    q9, q9, q11             // y_5, y_4 b_i fold into y_3, y_2
 380         veor    d20, d20, d28           // and y_3 b_i fold into y_1
 381
 382         // And finally shift the low bits up.  Also, switch the order of the
 383         // pieces for output.
 384         // q8 =                         // (y'_5; y'_4)
 385         // q9 =                         // (y'_3; y'_2)
 386         // q10 =                        // (y'_1; y'_0)
 387         vshr.u64 q11, q8, #1            // (y_5; y_4) a_i for t
 388         vshr.u64 d28, d18, #1           // y'_3 a_i for t
 389         vshr.u64 q12, q8, #2            // (y_5; y_4) a_i for t^2
 390         vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
 391         vshr.u64 q13, q8, #7            // (y_5; y_4) a_i for t^7
 392         vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
 393         veor    q8, q8, q11             // unit and t contribs, y_5 and y_4
 394         veor    d18, d18, d28           // unit and t contribs, y'_3
 395         veor    q12, q12, q13           // t^2 and t^7 contribs, y_5 and y_4
 396         veor    d29, d29, d30           // t^2 and t^7 contribs, y'_3
 397         veor    q8, q8, q12             // all four contribs, y_5 and y_4
 398         veor    d18, d18, d29           // all four contribs, y'_3
 399         veor    d0, d21, d18            // z_0 = y'_0 + folded y'_3
 400         veor    d1, d20, d17            // z_1 = y'_1 + folded y_4
 401         veor    d2, d19, d16            // z_2 = y'_2 + folded y_5
 402 .endm
 403
 404 .macro  mul256
 405         // Enter with u and v in q0/q1 and q2/q3 respectively; leave
 406         // with z = u v in q0/q1.  Clobbers q2, d6, and q8--q15.
 407
 408         // Now it's starting to look worthwhile to do Karatsuba.  Suppose
 409         // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
 410         //
 411         //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
 412         //
 413         // Let these coefficients of B^i be a, b, and c, respectively, and
 414         // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
 415         //
 416         //      q = r s = (u_0 + u_1) (v_0 + v_1)
 417         //        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
 418         //        = a + c + b
 419         //
 420         // The first two terms we've already calculated; the last is the
 421         // remaining one we want.  We'll set B = t^128.  We know how to do
 422         // 128-bit multiplications already, and Karatsuba is too annoying
 423         // there, so there'll be 12 multiplications altogether, rather than
 424         // the 16 we'd have if we did this the naïve way.
 425         // q0 =                         // u_0 = (u_00; u_01)
 426         // q1 =                         // u_1 = (u_10; u_11)
 427         // q2 =                         // v_0 = (v_00; v_01)
 428         // q3 =                         // v_1 = (v_10; v_11)
 429
 430         veor    q8, q0, q1              // u_* = (u_00 + u_10; u_01 + u_11)
 431         veor    q9, q2, q3              // v_* = (v_00 + v_10; v_01 + v_11)
 432
 433         // Start by building the cross product, q = u_* v_*.
 434         vmull.p64 q14, d16, d19         // u_*0 v_*1
 435         vmull.p64 q15, d17, d18         // u_*1 v_*0
 436         vmull.p64 q12, d17, d19         // u_*1 v_*1
 437         vmull.p64 q13, d16, d18         // u_*0 v_*0
 438         veor    q14, q14, q15           // u_*0 v_*1 + u_*1 v_*0
 439         veor    d25, d25, d28  // q12 = // q_1
 440         veor    d26, d26, d29  // q13 = // q_0
 441
 442         // Next, work on the low half, a = u_0 v_0.
 443         vmull.p64 q14, d0, d5           // u_00 v_01
 444         vmull.p64 q15, d1, d4           // u_01 v_00
 445         vmull.p64 q10, d1, d5           // u_01 v_01
 446         vmull.p64 q11, d0, d4           // u_00 v_00
 447         veor    q14, q14, q15           // u_00 v_01 + u_01 v_00
 448         veor    d21, d21, d28  // q10 = // a_1
 449         veor    d22, d22, d29  // q11 = // a_0
 450
 451         // Mix the pieces we have so far.
 452         veor    q12, q12, q10           // q_1 + a_1
 453         veor    q13, q13, q11           // q_0 + a_0
 454
 455         // Finally, the high half, c = u_1 v_1.
 456         vmull.p64 q14, d2, d7           // u_10 v_11
 457         vmull.p64 q15, d3, d6           // u_11 v_10
 458         vmull.p64 q8, d3, d7            // u_11 v_11
 459         vmull.p64 q9, d2, d6            // u_10 v_10
 460         veor    q14, q14, q15           // u_10 v_11 + u_11 v_10
 461         veor    d17, d17, d28  //  q8 = // c_1
 462         veor    d18, d18, d29  //  q9 = // c_0
 463
 464         // Finish mixing the product together.
 465         veor    q12, q12, q8   // q12 = // b_1
 466         veor    q13, q13, q9   // q13 = // b_0
 467         veor    q9, q9, q12             // b_1 lands on top of c_0
 468         veor    q10, q10, q13           // b_0 lands on top of a_1
 469
 470         // Shift the product right by one place (from GCM's point of view).
 471         vshr.u64 q0, q8, #63            // carry from d16/d17 to d17/d18
 472         vshr.u64 q1, q9, #63            // carry from d18/d19 to d19/d20
 473         vshr.u64 q2, q10, #63           // carry from d20/d21 to d21/d22
 474         vshr.u64 d6, d22, #63           // carry from d22 to d23
 475         vshl.u64 q8, q8, #1             // shift everything down
 476         vshl.u64 q9, q9, #1
 477         vshl.u64 q10, q10, #1
 478         vshl.u64 q11, q11, #1
 479         vorr    d17, d17, d0
 480         vorr    d18, d18, d1
 481         vorr    d19, d19, d2
 482         vorr    d20, d20, d3
 483         vorr    d21, d21, d4
 484         vorr    d22, d22, d5
 485         vorr    d23, d23, d6
 486
 487         // Now we must reduce.  This is essentially the same as the 192-bit
 488         // case above, but more complicated because everything is bigger.
 489         // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
 490
 491         // First, shift the high bits down.
 492         // q8 =                         // (y_7; y_6)
 493         // q9 =                         // (y_5; y_4)
 494         // q10 =                        // (y_3; y_2)
 495         // q11 =                        // (y_1; y_0)
 496         vshl.u64 q0, q8, #62            // (y_7; y_6) b_i for t^2
 497         vshl.u64 q12, q9, #62           // (y_5; y_4) b_i for t^2
 498         vshl.u64 q1, q8, #59            // (y_7; y_6) b_i for t^5
 499         vshl.u64 q13, q9, #59           // (y_5; y_4) b_i for t^5
 500         vshl.u64 q2, q8, #54            // (y_7; y_6) b_i for t^10
 501         vshl.u64 q14, q9, #54           // (y_5; y_4) b_i for t^10
 502         veor    q0, q0, q1              // mix the contributions together
 503         veor    q12, q12, q13
 504         veor    q0, q0, q2
 505         veor    q12, q12, q14
 506         veor    d19, d19, d0            // and combine into the lower pieces
 507         veor    d20, d20, d1
 508         veor    d21, d21, d24
 509         veor    d22, d22, d25
 510
 511         // And then shift the low bits up.  Also, switch the order of the
 512         // pieces for output.
 513         // q8 =                         // (y'_7; y'_6)
 514         // q9 =                         // (y'_5; y'_4)
 515         // q10 =                        // (y'_3; y'_2)
 516         // q11 =                        // (y'_1; y'_0)
 517         vshr.u64 q0, q8, #2             // (y_7; y_6) a_i for t^2
 518         vshr.u64 q12, q9, #2            // (y_5; y_4) a_i for t^2
 519         vshr.u64 q1, q8, #5             // (y_7; y_6) a_i for t^5
 520         vshr.u64 q13, q9, #5            // (y_5; y_4) a_i for t^5
 521         vshr.u64 q2, q8, #10            // (y_7; y_6) a_i for t^10
 522         vshr.u64 q14, q9, #10           // (y_5; y_4) a_i for t^10
 523
 524         veor    q8, q8, q0              // mix the contributions together
 525         veor    q1, q1, q2
 526         veor    q9, q9, q12
 527         veor    q13, q13, q14
 528         veor    q8, q8, q1
 529         veor    q9, q9, q13
 530         veor    d3, d20, d16            // and output
 531         veor    d2, d21, d17
 532         veor    d1, d22, d18
 533         veor    d0, d23, d19
 534 .endm
 535
 536 ///--------------------------------------------------------------------------
 537 /// Main code.
 538
 539 // There are a number of representations of field elements in this code and
 540 // it can be confusing.
 541 //
 542 //   * The `external format' consists of a sequence of contiguous bytes in
 543 //     memory called a `block'.  The GCM spec explains how to interpret this
 544 //     block as an element of a finite field.  As discussed extensively, this
 545 //     representation is very annoying for a number of reasons.  On the other
 546 //     hand, this code never actually deals with it directly.
 547 //
 548 //   * The `register format' consists of one or more NEON registers,
 549 //     depending on the block size.  The bytes in each 64-bit lane of these
 550 //     registers are in reverse order, compared to the external format.
 551 //
 552 //   * The `words' format consists of a sequence of bytes, as in the
 553 //     `external format', but, according to the blockcipher in use, the bytes
 554 //     within each 32-bit word may be reversed (`big-endian') or not
 555 //     (`little-endian').  Accordingly, there are separate entry points for
 556 //     each variant, identified with `b' or `l'.
 557
 558 FUNC(gcm_mulk_128b_arm_crypto)
 559         // Multiply in place: A <- A K in the 128-bit field.  r0 addresses A,
 560         // held in big-endian words format; r1 addresses the multiplier K,
 561         // already in register format.
 562
 563         vld1.8  {q1}, [r1]              // q1 = K
 564         vld1.8  {q0}, [r0]              // q0 = A, as stored
 565         vrev64.32 q0, q0                // swap words in each 64-bit lane
 566         mul128                          // q0 = A K
 567         vrev64.32 q0, q0                // and swap them back again
 568         vst1.8  {q0}, [r0]              // write the product over A
 569         bx      lr
 570 ENDFUNC
 571
 572 FUNC(gcm_mulk_128l_arm_crypto)
 573         // Multiply in place: A <- A K in the 128-bit field.  r0 addresses A,
 574         // held in little-endian words format; r1 addresses the multiplier
 575         // K, already in register format.
 576
 577         vld1.8  {q1}, [r1]              // q1 = K
 578         vld1.8  {q0}, [r0]              // q0 = A, as stored
 579         vrev64.8 q0, q0                 // reverse bytes in each 64-bit lane
 580         mul128                          // q0 = A K
 581         vrev64.8 q0, q0                 // and reverse them back again
 582         vst1.8  {q0}, [r0]              // write the product over A
 583         bx      lr
 584 ENDFUNC
 585
 586 FUNC(gcm_mulk_64b_arm_crypto)
 587         // Multiply in place: A <- A K in the 64-bit field.  r0 addresses A,
 588         // held in big-endian words format; r1 addresses the multiplier K,
 589         // already in register format.
 590
 591         vld1.8  {d1}, [r1]              // d1 = K
 592         vld1.8  {d0}, [r0]              // d0 = A, as stored
 593         vrev64.32 d0, d0                // swap the two words of A
 594         mul64                           // d0 = A K
 595         vrev64.32 d0, d0                // and swap them back again
 596         vst1.8  {d0}, [r0]              // write the product over A
 597         bx      lr
 598 ENDFUNC
 599
 600 FUNC(gcm_mulk_64l_arm_crypto)
 601         // On entry, r0 points to a 64-bit field element A in little-endian
 602         // words format; r1 points to a field-element K in register format.
 603         // On exit, A is updated with the product A K.  No zero register is
 604         // set up: `mul64' works entirely within d0--d4 and never reads q15.
 605
 606         vld1.8  {d0}, [r0]              // d0 = A, as stored
 607         vld1.8  {d1}, [r1]              // d1 = K
 608         vrev64.8 d0, d0                 // external bytes -> register format
 609         mul64                           // d0 = A K
 610         vrev64.8 d0, d0                 // back to little-endian words format
 611         vst1.8  {d0}, [r0]              // write the product over A
 612         bx      r14
 613 ENDFUNC
 614
 615 FUNC(gcm_mulk_96b_arm_crypto)
 616         // Multiply in place: A <- A K in the 96-bit field.  r0 addresses A,
 617         // held in big-endian words format; r1 addresses the multiplier K,
 618         // already in register format.  r3 and r12 are used as scratch.
 619
 620         mov     r12, #0                 // filler for the spare word
 621         vld1.8  {q1}, [r1]              // q1 = K
 622         vld1.8  {d0}, [r0]              // first two words of A
 623         ldr     r3, [r0, #8]            // third word of A
 624         vrev64.32 d0, d0                // swap the two words in d0
 625         vmov    d1, r12, r3             // d1 = (0, third word)
 626         vzero                           // mul96 wants zero in q15
 627         mul96
 628         vmov    r3, d1[1]               // third word of the product
 629         vrev64.32 d0, d0                // swap the first two words back
 630         vst1.8  {d0}, [r0]              // store the first two words
 631         str     r3, [r0, #8]            // and then the third
 632         bx      lr
 633 ENDFUNC
 634
 635 FUNC(gcm_mulk_96l_arm_crypto)
 636         // On entry, r0 points to a 96-bit field element A in little-endian
 637         // words format; r1 to K in register format.  On exit, A := A K.
 638
 639         ldr     r3, [r0, #8]            // third word of A
 640         mov     r12, #0                 // filler for the spare word
 641         vld1.8  {d0}, [r0]              // first two words of A
 642         vld1.8  {q1}, [r1]              // q1 = K
 643         vmov    d1, r3, r12             // d1 = (third word, 0)
 644         vrev64.8 q0, q0                 // external bytes -> register format
 645         vzero                           // mul96 requires zero in q15
 646         mul96
 647         vrev64.8 q0, q0                 // back to little-endian words format
 648         vmov    r3, d1[0]               // third word of the product
 649         vst1.8  {d0}, [r0]              // store the first two words
 650         str     r3, [r0, #8]            // and then the third
 651         bx      r14
 652 ENDFUNC
 653
 654 FUNC(gcm_mulk_192b_arm_crypto)
 655         // Multiply in place: A <- A K in the 192-bit field.  r0 addresses A,
 656         // held in big-endian words format; r1 addresses the multiplier K,
 657         // already in register format.
 658
 659         vld1.8  {d3-d5}, [r1]           // d3--d5 = K
 660         vld1.8  {d0-d2}, [r0]           // d0--d2 = A, as stored
 661         vrev64.32 d2, d2                // swap words in each 64-bit lane:
 662         vrev64.32 q0, q0                //   the third lane, then the rest
 663         mul192                          // d0--d2 = A K
 664         vrev64.32 d2, d2                // and swap them all back again
 665         vrev64.32 q0, q0
 666         vst1.8  {d0-d2}, [r0]           // write the product over A
 667         bx      lr
 668 ENDFUNC
 669
 670 FUNC(gcm_mulk_192l_arm_crypto)
 671         // Multiply in place: A <- A K in the 192-bit field.  r0 addresses A,
 672         // held in little-endian words format; r1 addresses the multiplier
 673         // K, already in register format.
 674
 675         vld1.8  {d3-d5}, [r1]           // d3--d5 = K
 676         vld1.8  {d0-d2}, [r0]           // d0--d2 = A, as stored
 677         vrev64.8 d2, d2                 // reverse bytes in each 64-bit lane:
 678         vrev64.8 q0, q0                 //   the third lane, then the rest
 679         mul192                          // d0--d2 = A K
 680         vrev64.8 d2, d2                 // and reverse them all back again
 681         vrev64.8 q0, q0
 682         vst1.8  {d0-d2}, [r0]           // write the product over A
 683         bx      lr
 684 ENDFUNC
 685
 686 FUNC(gcm_mulk_256b_arm_crypto)
 687         // Multiply in place: A <- A K in the 256-bit field.  r0 addresses A,
 688         // held in big-endian words format; r1 addresses the multiplier K,
 689         // already in register format.
 690
 691         vld1.8  {q2, q3}, [r1]          // q2/q3 = K
 692         vld1.8  {q0, q1}, [r0]          // q0/q1 = A, as stored
 693         vrev64.32 q1, q1                // swap words in each 64-bit lane:
 694         vrev64.32 q0, q0                //   upper pair, then lower pair
 695         mul256                          // q0/q1 = A K
 696         vrev64.32 q1, q1                // and swap them all back again
 697         vrev64.32 q0, q0
 698         vst1.8  {q0, q1}, [r0]          // write the product over A
 699         bx      lr
 700 ENDFUNC
 701
 702 FUNC(gcm_mulk_256l_arm_crypto)
 703         // Multiply in place: A <- A K in the 256-bit field.  r0 addresses A,
 704         // held in little-endian words format; r1 addresses the multiplier
 705         // K, already in register format.
 706
 707         vld1.8  {q2, q3}, [r1]          // q2/q3 = K
 708         vld1.8  {q0, q1}, [r0]          // q0/q1 = A, as stored
 709         vrev64.8 q1, q1                 // reverse bytes in each 64-bit lane:
 710         vrev64.8 q0, q0                 //   upper pair, then lower pair
 711         mul256                          // q0/q1 = A K
 712         vrev64.8 q1, q1                 // and reverse them all back again
 713         vrev64.8 q0, q0
 714         vst1.8  {q0, q1}, [r0]          // write the product over A
 715         bx      lr
 716 ENDFUNC
 717
 718 ///----- That's all, folks --------------------------------------------------