chiark - git - mdw - catacomb/blob - symm/gcm-arm-crypto.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// GCM acceleration for ARM processors
   4 ///
   5 /// (c) 2019 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software: you can redistribute it and/or modify it
  13 /// under the terms of the GNU Library General Public License as published
  14 /// by the Free Software Foundation; either version 2 of the License, or
  15 /// (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful, but
  18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 /// Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb.  If not, write to the Free Software
  24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 /// USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   armv8-a
  34         .fpu    crypto-neon-fp-armv8
  35
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Multiplication macros.
  40
  41         // The good news is that we have a fancy instruction to do the
  42         // multiplications.  The bad news is that it's not particularly well-
  43         // suited to the job.
  44         //
  45         // For one thing, it only does a 64-bit multiplication, so in general
  46         // we'll need to synthesize the full-width multiply by hand.  For
  47         // another thing, it doesn't help with the reduction, so we have to
  48         // do that by hand too.  And, finally, GCM has crazy bit ordering,
  49         // and the instruction does nothing useful for that at all.
  50         //
  51         // Focusing on that last problem first: the bits aren't in monotonic
  52         // significance order unless we permute them.  If we reverse the byte
  53         // order, then we'll have the bits in monotonic order, but backwards,
  54         // so the degree-0 coefficient will be in the most-significant bit.
  55         //
  56         // This is less of a difficulty than it seems at first, because
  57         // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
  58         // SUM_{0<=j<n} v_j t^j; then
  59         //
  60         //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
  61         //
  62         // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
  63         // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
  64         // Then
  65         //
  66         //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
  67         //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
  68         //
  69         // which is almost the bit-reversal of u v, only it's shifted right
  70         // by one place.  Putting this another way, what we have is actually
  71         // the bit reversal of the product u v t.  We could get the correct
  72         // answer (modulo p(t)) if we'd sneakily divided one of the operands
  73         // by t before we started.  Conveniently, v is actually the secret
  74         // value k set up by the GCM `mktable' function, so we can arrange to
  75         // actually store k/t (mod p(t)) and then the product will come out
  76         // correct (modulo p(t)) and we won't have anything more to worry
  77         // about here.
  78         //
  79         // That was important to think about, but there's not a great deal to
  80         // do about it yet other than to convert what we've got from the
  81         // blockcipher's byte-ordering convention to our big-endian
  82         // convention.  Since this depends on the blockcipher convention,
  83         // we'll leave the caller to cope with this: the macros here will
  84         // assume that the operands are in `register' format, which is the
  85         // same as the external representation, except that the bytes within
  86         // each 64-bit piece are reversed.  In the commentary, pieces of
  87         // polynomial are numbered according to the degree of the
  88         // coefficients, so the unit coefficient of some polynomial a is in
  89         // a_0.
  90         //
  91         // The commentary for `mul128' is the most detailed.  The other
  92         // macros assume that you've already read and understood that.
  93
  94 .macro  mul128
  95         // Enter with u and v in q0 and q1 respectively; leave with z = u v
  96         // in q0.  Clobbers q1--q3, q8, q9.
  97
  98         // First for the double-precision multiplication.  It's tempting to
  99         // use Karatsuba's identity here, but I suspect that loses more in
 100         // the shifting, bit-twiddling, and dependency chains that it gains
 101         // in saving a multiplication which otherwise pipelines well.
 102         // q0 =                         // (u_0; u_1)
 103         // q1 =                         // (v_0; v_1)
 104         vmull.p64 q2, d1, d2            // u_1 v_0
 105         vmull.p64 q3, d0, d3            // u_0 v_1
 106         vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
 107         vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0
 108
 109         // Arrange the pieces to form a double-precision polynomial.
 110         veor    q2, q2, q3              // (m_1; m_0) = u_0 v_1 + u_1 v_0
 111         veor    d17, d17, d4            // x_2 = t_1 + m_1
 112         veor    d18, d18, d5            // x_1 = t_0 + m_0
 113         // q8 =                         // (x_3; x_2)
 114         // q9 =                         // (x_1; x_0)
 115
 116         // One-and-a-half problems remain.
 117         //
 118         // The full-size problem is that the result needs to be reduced
 119         // modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
 120         // t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
 121         // such that z_0 + z_1 R = u v using the identity R = t^128: now we
 122         // must collapse the two halves of y together using the other
 123         // identity R = t^7 + t^2 + t + 1.
 124         //
 125         // We do this by working on x_2 and x_3 separately, so consider x_i
 126         // for i = 2 or 3.  Certainly, x_i t^{64i} = x_i R t^{64(i-2) =
 127         // (t^7 + t^2 + t + 1) x_i t^{64(i-2)}, but we can't use that
 128         // directly without breaking up the 64-bit word structure.  Instead,
 129         // we start by considering just x_i t^7 t^{64(i-2)}, which again
 130         // looks tricky.  Now, split x_i = a_i + t^57 b_i, with deg a_i < 57;
 131         // then
 132         //
 133         //      x_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
 134         //
 135         // We can similarly decompose x_i t^2 and x_i t into a pair of 64-bit
 136         // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
 137         // splits are different.  This is lovely, with one small snag: when
 138         // we do this to x_3, we end up with a contribution back into the
 139         // t^128 coefficient word.  But notice that only the low seven bits
 140         // of this word are affected, so there's no knock-on contribution
 141         // into the t^64 word.  Therefore, if we handle the high bits of each
 142         // word together, and then the low bits, everything will be fine.
 143
 144         // First, shift the high bits down.
 145         vshl.u64 q2, q8, #63            // the b_i for t
 146         vshl.u64 q3, q8, #62            // the b_i for t^2
 147         vshl.u64 q0, q8, #57            // the b_i for t^7
 148         veor    q2, q2, q3              // add them all together
 149         veor    q2, q2, q0
 150         veor    d18, d18, d5            // contribution into low half
 151         veor    d17, d17, d4            // and high half
 152
 153         // And then shift the low bits up.
 154         vshr.u64 q2, q8, #1
 155         vshr.u64 q3, q8, #2
 156         vshr.u64 q1, q8, #7
 157         veor    q8, q8, q9              // mix in the unit contribution
 158         veor    q2, q2, q3              // t and t^2 contribs
 159         veor    q1, q1, q8              // low, unit, and t^7 contribs
 160         veor    d1, d2, d4              // mix them together and swap halves
 161         veor    d0, d3, d5
 162 .endm
 163
 164 .macro  mul64
 165         // Enter with u and v in the low halves of d0 and d1 respectively;
 166         // leave with z = u v in d0.  Clobbers d1--d5.
 167
 168         // The multiplication is thankfully easy.
 169         vmull.p64 q0, d0, d1            // u v
 170
 171         // Now we must reduce.  This is essentially the same as the 128-bit
 172         // case above, but mostly simpler because everything is smaller.  The
 173         // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
 174
 175         // First, shift the high bits down.
 176         vshl.u64 d2, d0, #63            // b_i for t
 177         vshl.u64 d3, d0, #61            // b_i for t^3
 178         vshl.u64 d4, d0, #60            // b_i for t^4
 179         veor    d2, d2, d3              // add them all together
 180         veor    d2, d2, d4
 181         veor    d0, d0, d2              // contribution back into high half
 182
 183         // And then shift the low bits up.
 184         vshr.u64 d2, d0, #1
 185         vshr.u64 d3, d0, #3
 186         vshr.u64 d4, d0, #4
 187         veor    d0, d0, d1              // mix in the unit contribution
 188         veor    d2, d2, d3              // t and t^3 contribs
 189         veor    d0, d0, d4              // low, unit, and t^4
 190         veor    d0, d0, d2              // mix them together and we're done
 191 .endm
 192
 193 .macro  mul96
 194         // Enter with u and v in the most-significant three words of q0 and
 195         // q1 respectively, and zero in the low words, and zero in q15; leave
 196         // with z = u v in the high three words of q0, and /junk/ in the low
 197         // word.  Clobbers q1--q3, q8, q9.
 198
 199         // This is an inconvenient size.  There's nothing for it but to do
 200         // four multiplications, as if for the 128-bit case.
 201         // q0 =                         // (u_0 + u_1 t^32; u_2)
 202         // q1 =                         // (v_0 + v_1 t^32; v_2)
 203         vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
 204         vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
 205         vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
 206         vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
 207                                         //   + u_1 v_1 t^64 = f
 208
 209         // Extract the high and low halves of the 192-bit result.  The answer
 210         // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
 211         // bits of the answer will end up in q0, and the high 96 bits will
 212         // end up in q1; we'll need both of these to have zero in their
 213         // bottom 32 bits.
 214         //
 215         // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
 216         // in the low 96 bits of a SIMD register, with junk in the top 32
 217         // bits; and top(x) is the high 96 bits, also arranged in the low 96
 218         // bits of a register, with /zero/ in the top 32 bits.
 219         veor    q8, q8, q9              // e_0 + e_1 = e
 220         vshr128 q1, q3, 32              // top(d t^128)
 221         vext.8  d19, d16, d17, #4       // top(e t^64)
 222         vshl.u64 d16, d0, #32           // top(f), sort of
 223         veor    d3, d3, d19             // q1 = top(d t^128 + e t^64)
 224         veor    d0, d0, d17             // q0 = bot([d t^128] + e t^64 + f)
 225         veor    d3, d3, d16             // q1 = top(d t^128 + e t^64 + f)
 226
 227         // Finally, the reduction.  This is essentially the same as the
 228         // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
 229         // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
 230         // trouble for the general approach.
 231
 232         // First, shift the high bits down.
 233         vshl.u32 q2, q1, #26            // b_i for t^6
 234         vshl.u32 q3, q1, #23            // b_i for t^9
 235         vshl.u32 q8, q1, #22            // b_i for t^10
 236         veor    q2, q2, q3              // add them all together
 237         veor    q2, q2, q8
 238         vshl128 q3, q2, 64              // contribution into high half
 239         vshr128 q2, q2, 32              // and low half
 240         veor    q1, q1, q3              // mix them in
 241         veor    q0, q0, q2
 242
 243         // And then shift the low bits up.
 244         vshr.u32 q2, q1, #6
 245         vshr.u32 q3, q1, #9
 246         veor    q0, q0, q1              // mix in the unit contribution
 247         vshr.u32 q8, q1, #10
 248         veor    q2, q2, q3              // mix together t^6 and t^9
 249         veor    q0, q0, q8              // mix in t^10
 250         veor    q0, q0, q2              // and the rest
 251
 252         // And finally swap the two halves.
 253         vswp    d0, d1
 254 .endm
 255
 256 .macro  mul192
 257         // Enter with u and v in d0--d2 and d3--d5 respectively; leave
 258         // with z = u v in d0--d2.  Clobbers q8--q15.
 259
 260         // Start multiplying and accumulating pieces of product.
 261         // (d0; d1; d2) =               // (u_0; u_1; u_2)
 262         // (d3; d4; d5) =               // (v_0; v_1; v_2)
 263         vmull.p64 q10, d0, d3           // e = u_0 v_0
 264
 265         vmull.p64 q12, d0, d4           //     u_0 v_1
 266         vmull.p64 q13, d1, d3           //     u_1 v_0
 267
 268         vmull.p64 q9, d0, d5            //     u_0 v_2
 269         vmull.p64 q14, d1, d4           //     u_1 v_1
 270         vmull.p64 q15, d2, d3           //     u_2 v_0
 271          veor   q12, q12, q13           // d = u_0 v_1 + u_1 v_0
 272
 273         vmull.p64 q11, d1, d5           //     u_1 v_2
 274         vmull.p64 q13, d2, d4           //     u_2 v_1
 275          veor   q9, q9, q14             //     u_0 v_2 + u_1 v_1
 276
 277         vmull.p64 q8, d2, d5            // a = u_2 v_2
 278          veor   q9, q9, q15             // c = u_0 v_2 + u_1 v_1 + u_2 v_0
 279          veor   q11, q11, q13           // b = u_1 v_2 + u_2 v_1
 280
 281         // Piece the product together.
 282         veor    d17, d17, d22  //  q8 = // (x_5; x_4)
 283         veor    d18, d18, d23
 284         veor    d19, d19, d24  //  q9 = // (x_3; x_2)
 285         veor    d20, d20, d25  // q10 = // (x_1; x_0)
 286
 287         // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
 288         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
 289         // 128-bit case.  I don't know why.
 290
 291         // First, shift the high bits down.
 292         // q8 =                         // (y_5; y_4)
 293         // q9 =                         // (y_3; y_2)
 294         // q10 =                        // (y_1; y_0)
 295         vshl.u64 q11, q8, #63           // (y_5; y_4) b_i for t
 296         vshl.u64 d28, d18, #63          // y_3 b_i for t
 297         vshl.u64 q12, q8, #62           // (y_5; y_4) b_i for t^2
 298         vshl.u64 d29, d18, #62          // y_3 b_i for t^2
 299         vshl.u64 q13, q8, #57           // (y_5; y_4) b_i for t^7
 300         vshl.u64 d30, d18, #57          // y_3 b_i for t^7
 301         veor    q11, q11, q12           // mix them all together
 302         veor    d28, d28, d29
 303         veor    q11, q11, q13
 304         veor    d28, d28, d30
 305         veor    q9, q9, q11
 306         veor    d20, d20, d28
 307
 308         // And finally shift the low bits up.  Also, switch the order of the
 309         // pieces for output.
 310         // q8 =                         // (y'_5; y'_4)
 311         // q9 =                         // (y'_3; y'_2)
 312         // q10 =                        // (y'_1; y'_0)
 313         vshr.u64 q11, q8, #1            // (y_5; y_4) a_i for t
 314         vshr.u64 d28, d18, #1           // y'_3 a_i for t
 315         vshr.u64 q12, q8, #2            // (y_5; y_4) a_i for t^2
 316         vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
 317         vshr.u64 q13, q8, #7            // (y_5; y_4) a_i for t^7
 318         vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
 319         veor    q8, q8, q11
 320         veor    d18, d18, d28
 321         veor    q12, q12, q13
 322         veor    d29, d29, d30
 323         veor    q8, q8, q12
 324         veor    d18, d18, d29
 325         veor    d0, d21, d18
 326         veor    d1, d20, d17
 327         veor    d2, d19, d16
 328 .endm
 329
 330 .macro  mul256
 331         // Enter with u and v in q0/q1 and q2/q3 respectively; leave
 332         // with z = u v in q0/q1.  Clobbers q8--q15.
 333
 334         // Now it's starting to look worthwhile to do Karatsuba.  Suppose
 335         // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
 336         //
 337         //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
 338         //
 339         // Name these coefficients of B^i be a, b, and c, respectively, and
 340         // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
 341         //
 342         //      q = r s = (u_0 + u_1) (v_0 + v_1)
 343         //        = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
 344         //        = a + c + b
 345         //
 346         // The first two terms we've already calculated; the last is the
 347         // remaining one we want.  We'll set B = t^128.  We know how to do
 348         // 128-bit multiplications already, and Karatsuba is too annoying
 349         // there, so there'll be 12 multiplications altogether, rather than
 350         // the 16 we'd have if we did this the naïve way.
 351         // q0 =                         // u_0 = (u_00; u_01)
 352         // q1 =                         // u_1 = (u_10; u_11)
 353         // q2 =                         // v_0 = (v_00; v_01)
 354         // q3 =                         // v_1 = (v_10; v_11)
 355
 356         veor    q8, q0, q1              // u_* = (u_00 + u_10; u_01 + u_11)
 357         veor    q9, q2, q3              // v_* = (v_00 + v_10; v_01 + v_11)
 358
 359         // Start by building the cross product, q = u_* v_*.
 360         vmull.p64 q14, d16, d19         // u_*0 v_*1
 361         vmull.p64 q15, d17, d18         // u_*1 v_*0
 362         vmull.p64 q12, d17, d19         // u_*1 v_*1
 363         vmull.p64 q13, d16, d18         // u_*0 v_*0
 364         veor    q14, q14, q15           // u_*0 v_*1 + u_*1 v_*0
 365         veor    d25, d25, d28  // q12 = // q_1
 366         veor    d26, d26, d29  // q13 = // q_0
 367
 368         // Next, work on the low half, a = u_0 v_0.
 369         vmull.p64 q14, d0, d5           // u_00 v_01
 370         vmull.p64 q15, d1, d4           // u_01 v_00
 371         vmull.p64 q10, d1, d5           // u_01 v_01
 372         vmull.p64 q11, d0, d4           // u_00 v_00
 373         veor    q14, q14, q15           // u_00 v_01 + u_01 v_00
 374         veor    d21, d21, d28  // q10 = // a_1
 375         veor    d22, d22, d29  // q11 = // a_0
 376
 377         // Mix the pieces we have so far.
 378         veor    q12, q12, q10
 379         veor    q13, q13, q11
 380
 381         // Finally, the high half, c = u_1 v_1.
 382         vmull.p64 q14, d2, d7           // u_10 v_11
 383         vmull.p64 q15, d3, d6           // u_11 v_10
 384         vmull.p64 q8, d3, d7            // u_11 v_11
 385         vmull.p64 q9, d2, d6            // u_10 v_10
 386         veor    q14, q14, q15           // u_10 v_11 + u_11 v_10
 387         veor    d17, d17, d28  //  q8 = // c_1
 388         veor    d18, d18, d29  //  q9 = // c_0
 389
 390         // Finish mixing the product together.
 391         veor    q12, q12, q8   // q12 = // b_1
 392         veor    q13, q13, q9   // q13 = // b_0
 393         veor    q9, q9, q12
 394         veor    q10, q10, q13
 395
 396         // Now we must reduce.  This is essentially the same as the 192-bit
 397         // case above, but more complicated because everything is bigger.
 398         // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
 399
 400         // First, shift the high bits down.
 401         // q8 =                         // (y_7; y_6)
 402         // q9 =                         // (y_5; y_4)
 403         // q10 =                        // (y_3; y_2)
 404         // q11 =                        // (y_1; y_0)
 405         vshl.u64 q0, q8, #62            // (y_7; y_6) b_i for t^2
 406         vshl.u64 q12, q9, #62           // (y_5; y_4) b_i for t^2
 407         vshl.u64 q1, q8, #59            // (y_7; y_6) b_i for t^5
 408         vshl.u64 q13, q9, #59           // (y_5; y_4) b_i for t^5
 409         vshl.u64 q2, q8, #54            // (y_7; y_6) b_i for t^10
 410         vshl.u64 q14, q9, #54           // (y_5; y_4) b_i for t^10
 411         veor    q0, q0, q1              // mix the contributions together
 412         veor    q12, q12, q13
 413         veor    q0, q0, q2
 414         veor    q12, q12, q14
 415         veor    d19, d19, d0            // and combine into the lower pieces
 416         veor    d20, d20, d1
 417         veor    d21, d21, d24
 418         veor    d22, d22, d25
 419
 420         // And then shift the low bits up.  Also, switch the order of the
 421         // pieces for output.
 422         // q8 =                         // (y'_7; y'_6)
 423         // q9 =                         // (y'_5; y'_4)
 424         // q10 =                        // (y'_3; y'_2)
 425         // q11 =                        // (y'_1; y'_0)
 426         vshr.u64 q0, q8, #2             // (y_7; y_6) a_i for t^2
 427         vshr.u64 q12, q9, #2            // (y_5; y'_4) a_i for t^2
 428         vshr.u64 q1, q8, #5             // (y_7; y_6) a_i for t^5
 429         vshr.u64 q13, q9, #5            // (y_5; y_4) a_i for t^5
 430         vshr.u64 q2, q8, #10            // (y_7; y_6) a_i for t^10
 431         vshr.u64 q14, q9, #10           // (y_5; y_4) a_i for t^10
 432
 433         veor    q8, q8, q0              // mix the contributions together
 434         veor    q1, q1, q2
 435         veor    q9, q9, q12
 436         veor    q13, q13, q14
 437         veor    q8, q8, q1
 438         veor    q9, q9, q13
 439         veor    d3, d20, d16            // and output
 440         veor    d2, d21, d17
 441         veor    d1, d22, d18
 442         veor    d0, d23, d19
 443 .endm
 444
 445 ///--------------------------------------------------------------------------
 446 /// Main code.
 447
 448 // There are a number of representations of field elements in this code and
 449 // it can be confusing.
 450 //
 451 //   * The `external format' consists of a sequence of contiguous bytes in
 452 //     memory called a `block'.  The GCM spec explains how to interpret this
 453 //     block as an element of a finite field.  As discussed extensively, this
 454 //     representation is very annoying for a number of reasons.  On the other
 455 //     hand, this code never actually deals with it directly.
 456 //
 457 //   * The `register format' consists of one or more NEON registers,
 458 //     depending on the block size.  The bytes in each 64-bit lane of these
 459 //     registers are in reverse order, compared to the external format.
 460 //
 461 //   * The `words' format consists of a sequence of bytes, as in the
 462 //     `external format', but, according to the blockcipher in use, the bytes
 463 //     within each 32-bit word may be reversed (`big-endian') or not
 464 //     (`little-endian').  Accordingly, there are separate entry points for
 465 //     each variant, identified with `b' or `l'.
 466
 467 FUNC(gcm_mulk_128b_arm_crypto)
 468         // On entry, r0 points to a 128-bit field element A in big-endian
 469         // words format; r1 points to a field-element K in register format.
 470         // On exit, A is updated with the product A K.
 471
 472         vld1.8  {q0}, [r0]
 473         vld1.8  {q1}, [r1]
 474         vrev64.32 q0, q0
 475         mul128
 476         vrev64.32 q0, q0
 477         vst1.8  {q0}, [r0]
 478         bx      r14
 479 ENDFUNC
 480
 481 FUNC(gcm_mulk_128l_arm_crypto)
 482         // On entry, r0 points to a 128-bit field element A in little-endian
 483         // words format; r1 points to a field-element K in register format.
 484         // On exit, A is updated with the product A K.
 485
 486         vld1.8  {q0}, [r0]
 487         vld1.8  {q1}, [r1]
 488         vrev64.8 q0, q0
 489         mul128
 490         vrev64.8 q0, q0
 491         vst1.8  {q0}, [r0]
 492         bx      r14
 493 ENDFUNC
 494
 495 FUNC(gcm_mulk_64b_arm_crypto)
 496         // On entry, r0 points to a 64-bit field element A in big-endian
 497         // words format; r1 points to a field-element K in register format.
 498         // On exit, A is updated with the product A K.
 499
 500         vld1.8  {d0}, [r0]
 501         vld1.8  {d1}, [r1]
 502         vrev64.32 d0, d0
 503         mul64
 504         vrev64.32 d0, d0
 505         vst1.8  {d0}, [r0]
 506         bx      r14
 507 ENDFUNC
 508
 509 FUNC(gcm_mulk_64l_arm_crypto)
 510         // On entry, r0 points to a 64-bit field element A in little-endian
 511         // words format; r1 points to a field-element K in register format.
 512         // On exit, A is updated with the product A K.
 513
 514         vld1.8  {d0}, [r0]
 515         vld1.8  {d1}, [r1]
 516         vrev64.8 d0, d0
 517         vzero
 518         mul64
 519         vrev64.8 d0, d0
 520         vst1.8  {d0}, [r0]
 521         bx      r14
 522 ENDFUNC
 523
 524 FUNC(gcm_mulk_96b_arm_crypto)
 525         // On entry, r0 points to a 96-bit field element A in big-endian
 526         // words format; r1 points to a field-element K in register format.
 527         // On exit, A is updated with the product A K.
 528
 529         ldr     r3, [r0, #8]
 530         mov     r12, #0
 531         vld1.8  {d0}, [r0]
 532         vld1.8  {q1}, [r1]
 533         vrev64.32 d0, d0
 534         vmov    d1, r12, r3
 535         vzero
 536         mul96
 537         vrev64.32 d0, d0
 538         vmov    r3, d1[1]
 539         vst1.8  {d0}, [r0]
 540         str     r3, [r0, #8]
 541         bx      r14
 542 ENDFUNC
 543
 544 FUNC(gcm_mulk_96l_arm_crypto)
 545         // On entry, r0 points to a 128-bit field element A in little-endian
 546         // words format; r1 points to a field-element K in register format.
 547         // On exit, A is updated with the product A K.
 548
 549         ldr     r3, [r0, #8]
 550         mov     r12, #0
 551         vld1.8  {d0}, [r0]
 552         vld1.8  {q1}, [r1]
 553         vmov    d1, r3, r12
 554         vrev64.8 q0, q0
 555         mul96
 556         vrev64.8 q0, q0
 557         vmov    r3, d1[0]
 558         vst1.8  {d0}, [r0]
 559         str     r3, [r0, #8]
 560         bx      r14
 561 ENDFUNC
 562
 563 FUNC(gcm_mulk_192b_arm_crypto)
 564         // On entry, r0 points to a 192-bit field element A in big-endian
 565         // words format; r1 points to a field-element K in register format.
 566         // On exit, A is updated with the product A K.
 567
 568         vld1.8  {d0-d2}, [r0]
 569         vld1.8  {d3-d5}, [r1]
 570         vrev64.32 q0, q0
 571         vrev64.32 d2, d2
 572         mul192
 573         vrev64.32 q0, q0
 574         vrev64.32 d2, d2
 575         vst1.8  {d0-d2}, [r0]
 576         bx      r14
 577 ENDFUNC
 578
 579 FUNC(gcm_mulk_192l_arm_crypto)
 580         // On entry, r0 points to a 192-bit field element A in little-endian
 581         // words format; r1 points to a field-element K in register format.
 582         // On exit, A is updated with the product A K.
 583
 584         vld1.8  {d0-d2}, [r0]
 585         vld1.8  {d3-d5}, [r1]
 586         vrev64.8 q0, q0
 587         vrev64.8 d2, d2
 588         mul192
 589         vrev64.8 q0, q0
 590         vrev64.8 d2, d2
 591         vst1.8  {d0-d2}, [r0]
 592         bx      r14
 593 ENDFUNC
 594
 595 FUNC(gcm_mulk_256b_arm_crypto)
 596         // On entry, r0 points to a 256-bit field element A in big-endian
 597         // words format; r1 points to a field-element K in register format.
 598         // On exit, A is updated with the product A K.
 599
 600         vld1.8  {q0, q1}, [r0]
 601         vld1.8  {q2, q3}, [r1]
 602         vrev64.32 q0, q0
 603         vrev64.32 q1, q1
 604         mul256
 605         vrev64.32 q0, q0
 606         vrev64.32 q1, q1
 607         vst1.8  {q0, q1}, [r0]
 608         bx      r14
 609 ENDFUNC
 610
 611 FUNC(gcm_mulk_256l_arm_crypto)
 612         // On entry, r0 points to a 256-bit field element A in little-endian
 613         // words format; r1 points to a field-element K in register format.
 614         // On exit, A is updated with the product A K.
 615
 616         vld1.8  {q0, q1}, [r0]
 617         vld1.8  {q2, q3}, [r1]
 618         vrev64.8 q0, q0
 619         vrev64.8 q1, q1
 620         mul256
 621         vrev64.8 q0, q0
 622         vrev64.8 q1, q1
 623         vst1.8  {q0, q1}, [r0]
 624         bx      r14
 625 ENDFUNC
 626
 627 ///----- That's all, folks --------------------------------------------------