chiark - git - mdw - catacomb/blob - symm/gcm.c

   1 /* -*-c-*-
   2  *
   3  * The GCM authenticated encryption mode
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software: you can redistribute it and/or modify it
  13  * under the terms of the GNU Library General Public License as published
  14  * by the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful, but
  18  * WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <stdio.h>
  33
  34 #include <mLib/bits.h>
  35
  36 #include "dispatch.h"
  37 #include "gcm.h"
  38 #include "gcm-def.h"
  39
  40 /*----- Overall strategy --------------------------------------------------*
  41  *
  42  * GCM is pretty awful to implement in software.  (This presentation is going
  43  * to be somewhat different to that in the specification, but I think it
  44  * makes more sense like this.)
  45  *
  46  * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
  47  *
  48  * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
  49  * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
  50  * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
  51  * right now.  We choose a secret point %$x = E_K(0^w)$%.
  52  *
  53  * We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
  54  * otherwise %$z = w/2$%.  Format a message pair as follows:
  55  *
  56  *      %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
  57  *
  58  * where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
  59  * %$\ell(x) + n \equiv 0 \pmod{w}$%.
  60  *
  61  * Hash a (block-aligned) message %$u$% as follows.  First, split %$u$% into
  62  * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%.  Interpret
  63  * these as elements of %$k$%.  Then
  64  *
  65  *      %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
  66  *
  67  * converted back to a %$w$%-bit string.
  68  *
  69  * We're ready to go now.  Suppose we're to encrypt a message %$M$% with
  70  * header %$H$% and nonce %$N$%.  If %$\ell(N) + 32 = w$% then let
  71  * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
  72  * and split this into %$N' = U[0 \bitsto w - 32]$% and
  73  * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
  74  *
  75  * Let %$n = \lceil \ell(M)/w \rceil$%.  Compute
  76  *
  77  *      %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
  78  *
  79  * for %$0 \le j \le n$%.  Let
  80  *
  81  *      %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
  82  *
  83  * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%.  These are the
  84  * ciphertext and tag respectively.
  85  *
  86  * So why is this awful?
  87  *
  88  * For one thing, the bits are in a completely terrible order.  The bytes are
  89  * arranged in little-endian order, so the unit coefficient is in the first
  90  * byte, and the degree-127 coefficient is in the last byte.  But within each
  91  * byte, the lowest-degree coefficient is in the most significant bit.  It's
  92  * therefore better to think of GCM as using a big-endian byte-ordering
  93  * convention, but with the bits backwards.
  94  *
  95  * But messing about with byte ordering is expensive, so let's not do that in
  96  * the inner loop.  But multiplication in %$k$% is not easy either.  Some
  97  * kind of precomputed table would be nice, but that will leak secrets
  98  * through the cache.
  99  *
 100  * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 101  * Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
 102  * bitmasking.  But the natural order for examining bits of %$y$% is not
 103  * necessarily the obvious one.  We'll have already loaded %$y$% into
 104  * internal form, as 32-bit words.  The good order to process these is left
 105  * to right, from high to low bits.  But now the order of degrees depends on
 106  * the endianness of our conversion of bytes to words.  Oh, well.
 107  *
 108  * If we've adopted a big-endian convention, then we'll see the degrees in
 109  * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine.  If
 110  * we've adopted a little-endian convention, though, we'll see an ordering
 111  * like this:
 112  *
 113  *      24, 25, ..., 31, 16, 17, ..., 23,  8,  9, ..., 15,  0,  1, ..., 7,
 114  *      56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
 115  *      etc.
 116  *
 117  * which is the ordinary order with 0x18 = 24 XORed into the index.  That is,
 118  * %$i' = i$% if we've adopted a big-endian convention, and
 119  * %$i' = i \xor 24$% if we've adopted a little-endian convention.
 120  */
 121
 122 /*----- Low-level utilities -----------------------------------------------*/
 123
 124 /* --- @mult@, @divt@ --- *
 125  *
 126  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 127  *              @uint32 *z@ = where to write the result
 128  *              @const uint32 *x@ = input field element
 129  *
 130  * Returns:     ---
 131  *
 132  * Use:         Multiply or divide the input field element by %$t$%, and
 133  *              write the product or quotient to @z@.  It's safe for @x@ and
 134  *              @z@ to be equal, but they should not otherwise overlap.  Both
 135  *              input and output are in big-endian form, i.e., with the
 136  *              lowest-degree coefficients in the most significant bits.
 137  */
 138
 139 static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
 140 {
 141   uint32 m, c, t;
 142   unsigned i;
 143
 144   t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
 145   for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
 146 }
 147
 148 #if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
 149 static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
 150 {
 151   uint32 m, c, t;
 152   unsigned i;
 153
 154   t = x[0]; m = -((t >> 31)&1u); c = m&1u;
 155   for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
 156   t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
 157 }
 158 #endif
 159
 160 /* --- @mul@ --- *
 161  *
 162  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 163  *              @uint32 *z@ = where to write the result
 164  *              @const uint32 *x, *y@ = input field elements
 165  *
 166  * Returns:     ---
 167  *
 168  * Use:         Multiply the input field elements together, and write the
 169  *              product to @z@.  It's safe for the operands to overlap.  Both
 170  *              inputs and the output are in big-endian form, i.e., with the
 171  *              lowest-degree coefficients in the most significant bits.
 172  */
 173
 174 static void mul(const gcm_params *p, uint32 *z,
 175                 const uint32 *x, const uint32 *y)
 176 {
 177   uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
 178   unsigned i, j, k;
 179
 180   /* We can't do this in-place at all, so use temporary space.  Make a copy
 181    * of @x@ in @u@, where we can clobber it, and build the product in @v@.
 182    */
 183   for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
 184
 185   /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
 186    * %$x t^i$% selected by the bits of @y@.  This is basically what you get
 187    * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
 188    */
 189   for (i = 0; i < p->n; i++) {
 190     t = y[i];
 191     for (j = 0; j < 32; j++) {
 192       m = -((t >> 31)&1u);
 193       for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
 194       mult(p, u, u); t <<= 1;
 195     }
 196   }
 197
 198   /* Write out the result now that it's ready. */
 199   for (i = 0; i < p->n; i++) z[i] = v[i];
 200 }
 201
 202 /*----- Table-based multiplication ----------------------------------------*/
 203
 204 /* --- @gcm_mktable@ --- *
 205  *
 206  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 207  *              @uint32 *ktab@ = where to write the table; there must be
 208  *                      space for %$32 n$% $%n$%-word entries, i.e.,
 209  *                      %$32 n^2$% 32-bit words in total, where %$n$% is
 210  *                      @p->n@, the block size in words
 211  *              @const uint32 *k@ = input field element
 212  *
 213  * Returns:     ---
 214  *
 215  * Use:         Construct a table for use by @gcm_mulk_...@ below, to
 216  *              multiply (vaguely) efficiently by @k@.
 217  */
 218
 219 static void simple_mktable(const gcm_params *p,
 220                            uint32 *ktab, const uint32 *k)
 221 {
 222   unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
 223   unsigned i, j, o = m*p->n;
 224
 225   /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
 226    * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
 227    * The first job is to store %$K[m] = k$%.
 228    *
 229    * We initially build the table with the entries in big-endian order, and
 230    * then swap them if necessary.  This makes the arithmetic functions more
 231    * amenable for use by @gcm_concat@ below.
 232    */
 233   if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
 234   else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
 235
 236   /* Fill in the rest of the table by repeatedly multiplying the previous
 237    * entry by %$t$%.
 238    */
 239   for (i = 1; i < 32*p->n; i++)
 240     { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
 241
 242   /* Finally, if the cipher uses a little-endian convention, then swap all of
 243    * the individual words.
 244    */
 245   if (p->f&GCMF_SWAP)
 246     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 247 }
 248
 249 #if CPUFAM_X86 || CPUFAM_AMD64
 250 static void pclmul_mktable(const gcm_params *p,
 251                            uint32 *ktab, const uint32 *k)
 252 {
 253   unsigned i, n = p->n;
 254   unsigned nz;
 255   uint32 k_over_t[GCM_NMAX], *t;
 256
 257   /* We need to divide the value by t (to compensate for the one-bit shift
 258    * resulting from GCM's backwards bit ordering) and store the value in a
 259    * way which is convenient for the assembler code to read back.  That
 260    * involves reordering the words, and, in the case of 96-bit blocks,
 261    * padding with zeroes to fill out a 128-bit chunk.
 262    */
 263
 264   if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
 265   else {
 266     for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
 267     divt(p, k_over_t, k_over_t);
 268   }
 269
 270   if (n == 3) nz = 1;
 271   else nz = 0;
 272   k = k_over_t; t = ktab + n + nz; while (n--) *--t = *k++;
 273   while (nz--) *--t = 0;
 274 }
 275 #endif
 276
 277 #if CPUFAM_ARMEL
 278 static void arm_crypto_mktable(const gcm_params *p,
 279                                uint32 *ktab, const uint32 *k)
 280 {
 281   unsigned i, n = p->n;
 282   uint32 k_over_t[GCM_NMAX], *t;
 283
 284   /* We need to divide the value by t (to compensate for the one-bit shift
 285    * resulting from GCM's backwards bit ordering) and store the value in a
 286    * way which is convenient for the assembler code to read back.  That
 287    * involves swapping the bytes in each 64-bit lane.
 288    */
 289
 290   if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
 291   else {
 292     for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
 293     divt(p, k_over_t, k_over_t);
 294   }
 295
 296   t = ktab; k = k_over_t;
 297   while (n >= 2) {
 298     t[1] = k[0]; t[0] = k[1];
 299     t += 2; k += 2; n -= 2;
 300   }
 301   if (n) { t[1] = k[0]; t[0] = 0; }
 302 }
 303 #endif
 304
 305 #if CPUFAM_ARM64
 306 static uint32 rbit32(uint32 x)
 307 {
 308   uint32 z, t;
 309
 310 #if GCC_VERSION_P(4, 3)
 311   /* Two tricks here.  Firstly, two separate steps, rather than a single
 312    * block of assembler, to allow finer-grained instruction scheduling.
 313    * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
 314    * caller actually wants the bytes reordered.
 315    */
 316   __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
 317   z = ENDSWAP32(t);
 318 #else
 319   /* A generic but slightly clever implementation. */
 320 #  define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
 321                                         /* 76543210 */
 322   t = SWIZZLE(x, 0x0f0f0f0f, 4);        /* 32107654 -- swap nibbles */
 323   t = SWIZZLE(t, 0x33333333, 2);        /* 10325476 -- swap bit pairs */
 324   z = SWIZZLE(t, 0x55555555, 1);        /* 01234567 -- swap adjacent bits */
 325 #  undef SWIZZLE
 326 #endif
 327   return (z);
 328 }
 329
 330 static void arm64_pmull_mktable(const gcm_params *p,
 331                                 uint32 *ktab, const uint32 *k)
 332 {
 333   unsigned n = p->n;
 334   uint32 *t;
 335
 336   /* We just need to store the value in a way which is convenient for the
 337    * assembler code to read back.  That involves two transformations:
 338    *
 339    *   * firstly, reversing the order of the bits in each byte; and,
 340    *
 341    *   * secondly, storing two copies of each 64-bit chunk.
 342    *
 343    * Note that, in this case, we /want/ the little-endian byte order of GCM,
 344    * so endianness-swapping happens in the big-endian case.
 345    */
 346
 347   t = ktab;
 348   if (p->f&GCMF_SWAP) {
 349     while (n >= 2) {
 350       t[0] = t[2] = rbit32(k[0]);
 351       t[1] = t[3] = rbit32(k[1]);
 352       t += 4; k += 2; n -= 2;
 353     }
 354     if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
 355   } else {
 356     while (n >= 2) {
 357       t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
 358       t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
 359       t += 4; k += 2; n -= 2;
 360     }
 361     if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
 362   }
 363 }
 364 #endif
 365
 366 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
 367              (const gcm_params *p, uint32 *ktab, const uint32 *k),
 368              (p, ktab, k),
 369              pick_mktable, simple_mktable)
 370
 371 static gcm_mktable__functype *pick_mktable(void)
 372 {
 373 #if CPUFAM_X86 || CPUFAM_AMD64
 374   DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
 375                      cpu_feature_p(CPUFEAT_X86_SSSE3) &&
 376                      cpu_feature_p(CPUFEAT_X86_PCLMUL));
 377 #endif
 378 #if CPUFAM_ARMEL
 379   DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
 380                      cpu_feature_p(CPUFEAT_ARM_PMULL));
 381 #endif
 382 #if CPUFAM_ARM64
 383   DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
 384                      cpu_feature_p(CPUFEAT_ARM_PMULL));
 385 #endif
 386   DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
 387 }
 388
 389 /* --- @recover_k@ --- *
 390  *
 391  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 392  *              @uint32 *k@ = block-sized vector in which to store %$k$%
 393  *              @const uint32 *ktab@ = the table encoding %$k$%
 394  *
 395  * Returns:     ---
 396  *
 397  * Use:         Recovers %$k$%, the secret from which @ktab@ was by
 398  *              @gcm_mktable@, from the table, and stores it in internal
 399  *              (big-endian) form in @k@.
 400  */
 401
 402 static void simple_recover_k(const gcm_params *p,
 403                              uint32 *k, const uint32 *ktab)
 404 {
 405   unsigned i;
 406
 407   /* If the blockcipher is big-endian, then the key is simply in the first
 408    * table element, in the right format.  If the blockcipher is little-endian
 409    * then it's in element 24, and the bytes need swapping.
 410    */
 411
 412   if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
 413   else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
 414 }
 415
 416 #if CPUFAM_X86 || CPUFAM_AMD64
 417 static void pclmul_recover_k(const gcm_params *p,
 418                              uint32 *k, const uint32 *ktab)
 419 {
 420   unsigned n = p->n;
 421   unsigned nz;
 422   const uint32 *t;
 423
 424   /* The representation is already independent of the blockcipher endianness.
 425    * We need to compensate for padding, reorder the words, and multiply by t
 426    * to compensate for the factor of t we divided out earlier.
 427    */
 428
 429   if (n == 3) nz = 1; else nz = 0;
 430   t = ktab + n + nz;
 431   while (n--) *k++ = *--t;
 432   mult(p, k - p->n, k - p->n);
 433 }
 434 #endif
 435
 436 #if CPUFAM_ARMEL
 437 static void arm_crypto_recover_k(const gcm_params *p,
 438                                  uint32 *k, const uint32 *ktab)
 439 {
 440   unsigned n = p->n;
 441   const uint32 *t;
 442
 443   /* The representation is already independent of the blockcipher endianness.
 444    * We only need to reorder the words, and multiply by t to compensate for
 445    * the factor of t we divided out earlier.
 446    */
 447
 448   t = ktab;
 449   while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
 450   if (n) { k[0] = t[1]; k++; n--; }
 451   mult(p, k - p->n, k - p->n);
 452 }
 453 #endif
 454
 455 #if CPUFAM_ARM64
 456 static void arm64_pmull_recover_k(const gcm_params *p,
 457                                   uint32 *k, const uint32 *ktab)
 458 {
 459   unsigned n = p->n;
 460   const uint32 *t;
 461
 462   /* The representation is already independent of the blockcipher endianness.
 463    * We need to skip the duplicate pieces, and unscramble the bytes.
 464    */
 465
 466   t = ktab;
 467   while (n >= 2) {
 468     k[0] = ENDSWAP32(rbit32(t[0]));
 469     k[1] = ENDSWAP32(rbit32(t[1]));
 470     t += 4; k += 2; n -= 2;
 471   }
 472   if (n) k[0] = ENDSWAP32(rbit32(t[0]));
 473 }
 474 #endif
 475
 476 CPU_DISPATCH(static, EMPTY, void, recover_k,
 477              (const gcm_params *p, uint32 *k, const uint32 *ktab),
 478              (p, k, ktab),
 479              pick_recover_k, simple_recover_k)
 480
 481 static recover_k__functype *pick_recover_k(void)
 482 {
 483 #if CPUFAM_X86 || CPUFAM_AMD64
 484   DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
 485                      cpu_feature_p(CPUFEAT_X86_SSSE3) &&
 486                      cpu_feature_p(CPUFEAT_X86_PCLMUL));
 487 #endif
 488 #if CPUFAM_ARMEL
 489   DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
 490                      cpu_feature_p(CPUFEAT_ARM_PMULL));
 491 #endif
 492 #if CPUFAM_ARM64
 493   DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
 494                      cpu_feature_p(CPUFEAT_ARM_PMULL));
 495 #endif
 496   DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
 497 }
 498
 499 /* --- @gcm_mulk_N{b,l}@ --- *
 500  *
 501  * Arguments:   @uint32 *a@ = accumulator to multiply
 502  *              @const uint32 *ktab@ = table constructed by @gcm_mktable@
 503  *
 504  * Returns:     ---
 505  *
 506  * Use:         Multiply @a@ by @k@ (implicitly represented in @ktab@),
 507  *              updating @a@ in-place.  There are separate functions for each
 508  *              supported block size and endianness because this is the
 509  *              function whose performance actually matters.
 510  */
 511
 512 #if CPUFAM_X86 || CPUFAM_AMD64
 513 #  define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype       \
 514   gcm_mulk_##var##_x86ish_pclmul_avx,                                   \
 515   gcm_mulk_##var##_x86ish_pclmul;
 516 #  define PICK_MULK_X86ISH(var) do {                                    \
 517   DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
 518                      cpu_feature_p(CPUFEAT_X86_AVX) &&                  \
 519                      cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
 520                      cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
 521   DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul,    \
 522                      cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
 523                      cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
 524 } while (0)
 525 #else
 526 #  define DECL_MULK_X86ISH(var)
 527 #  define PICK_MULK_X86ISH(var) do ; while (0)
 528 #endif
 529
 530 #if CPUFAM_ARMEL
 531 #  define DECL_MULK_ARM(var)                                            \
 532   extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
 533 #  define PICK_MULK_ARM(var) do {                                       \
 534   DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto,       \
 535                      cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
 536 } while (0)
 537 #else
 538 #  define DECL_MULK_ARM(var)
 539 #  define PICK_MULK_ARM(var) do ; while (0)
 540 #endif
 541
 542 #if CPUFAM_ARM64
 543 #  define DECL_MULK_ARM64(var)                                          \
 544   extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
 545 #  define PICK_MULK_ARM64(var) do {                                     \
 546   DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull,      \
 547                      cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
 548 } while (0)
 549 #else
 550 #  define DECL_MULK_ARM64(var)
 551 #  define PICK_MULK_ARM64(var) do ; while (0)
 552 #endif
 553
 554 #define DEF_MULK(nbits)                                                 \
 555                                                                         \
 556 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,                   \
 557              (uint32 *a, const uint32 *ktab), (a, ktab),                \
 558              pick_mulk_##nbits##b, simple_mulk_##nbits)                 \
 559 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l,                   \
 560              (uint32 *a, const uint32 *ktab), (a, ktab),                \
 561              pick_mulk_##nbits##l, simple_mulk_##nbits)                 \
 562                                                                         \
 563 static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)          \
 564 {                                                                       \
 565   uint32 m, t;                                                          \
 566   uint32 z[nbits/32];                                                   \
 567   unsigned i, j, k;                                                     \
 568                                                                         \
 569   for (i = 0; i < nbits/32; i++) z[i] = 0;                              \
 570                                                                         \
 571   for (i = 0; i < nbits/32; i++) {                                      \
 572     t = a[i];                                                           \
 573     for (j = 0; j < 32; j++) {                                          \
 574       m = -((t >> 31)&1u);                                              \
 575       for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m;                 \
 576       t <<= 1;                                                          \
 577     }                                                                   \
 578   }                                                                     \
 579                                                                         \
 580   for (i = 0; i < nbits/32; i++) a[i] = z[i];                           \
 581 }                                                                       \
 582                                                                         \
 583 DECL_MULK_X86ISH(nbits##b)                                              \
 584 DECL_MULK_ARM(nbits##b)                                                 \
 585 DECL_MULK_ARM64(nbits##b)                                               \
 586 static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)      \
 587 {                                                                       \
 588   PICK_MULK_X86ISH(nbits##b);                                           \
 589   PICK_MULK_ARM(nbits##b);                                              \
 590   PICK_MULK_ARM64(nbits##b);                                            \
 591   DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits);     \
 592 }                                                                       \
 593                                                                         \
 594 DECL_MULK_X86ISH(nbits##l)                                              \
 595 DECL_MULK_ARM(nbits##l)                                                 \
 596 DECL_MULK_ARM64(nbits##l)                                               \
 597 static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)      \
 598 {                                                                       \
 599   PICK_MULK_X86ISH(nbits##l);                                           \
 600   PICK_MULK_ARM(nbits##l);                                              \
 601   PICK_MULK_ARM64(nbits##l);                                            \
 602   DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits);     \
 603 }
 604
 605 GCM_WIDTHS(DEF_MULK)
 606
 607 #define GCM_MULK_CASE(nbits)                                            \
 608   case nbits/32:                                                        \
 609     if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab);                   \
 610     else gcm_mulk_##nbits##b(_a, _ktab);                                \
 611     break;
 612 #define MULK(n, f, a, ktab) do {                                        \
 613   uint32 *_a = (a); const uint32 *_ktab = (ktab);                       \
 614   unsigned _f = (f);                                                    \
 615   switch (n) {                                                          \
 616     GCM_WIDTHS(GCM_MULK_CASE)                                           \
 617     default: abort();                                                   \
 618   }                                                                     \
 619 } while (0)
 620
 621 /*----- Other utilities ---------------------------------------------------*/
 622
 623 /* --- @putlen@ --- *
 624  *
 625  * Arguments:   @octet *p@ = pointer to output buffer
 626  *              @unsigned w@ = size of output buffer
 627  *              @unsigned blksz@ = block size (assumed fairly small)
 628  *              @unsigned long nblocks@ = number of blocks
 629  *              @unsigned nbytes@ = tail size in bytes (assumed small)
 630  *
 631  * Returns:     ---
 632  *
 633  * Use:         Store the overall length in %$\emph{bits}$% (i.e.,
 634  *              @3*(nblocks*blksz + nbytes)@ in big-endian form in the
 635  *              buffer @p@.
 636  */
 637
 638 static void putlen(octet *p, unsigned w, unsigned blksz,
 639                    unsigned long nblocks, unsigned nbytes)
 640 {
 641   unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
 642     nbhi = nblocks >> ULONG_BITS/2;
 643   unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
 644
 645   /* This is fiddly.  Split @nblocks@, which is the big number, into high and
 646    * low halves, multiply those separately by @blksz@, propagate carries, and
 647    * then multiply by eight.
 648    */
 649   nhi += nlo >> ULONG_BITS/2;
 650   nlo &= (1ul << (ULONG_BITS/2)) - 1;
 651   nlo <<= 3;
 652
 653   /* Now write out the size, feeding bits in from @nhi@ as necessary. */
 654   p += w;
 655   while (w--) {
 656     *--p = U8(nlo);
 657     nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
 658     nhi >>= 8;
 659   }
 660 }
 661
 662 /* --- @mix@ --- *
 663  *
 664  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 665  *              @uint32 *a@ = GHASH accumulator
 666  *              @const octet *q@ = pointer to an input block
 667  *              @const uint32 *ktab@ = multiplication table, built by
 668  *                      @gcm_mktable@
 669  *
 670  * Returns:     ---
 671  *
 672  * Use:         Fold the block @q@ into the GHASH accumulator.  The
 673  *              calculation is %$a' = k (a + q)$%.
 674  */
 675
 676 static void mix(const gcm_params *p, uint32 *a,
 677                 const octet *q, const uint32 *ktab)
 678 {
 679   unsigned i;
 680
 681   if (p->f&GCMF_SWAP)
 682     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
 683   else
 684     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
 685   MULK(p->n, p->f, a, ktab);
 686 }
 687
 688 /* --- @gcm_ghashdone@ --- *
 689  *
 690  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 691  *              @uint32 *a@ = GHASH accumulator
 692  *              @const uint32 *ktab@ = multiplication table, built by
 693  *                      @gcm_mktable@
 694  *              @unsigned long xblocks, yblocks@ = number of whole blocks in
 695  *                      the two inputs
 696  *              @unsigned xbytes, ybytes@ = number of trailing bytes in the
 697  *                      two inputs
 698  *
 699  * Returns:     ---
 700  *
 701  * Use:         Finishes a GHASH operation by appending the appropriately
 702  *              encoded lengths of the two constituent messages.
 703  */
 704
 705 void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
 706                    unsigned long xblocks, unsigned xbytes,
 707                    unsigned long yblocks, unsigned ybytes)
 708 {
 709   octet b[4*GCM_NMAX];
 710   unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
 711
 712   /* Construct the encoded lengths.  Note that smaller-block versions of GCM
 713    * encode the lengths in separate blocks.  GCM is only officially defined
 714    * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
 715    * at 96 bits.
 716    */
 717   putlen(b,     w, 4*p->n, xblocks, xbytes);
 718   putlen(b + w, w, 4*p->n, yblocks, ybytes);
 719
 720   /* Feed the lengths into the accumulator. */
 721   mix(p, a, b, ktab);
 722   if (p->n < 3) mix(p, a, b + w, ktab);
 723 }
 724
 725 /* --- @gcm_concat@ --- *
 726  *
 727  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 728  *              @uint32 *z@ = GHASH accumulator for suffix, updated
 729  *              @const uint32 *x@ = GHASH accumulator for prefix
 730  *              @const uint32 *ktab@ = multiplication table, built by
 731  *                      @gcm_mktable@
 732  *              @unsigned long n@ = length of suffix in whole blocks
 733  *
 734  * Returns:     ---
 735  *
 736  * Use:         On entry, @x@ and @z@ are the results of hashing two strings
 737  *              %$a$% and %$b$%, each a whole number of blocks long; in
 738  *              particular, %$b$% is @n@ blocks long.  On exit, @z@ is
 739  *              updated to be the hash of %$a \cat b$%.
 740  */
 741
 742 void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
 743                 const uint32 *ktab, unsigned long n)
 744 {
 745   uint32 t[GCM_NMAX], u[GCM_NMAX];
 746   unsigned i, j;
 747
 748   if (!n) {
 749     /* If @n@ is zero, then there's not much to do.  The mathematics
 750      * (explained below) still works, but the code takes a shortcut which
 751      * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
 752      */
 753
 754     for (j = 0; j < p->n; j++) z[j] ^= x[j];
 755   } else {
 756     /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
 757      * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%.  What we'd like is
 758      * the hash of %$a \cat b$%, which is %$z + x k^n$%.
 759      *
 760      * The first job, then, is to calculate %$k^n$%, and for this we use a
 761      * simple left-to-right square-and-multiply algorithm.  There's no need
 762      * to keep %$n$% secret here.
 763      */
 764
 765     /* Start by retrieving %$k$% from the table, and convert it to big-endian
 766      * form.
 767      */
 768     recover_k(p, u, ktab);
 769
 770     /* Now calculate %$k^n$%. */
 771     i = ULONG_BITS;
 772 #define BIT (1ul << (ULONG_BITS - 1))
 773     while (!(n&BIT)) { n <<= 1; i--; }
 774     n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
 775     while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
 776 #undef BIT
 777
 778     /* Next, calculate %$x k^n$%.  If we're using a little-endian convention
 779      * then we must convert %$x$%; otherwise we can just use it in place.
 780      */
 781     if (!(p->f&GCMF_SWAP))
 782       mul(p, t, t, x);
 783     else {
 784       for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
 785       mul(p, t, t, u);
 786     }
 787
 788     /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
 789      * necessary.
 790      */
 791     if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
 792     else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
 793   }
 794 }
 795
 796 /*----- Test rig ----------------------------------------------------------*/
 797
 798 #ifdef TEST_RIG
 799
 800 #include <mLib/macros.h>
 801 #include <mLib/quis.h>
 802 #include <mLib/testrig.h>
 803
 804 #ifdef ENABLE_ASM_DEBUG
 805 #  include "regdump.h"
 806 #endif
 807
 808 static void report_failure(const char *test, unsigned nbits,
 809                            const char *ref, dstr v[], dstr *d)
 810 {
 811   printf("test %s failed (nbits = %u)", test, nbits);
 812   printf("\n\tx  = "); type_hex.dump(&v[0], stdout);
 813   printf("\n\ty  = "); type_hex.dump(&v[1], stdout);
 814   printf("\n\tz  = "); type_hex.dump(&v[2], stdout);
 815   printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
 816   putchar('\n');
 817 }
 818
 819 static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
 820   { MULK(nbits/32, f, x, ktab); }
 821
 822 static int test_mul(uint32 poly, dstr v[])
 823 {
 824   uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
 825   gcm_params p;
 826   dstr d = DSTR_INIT;
 827   unsigned i, nbits;
 828   int ok = 1;
 829   enum { I_x, I_y, I_z };
 830
 831   nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
 832   dstr_ensure(&d, nbits/8); d.len = nbits/8;
 833
 834 #define LOADXY(E) do {                                                  \
 835   for (i = 0; i < nbits/32; i++) {                                      \
 836     x[i] = LOAD32_##E(v[I_x].buf + 4*i);                                \
 837     y[i] = LOAD32_##E(v[I_y].buf + 4*i);                                \
 838   }                                                                     \
 839 } while (0)
 840
 841 #define INITZ(x) do {                                                   \
 842   for (i = 0; i < nbits/32; i++) z[i] = (x)[i];                         \
 843 } while (0)
 844
 845 #define CHECK(E, what, ref) do {                                        \
 846   for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]);        \
 847   if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8))                       \
 848     { ok = 0; report_failure(what, nbits, #ref, v, &d); }               \
 849 } while (0)
 850
 851 #define TEST_PREP_1(E, x, y, what) do {                                 \
 852   gcm_mktable(&p, ktab, y);                                             \
 853   recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y);    \
 854   INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
 855 } while (0)
 856
 857 #define TEST_PREP(E, what) do {                                         \
 858   TEST_PREP_1(E, x, y, what);                                           \
 859   TEST_PREP_1(E, y, x, what);                                           \
 860 } while (0)
 861
 862   /* First, test plain multiply. */
 863   LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
 864
 865   /* Next, test big-endian prepared key. */
 866   LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
 867
 868   /* Finally, test little-endian prepared key. */
 869   p.f = GCMF_SWAP; LOADXY(L);
 870   TEST_PREP(L, "gcm_kmul_l");
 871
 872 #undef LOADXY
 873 #undef INITZ
 874 #undef CHECK
 875 #undef TEST_PREP_1
 876 #undef TEST_PREP
 877
 878   /* All done. */
 879   return (ok);
 880 }
 881
 882 #define TEST(nbits)                                                     \
 883 static int test_mul_##nbits(dstr v[])                                   \
 884   { return (test_mul(GCM_POLY_##nbits, v)); }
 885 GCM_WIDTHS(TEST)
 886 #undef TEST
 887
 888 static test_chunk defs[] = {
 889 #define TEST(nbits)                                                     \
 890   { "gcm-mul" #nbits, test_mul_##nbits,                                 \
 891     { &type_hex, &type_hex, &type_hex, 0 } },
 892 GCM_WIDTHS(TEST)
 893 #undef TEST
 894   { 0, 0, { 0 } }
 895 };
 896
 897 int main(int argc,  char *argv[])
 898 {
 899   ego(argv[0]);
 900 #ifdef ENABLE_ASM_DEBUG
 901   regdump_init();
 902 #endif
 903   test_run(argc, argv, defs, SRCDIR"/t/gcm");
 904   return (0);
 905 }
 906
 907 #endif
 908
 909 /*----- That's all, folks -------------------------------------------------*/