chiark - git - mdw - catacomb/blob - symm/keccak1600.c

   1 /* -*-c-*-
   2  *
   3  * The Keccak-p[1600, n] permutation
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <limits.h>
  31 #include <string.h>
  32
  33 #include <mLib/bits.h>
  34
  35 #include "keccak1600.h"
  36
  37 /* #define KECCAK_DEBUG */
  38
  39 /*----- Miscellaneous utilities -------------------------------------------*/
  40
  41 #define I(x, y) ((x) + 5*(y))           /* Column-major indexing */
  42
  43 /*----- Interlacing or not ------------------------------------------------*/
  44
  45 /* We should prefer the interlaced representation if the target is really
  46  * 32-bit and only providing synthetic 64-bit integers.  Alas, the Windows
  47  * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
  48  * so detect x86 specifically.
  49  */
  50 #if (ULONG_MAX >> 31) <= 0xffffffff && \
  51   !defined(__amd64__) && !defined(_M_AMD64)
  52 #  define KECCAK_I32
  53 #endif
  54
  55 #ifdef KECCAK_I32
  56 /* A 32-bit target with at best weak support for 64-bit shifts.  Maintain a
  57  * lane as two 32-bit pieces representing the even and odd bits of the lane.
  58  * There are slightly fiddly transformations to apply on the way in and out
  59  * of the main permutation.
  60  */
  61
  62 typedef keccak1600_lane_i32 lane;
  63 #define S si32
  64
  65 static lane interlace(kludge64 x)
  66 {
  67   /* Given a 64-bit string X, return a lane Z containing the even- and
  68    * odd-numbered bits of X.
  69    *
  70    * This becomes more manageable if we look at what happens to the bit
  71    * indices: bit i of X becomes bit ROR_6(i, 1) of Z.  We can effectively
  72    * swap two bits of the indices by swapping the object bits where those
  73    * index bits differ.  Fortunately, this is fairly easy.
  74    *
  75    * We arrange to swap bits between the two halves of X, rather than within
  76    * a half.
  77    */
  78
  79   uint32 x0 = LO64(x), x1 = HI64(x), t;
  80   lane z;
  81                                                             /* 543210 */
  82   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
  83   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 354210 */
  84   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 254310 */
  85   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 154320 */
  86   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 054321 */
  87   z.even = x0; z.odd = x1; return (z);
  88 }
  89
  90 static kludge64 deinterlace(lane x)
  91 {
  92   /* Given a lane X, return the combined 64-bit value.  This is the inverse
  93    * to `interlace' above, and the principle is the same
  94    */
  95
  96   uint32 x0 = x.even, x1 = x.odd, t;
  97   kludge64 z;
  98                                                             /* 054321 */
  99   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 154320 */
 100   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 254310 */
 101   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 354210 */
 102   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 453210 */
 103   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
 104   SET64(z, x1, x0); return (z);
 105 }
 106
 107 #define TO_LANE(x) (interlace(x))
 108 #define FROM_LANE(x) (deinterlace(x))
 109
 110 #define PRINTFMT_LANE "%08lx:%08lx"
 111 #define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
 112
 113 #define BINOP_LANE(z, op, x, y)                                         \
 114   ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
 115 #define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
 116 #define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
 117 #define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
 118 #define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
 119
 120 #define ROTL_LANE(z, x, n) do {                                         \
 121   lane _t = (x);                                                        \
 122   (z).even = (n)%2 ? ROL32(_t.odd,  ((n) + 1)/2)                        \
 123                    : ROL32(_t.even,  (n)/2);                            \
 124   (z).odd  = (n)%2 ? ROL32(_t.even, ((n) - 1)/2)                        \
 125                    : ROL32(_t.odd,   (n)/2);                            \
 126 } while (0)
 127
 128 #define LANE_ZERO {          0,          0 }
 129 #define LANE_CMPL { 0xffffffff, 0xffffffff }
 130
 131 static const lane rcon[24] = {
 132   { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
 133   { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
 134   { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
 135   { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
 136   { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
 137   { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
 138   { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
 139   { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
 140   { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
 141   { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
 142   { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
 143   { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
 144 };
 145
 146 #else
 147 /* A target with good support for 64-bit shifts.  We store lanes as 64-bit
 148  * quantities and deal with them in the obvious, natural way.
 149  */
 150
 151 typedef keccak1600_lane_64 lane;
 152 #define S s64
 153
 154 #define TO_LANE(x) (x)
 155 #define FROM_LANE(x) (x)
 156
 157 #define PRINTFMT_LANE "%08lx%08lx"
 158 #define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
 159
 160 #define XOR_LANE(z, x, y) XOR64((z), (x), (y))
 161 #define AND_LANE(z, x, y) AND64((z), (x), (y))
 162 #define OR_LANE(z, x, y) OR64((z), (x), (y))
 163 #define NOT_LANE(z, x) CPL64((z), (x))
 164 #define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
 165
 166 #define LANE_ZERO X64(       0,        0)
 167 #define LANE_CMPL X64(ffffffff, ffffffff)
 168
 169 static const lane rcon[24] = {
 170   X64(00000000, 00000001), X64(00000000, 00008082),
 171   X64(80000000, 0000808a), X64(80000000, 80008000),
 172   X64(00000000, 0000808b), X64(00000000, 80000001),
 173   X64(80000000, 80008081), X64(80000000, 00008009),
 174   X64(00000000, 0000008a), X64(00000000, 00000088),
 175   X64(00000000, 80008009), X64(00000000, 8000000a),
 176   X64(00000000, 8000808b), X64(80000000, 0000008b),
 177   X64(80000000, 00008089), X64(80000000, 00008003),
 178   X64(80000000, 00008002), X64(80000000, 00000080),
 179   X64(00000000, 0000800a), X64(80000000, 8000000a),
 180   X64(80000000, 80008081), X64(80000000, 00008080),
 181   X64(00000000, 80000001), X64(80000000, 80008008)
 182 };
 183
 184 #endif
 185
 186 /*----- Complementing or not ----------------------------------------------*/
 187
 188 /* We should use the complemented representation if the target doesn't have a
 189  * fused and-not operation.  There doesn't appear to be a principled way to
 190  * do this, so we'll just have to make do with a big list.  Worse, in my
 191  * brief survey of the architecture reference manuals I have lying about,
 192  * they've split close to 50/50 on this question, so I don't have an
 193  * especially good way to pick a default.  The `no-fused-op' architectures
 194  * seem generally a bit more modern than the `fused-op' architectures, so I
 195  * guess I'll make the complemented representation the default.
 196  *
 197  *              and-not         No and-not
 198  *              -------         ----------
 199  *              ARM (`bic')     x86/amd64
 200  *              Sparc (`andn')  z/Architecture
 201  *              MMIX (`andn')   MIPS
 202  *              IA64 (`andcm')  68k
 203  *              VAX (`bic')     RISC-V
 204  *              PDP-10 (`andc')
 205  */
 206 #if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
 207       defined(_M_ARM) || defined(_M_THUMB)) &&                          \
 208     !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) ||   \
 209       defined(_M_IA64)) &&                                              \
 210     !defined(__mmix__) &&                                               \
 211     !(defined(__sparc__) || defined(__sparc)) &&                        \
 212     !defined(__vax__) &&                                                \
 213     !defined(__pdp10__)
 214 #  define KECCAK_COMPL
 215 #endif
 216
 217 #ifdef KECCAK_COMPL
 218 /* A target without fused and/not (`bic', `andc2').  We complement some of
 219  * the lanes in the initial state and undo this on output.  (Absorbing XORs
 220  * input into the state, so this is unaffected.)  See the handling of chi in
 221  * `keccak1600_round' below for the details.
 222  */
 223
 224 #define COMPL_MASK 0x00121106u
 225
 226 #define STATE_INIT(z) do {                                              \
 227   lane cmpl = LANE_CMPL;                                                \
 228   (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl;                       \
 229   (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl;                       \
 230   (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl;                       \
 231 } while (0)
 232
 233 #define STATE_OUT(z) do {                                               \
 234   NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]);                           \
 235   NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]);                           \
 236   NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]);                           \
 237   NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]);                           \
 238   NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]);                           \
 239   NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]);                           \
 240 } while (0)
 241
 242 #else
 243 /* A target with fused and/not (`bic', `andc2').  Everything is simple. */
 244
 245 #define COMPL_MASK 0u
 246
 247 #define STATE_INIT(z) do ; while (0)
 248 #define STATE_OUT(z) do ; while (0)
 249
 250 #endif
 251
 252 /*----- Other magic constants ---------------------------------------------*/
 253
 254 /* The rotation constants.  These are systematically named -- see `THETA_RHO'
 255  * below.
 256  */
 257 #define ROT_0_0  0
 258 #define ROT_1_0  1
 259 #define ROT_2_0 62
 260 #define ROT_3_0 28
 261 #define ROT_4_0 27
 262
 263 #define ROT_0_1 36
 264 #define ROT_1_1 44
 265 #define ROT_2_1  6
 266 #define ROT_3_1 55
 267 #define ROT_4_1 20
 268
 269 #define ROT_0_2  3
 270 #define ROT_1_2 10
 271 #define ROT_2_2 43
 272 #define ROT_3_2 25
 273 #define ROT_4_2 39
 274
 275 #define ROT_0_3 41
 276 #define ROT_1_3 45
 277 #define ROT_2_3 15
 278 #define ROT_3_3 21
 279 #define ROT_4_3  8
 280
 281 #define ROT_0_4 18
 282 #define ROT_1_4  2
 283 #define ROT_2_4 61
 284 #define ROT_3_4 56
 285 #define ROT_4_4 14
 286
 287 /*----- Debugging ---------------------------------------------------------*/
 288
 289 #ifdef KECCAK_DEBUG
 290
 291 #include <stdio.h>
 292
 293 static void dump_state(const char *what, unsigned ir,
 294                        const keccak1600_state *x)
 295 {
 296   unsigned i, j;
 297   keccak1600_state y;
 298   kludge64 a;
 299   int sep;
 300
 301   printf(";; %s [round %u]\n", what, ir);
 302   printf(";; raw state...\n");
 303   for (j = 0; j < 5; j++) {
 304     printf(";;");
 305     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 306       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
 307     fputc('\n', stdout);
 308   }
 309   y = *x; STATE_OUT(&y);
 310 #ifdef KECCAK_COMPL
 311   printf(";; uncomplemented state...\n");
 312   for (j = 0; j < 5; j++) {
 313     printf(";;");
 314     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 315       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
 316     fputc('\n', stdout);
 317   }
 318 #endif
 319 #ifdef KECCAK_I32
 320   printf(";; deinterlaced state...\n");
 321   for (j = 0; j < 5; j++) {
 322     printf(";;");
 323     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
 324       a = FROM_LANE(y.S[I(i, j)]);
 325       printf("%c%08lx%08lx", sep,
 326              (unsigned long)HI64(a), (unsigned long)LO64(a));
 327     }
 328     fputc('\n', stdout);
 329   }
 330 #endif
 331   fputc('\n', stdout);
 332 }
 333
 334 #endif
 335
 336 /*----- The Keccak-p[1600, n] permutation ---------------------------------*/
 337
 338 static void keccak1600_round(keccak1600_state *z,
 339                              const keccak1600_state *x, unsigned i)
 340 {
 341   /* Perform a round of Keccak-p[1600, n].  Process the state X and write the
 342    * result to Z.
 343    */
 344
 345   lane c[5], d[5], t;
 346
 347   /* Theta, first step: calculate the column parities. */
 348 #define COLPARITY(j) do {                                               \
 349            d[j] =      x->S[I(j, 0)];                                   \
 350   XOR_LANE(d[j], d[j], x->S[I(j, 1)]);                                  \
 351   XOR_LANE(d[j], d[j], x->S[I(j, 2)]);                                  \
 352   XOR_LANE(d[j], d[j], x->S[I(j, 3)]);                                  \
 353   XOR_LANE(d[j], d[j], x->S[I(j, 4)]);                                  \
 354 } while (0)
 355   COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
 356 #undef COLPARITY
 357
 358   /* Theta, second step: calculate the combined effect. */
 359   ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
 360   ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
 361   ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
 362   ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
 363   ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
 364
 365   /* Now we work plane by plane through the output.  To do this, we must undo
 366    * the pi transposition.  Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
 367    * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
 368    */
 369 #define THETA_RHO(i0, i1, i2, i3, i4) do {                              \
 370                                                                         \
 371   /* First, theta. */                                                   \
 372   XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]);                                \
 373   XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]);                                \
 374   XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]);                                \
 375   XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]);                                \
 376   XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]);                                \
 377                                                                         \
 378   /* Then rho. */                                                       \
 379   ROTL_LANE(d[0], d[0], ROT_##i0##_0);                                  \
 380   ROTL_LANE(d[1], d[1], ROT_##i1##_1);                                  \
 381   ROTL_LANE(d[2], d[2], ROT_##i2##_2);                                  \
 382   ROTL_LANE(d[3], d[3], ROT_##i3##_3);                                  \
 383   ROTL_LANE(d[4], d[4], ROT_##i4##_4);                                  \
 384 } while (0)
 385
 386   /* The basic chi operation is: z = w ^ (~a&b), but this involves an
 387    * inversion which we can mostly avoid by being clever: observe that
 388    *
 389    *            w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
 390    *
 391    * by De Morgan's law.  Furthermore, complementing w or z is basically
 392    * equivalent.  Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
 393    * implementation overview', describe a pattern of lane complementation
 394    * which propagates through theta and pi in exactly the right way to be
 395    * restored easily by chi, here, with exactly one inversion per plane.
 396    *
 397    * Here's the pattern.
 398    *
 399    *                    [ * . * * . ]        [ . * * . . ]
 400    *                    [ * . * . . ]        [ . . . * . ]
 401    *                    [ * . * . . ]   ->   [ . . * . . ]
 402    *                    [ . * . * * ]        [ . . * . . ]
 403    *                    [ * . . * . ]        [ * . . . . ]
 404    *
 405    * where a `.' means that the lane is unchanged, and a `*' means that it
 406    * has been complemented.
 407    *
 408    * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
 409    * inputs w, x, y marked with a `1' are complemented on input, and arrange
 410    * for z to be complemented on output if z is so marked.
 411    *
 412    * The diagrams to the right show the fragment of the complementation
 413    * pattern being handled by the corresponding line of code.  A symbol in
 414    * brackets indicates a deviation from the input pattern forced by explicit
 415    * complementation: there will be exactly one of these for each plane.
 416    */
 417 #ifdef KECCAK_COMPL
 418 #  define CHI_COMPL(z, x) NOT_LANE((z), (x))
 419 #  define CHI_001_1(z, w, x, y)                                         \
 420         (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 421 #  define CHI_010_0(z, w, x, y)                                         \
 422         (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 423 #  define CHI_101_0 CHI_001_1
 424 #  define CHI_110_1 CHI_010_0
 425 #else
 426 #  define CHI(z, w, x, y)                                               \
 427         (NOT_LANE((z), (x)),                                            \
 428          AND_LANE((z), (z), (y)),                                       \
 429          XOR_LANE((z), (z), (w)))
 430 #  define CHI_COMPL(z, x) ((z) = (x))
 431 #  define CHI_001_1 CHI
 432 #  define CHI_010_0 CHI
 433 #  define CHI_101_0 CHI
 434 #  define CHI_110_1 CHI
 435 #endif
 436
 437   /* Let's do the y' = 0 plane first.  Theta and rho are easy with our macro,
 438    * and we've done pi with the coordinate hacking.  That leaves chi next.
 439    * This is hairy because we must worry about complementation.
 440    */
 441   THETA_RHO(0, 1, 2, 3, 4);
 442   CHI_COMPL(t, d[2]);                         /*         [.]               */
 443   CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 444   CHI_001_1(z->S[I(1, 0)], d[1], t,    d[3]); /*      .  [.]  *      ->  * */
 445   CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /*          *   *   .  ->  * */
 446   CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 447   CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 448
 449   /* We'd better do iota before we forget. */
 450   XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
 451
 452   /* That was fun.  Maybe y' = 1 will be as good. */
 453   THETA_RHO(3, 4, 0, 1, 2);
 454   CHI_COMPL(t, d[4]);                         /*                 [*]       */
 455   CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 456   CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 457   CHI_101_0(z->S[I(2, 1)], d[2], d[3], t);    /*          *   .  [*] ->  . */
 458   CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /*  *           .   .  ->  * */
 459   CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 460
 461   /* We're getting the hang of this.  The y' = 2 plane shouldn't be any
 462    * trouble.
 463    */
 464   THETA_RHO(1, 2, 3, 4, 0);
 465   CHI_COMPL(t, d[3]);                         /*             [*]           */
 466   CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 467   CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 468   CHI_110_1(z->S[I(2, 2)], d[2], t,    d[4]); /*          *  [*]  .  ->  * */
 469   CHI_101_0(z->S[I(3, 2)], t,    d[4], d[0]); /*  *          [*]  .  ->  . */
 470   CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 471
 472   /* This isn't as interesting any more.  Let's do y' = 3 before boredom sets
 473    * in.
 474    */
 475   THETA_RHO(4, 0, 1, 2, 3);
 476   CHI_COMPL(t, d[3]);                         /*             [.]           */
 477   CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /*  .   *   .          ->  . */
 478   CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /*      *   .   *      ->  . */
 479   CHI_001_1(z->S[I(2, 3)], d[2], t,    d[4]); /*          .  [.]  *  ->  * */
 480   CHI_010_0(z->S[I(3, 3)], t,    d[4], d[0]); /*  .          [.]  *  ->  . */
 481   CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /*  .   *           *  ->  . */
 482
 483   /* Last plane.  Just y' = 4 to go. */
 484   THETA_RHO(2, 3, 4, 0, 1);
 485   CHI_COMPL(t, d[1]);                         /*     [*]                   */
 486   CHI_110_1(z->S[I(0, 4)], d[0], t,    d[2]); /*  *  [*]  .          ->  * */
 487   CHI_101_0(z->S[I(1, 4)], t,    d[2], d[3]); /*     [*]  .   *      ->  . */
 488   CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /*          .   *   .  ->  . */
 489   CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 490   CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 491
 492   /* And we're done. */
 493 #undef THETA_RHO
 494 #undef CHI_COMPL
 495 #undef CHI_001_1
 496 #undef CHI_010_0
 497 #undef CHI_101_0
 498 #undef CHI_110_1
 499 #undef CHI
 500 }
 501
 502 /* --- @keccak1600_p@ --- *
 503  *
 504  * Arguments:   @keccak1600_state *z@ = where to write the output state
 505  *              @conts keccak1600_state *x@ = input state
 506  *              @unsigned n@ = number of rounds to perform
 507  *
 508  * Returns:     ---
 509  *
 510  * Use:         Implements the %$\Keccak[1600, n]$% permutation at the core
 511  *              of Keccak and the SHA-3 standard.
 512  */
 513
 514 void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
 515 {
 516   keccak1600_state u, v;
 517   unsigned i = 0;
 518
 519 #ifdef KECCAK_DEBUG
 520   dump_state("init", 0, x);
 521 #endif
 522   keccak1600_round(&u, x, i++); n--;
 523   while (n > 8) {
 524     keccak1600_round(&v, &u, i++);
 525     keccak1600_round(&u, &v, i++);
 526     keccak1600_round(&v, &u, i++);
 527     keccak1600_round(&u, &v, i++);
 528     keccak1600_round(&v, &u, i++);
 529     keccak1600_round(&u, &v, i++);
 530     keccak1600_round(&v, &u, i++);
 531     keccak1600_round(&u, &v, i++);
 532     n -= 8;
 533   }
 534   switch (n) {
 535     case 7: keccak1600_round(&v, &u, i++);
 536             keccak1600_round(&u, &v, i++);
 537     case 5: keccak1600_round(&v, &u, i++);
 538             keccak1600_round(&u, &v, i++);
 539     case 3: keccak1600_round(&v, &u, i++);
 540             keccak1600_round(&u, &v, i++);
 541     case 1: keccak1600_round( z, &u, i++);
 542             break;
 543     case 8: keccak1600_round(&v, &u, i++);
 544             keccak1600_round(&u, &v, i++);
 545     case 6: keccak1600_round(&v, &u, i++);
 546             keccak1600_round(&u, &v, i++);
 547     case 4: keccak1600_round(&v, &u, i++);
 548             keccak1600_round(&u, &v, i++);
 549     case 2: keccak1600_round(&v, &u, i++);
 550             keccak1600_round( z, &v, i++);
 551             break;
 552   }
 553 #ifdef KECCAK_DEBUG
 554   dump_state("final", 0, z);
 555 #endif
 556 }
 557
 558 /* --- @keccack1600_init@ --- *
 559  *
 560  * Arguments:   @keccak1600_state *s@ = a state to initialize
 561  *
 562  * Returns:     ---
 563  *
 564  * Use:         Initialize @s@ to the root state.
 565  */
 566
 567 void keccak1600_init(keccak1600_state *s)
 568   { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
 569
 570 /* --- @keccak1600_mix@ --- *
 571  *
 572  * Arguments:   @keccak1600_state *s@ = a state to update
 573  *              @const kludge64 *p@ = pointer to 64-bit words to mix in
 574  *              @size_t n@ = size of the input, in 64-bit words
 575  *
 576  * Returns:     ---
 577  *
 578  * Use:         Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 579  *              it's the caller's responsibility to pass in no more than
 580  *              %$r$% bits of data.
 581  */
 582
 583 void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
 584 {
 585   unsigned i;
 586   lane a;
 587
 588   for (i = 0; i < n; i++)
 589     { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
 590 }
 591
 592 /* --- @keccak1600_extract@ --- *
 593  *
 594  * Arguments:   @const keccak1600_state *s@ = a state to extract output from
 595  *              @kludge64 *p@ = pointer to 64-bit words to write
 596  *              @size_t n@ = size of the output, in 64-bit words
 597  *
 598  * Returns:     ---
 599  *
 600  * Use:         Reads output from a %$\Keccak[r, 1600 - r]$% state.  Note
 601  *              that it's the caller's responsibility to extract no more than
 602  *              %$r$% bits of data.
 603  */
 604
 605 void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
 606 {
 607   uint32 m = COMPL_MASK;
 608   unsigned i;
 609   lane t;
 610
 611   for (i = 0; i < n; i++) {
 612     t = s->S[i]; if (m&1) NOT_LANE(t, t);
 613     *p++ = FROM_LANE(t); m >>= 1;
 614   }
 615 }
 616
 617 /*----- Test rig ----------------------------------------------------------*/
 618
 619 #ifdef TEST_RIG
 620
 621 #include <stdio.h>
 622
 623 #include <mLib/quis.h>
 624 #include <mLib/report.h>
 625 #include <mLib/testrig.h>
 626
 627 static int vrf_p(dstr v[])
 628 {
 629   keccak1600_state u;
 630   kludge64 t[25];
 631   dstr d = DSTR_INIT;
 632   int n;
 633   unsigned i;
 634   int ok = 1;
 635
 636   if (v[0].len != 200) die(1, "bad input size");
 637   if (v[2].len != 200) die(1, "bad output size");
 638   n = *(int *)v[1].buf;
 639   dstr_ensure(&d, 200); d.len = 200;
 640
 641   keccak1600_init(&u);
 642   for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i);
 643   keccak1600_mix(&u, t, 25);
 644   keccak1600_p(&u, &u, n);
 645   keccak1600_extract(&u, t, 25);
 646   for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]);
 647   if (memcmp(d.buf, v[2].buf, 200) != 0) {
 648     ok = 0;
 649     fprintf(stderr, "failed!");
 650     fprintf(stderr, "\n\t     input = "); type_hex.dump(&v[0], stderr);
 651     fprintf(stderr, "\n\t    rounds = %d", n);
 652     fprintf(stderr, "\n\t  expected = "); type_hex.dump(&v[2], stderr);
 653     fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr);
 654   }
 655
 656   dstr_destroy(&d);
 657   return (ok);
 658 }
 659
 660 static test_chunk defs[] = {
 661   { "p", vrf_p, { &type_hex, &type_int, &type_hex } },
 662   { 0, 0, { 0 } }
 663 };
 664
 665 int main(int argc, char *argv[])
 666 {
 667   test_run(argc, argv, defs, SRCDIR"/t/keccak1600");
 668   return (0);
 669 }
 670
 671 #endif
 672
 673 /*----- That's all, folks -------------------------------------------------*/