chiark - git - mdw - catacomb/blob - symm/keccak1600.c

   1 /* -*-c-*-
   2  *
   3  * The Keccak-p[1600, n] permutation
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <limits.h>
  31 #include <string.h>
  32
  33 #include <mLib/bits.h>
  34
  35 #include "keccak1600.h"
  36
  37 /* #define KECCAK_DEBUG */
  38
  39 /*----- Miscellaneous utilities -------------------------------------------*/
  40
  41 #define I(x, y) ((x) + 5*(y))           /* Column-major indexing */
  42
  43 /*----- Interlacing or not ------------------------------------------------*/
  44
  45 /* We should prefer the interlaced representation if the target is really
  46  * 32-bit and only providing synthetic 64-bit integers.  Alas, the Windows
  47  * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
  48  * so detect x86 specifically.
  49  */
  50 #if (ULONG_MAX >> 31) <= 0xffffffff && \
  51   !defined(__amd64__) && !defined(_M_AMD64)
  52 #  define KECCAK_I32
  53 #endif
  54
  55 #ifdef KECCAK_I32
  56 /* A 32-bit target with at best weak support for 64-bit shifts.  Maintain a
  57  * lane as two 32-bit pieces representing the even and odd bits of the lane.
  58  * There are slightly fiddly transformations to apply on the way in and out
  59  * of the main permutation.
  60  */
  61
  62 typedef keccak1600_lane_i32 lane;
  63 #define S si32
  64
  65 static lane interlace(kludge64 x)
  66 {
  67   /* Given a 64-bit string X, return a lane Z containing the even- and
  68    * odd-numbered bits of X.
  69    *
  70    * This becomes more manageable if we look at what happens to the bit
  71    * indices: bit i of X becomes bit ROR_6(i, 1) of Z.  We can effectively
  72    * swap two bits of the indices by swapping the object bits where those
  73    * index bits differ.  Fortunately, this is fairly easy.
  74    *
  75    * We arrange to swap bits between the two halves of X, rather than within
  76    * a half.
  77    */
  78
  79   uint32 x0 = LO64(x), x1 = HI64(x), t;
  80   lane z;
  81                                                             /* 543210 */
  82   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
  83   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 354210 */
  84   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 254310 */
  85   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 154320 */
  86   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 054321 */
  87   z.even = x0; z.odd = x1; return (z);
  88 }
  89
  90 static kludge64 deinterlace(lane x)
  91 {
  92   /* Given a lane X, return the combined 64-bit value.  This is the inverse
  93    * to `interlace' above, and the principle is the same
  94    */
  95
  96   uint32 x0 = x.even, x1 = x.odd, t;
  97   kludge64 z;
  98                                                             /* 054321 */
  99   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 154320 */
 100   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 254310 */
 101   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 354210 */
 102   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 453210 */
 103   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
 104   SET64(z, x1, x0); return (z);
 105 }
 106
 107 #define TO_LANE(x) (interlace(x))
 108 #define FROM_LANE(x) (deinterlace(x))
 109
 110 #define PRINTFMT_LANE "%08lx:%08lx"
 111 #define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
 112
 113 #define BINOP_LANE(z, op, x, y)                                         \
 114   ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
 115 #define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
 116 #define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
 117 #define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
 118 #define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
 119
 120 #define ROTL_LANE(z, x, n) do {                                         \
 121   lane _t = (x);                                                        \
 122   (z).even = (n)%2 ? ROL32(_t.odd,  ((n) + 1)/2)                        \
 123                    : ROL32(_t.even,  (n)/2);                            \
 124   (z).odd  = (n)%2 ? ROL32(_t.even, ((n) - 1)/2)                        \
 125                    : ROL32(_t.odd,   (n)/2);                            \
 126 } while (0)
 127
 128 #define LANE_ZERO {          0,          0 }
 129 #define LANE_CMPL { 0xffffffff, 0xffffffff }
 130
 131 static const lane rcon[24] = {
 132   { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
 133   { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
 134   { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
 135   { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
 136   { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
 137   { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
 138   { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
 139   { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
 140   { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
 141   { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
 142   { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
 143   { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
 144 };
 145
 146 #else
 147 /* A target with good support for 64-bit shifts.  We store lanes as 64-bit
 148  * quantities and deal with them in the obvious, natural way.
 149  */
 150
 151 typedef keccak1600_lane_64 lane;
 152 #define S s64
 153
 154 #define TO_LANE(x) (x)
 155 #define FROM_LANE(x) (x)
 156
 157 #define PRINTFMT_LANE "%08lx%08lx"
 158 #define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
 159
 160 #define XOR_LANE(z, x, y) XOR64((z), (x), (y))
 161 #define AND_LANE(z, x, y) AND64((z), (x), (y))
 162 #define OR_LANE(z, x, y) OR64((z), (x), (y))
 163 #define NOT_LANE(z, x) CPL64((z), (x))
 164 #define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
 165
 166 #define LANE_ZERO X64(       0,        0)
 167 #define LANE_CMPL X64(ffffffff, ffffffff)
 168
 169 static const lane rcon[24] = {
 170   X64(00000000, 00000001), X64(00000000, 00008082),
 171   X64(80000000, 0000808a), X64(80000000, 80008000),
 172   X64(00000000, 0000808b), X64(00000000, 80000001),
 173   X64(80000000, 80008081), X64(80000000, 00008009),
 174   X64(00000000, 0000008a), X64(00000000, 00000088),
 175   X64(00000000, 80008009), X64(00000000, 8000000a),
 176   X64(00000000, 8000808b), X64(80000000, 0000008b),
 177   X64(80000000, 00008089), X64(80000000, 00008003),
 178   X64(80000000, 00008002), X64(80000000, 00000080),
 179   X64(00000000, 0000800a), X64(80000000, 8000000a),
 180   X64(80000000, 80008081), X64(80000000, 00008080),
 181   X64(00000000, 80000001), X64(80000000, 80008008)
 182 };
 183
 184 #endif
 185
 186 /*----- Complementing or not ----------------------------------------------*/
 187
 188 /* We should use the complemented representation if the target doesn't have a
 189  * fused and-not operation.  There doesn't appear to be a principled way to
 190  * do this, so we'll just have to make do with a big list.  Worse, in my
 191  * brief survey of the architecture reference manuals I have lying about,
 192  * they've split close to 50/50 on this question, so I don't have an
 193  * especially good way to pick a default.  The `no-fused-op' architectures
 194  * seem generally a bit more modern than the `fused-op' architectures, so I
 195  * guess I'll make the complemented representation the default.
 196  *
 197  *              and-not         No and-not
 198  *              -------         ----------
 199  *              ARM (`bic')     x86/amd64
 200  *              Sparc (`andn')  z/Architecture
 201  *              MMIX (`andn')   MIPS
 202  *              IA64 (`andcm')  68k
 203  *              VAX (`bic')     RISC-V
 204  *              PDP-10 (`andc')
 205  */
 206 #if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
 207       defined(_M_ARM) || defined(_M_THUMB)) &&                          \
 208     !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) ||   \
 209       defined(_M_IA64)) &&                                              \
 210     !defined(__mmix__) &&                                               \
 211     !(defined(__sparc__) || defined(__sparc)) &&                        \
 212     !defined(__vax__) &&                                                \
 213     !defined(__pdp10__)
 214 #  define KECCAK_COMPL
 215 #endif
 216
 217 #ifdef KECCAK_COMPL
 218 /* A target without fused and/not (`bic', `andc2').  We complement some of
 219  * the lanes in the initial state and undo this on output.  (Absorbing XORs
 220  * input into the state, so this is unaffected.)  See the handling of chi in
 221  * `keccak1600_round' below for the details.
 222  */
 223
 224 #define STATE_INIT(z) do {                                              \
 225   lane cmpl = LANE_CMPL;                                                \
 226   (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl;                       \
 227   (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl;                       \
 228   (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl;                       \
 229 } while (0)
 230
 231 #define STATE_OUT(z) do {                                               \
 232   NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]);                           \
 233   NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]);                           \
 234   NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]);                           \
 235   NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]);                           \
 236   NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]);                           \
 237   NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]);                           \
 238 } while (0)
 239
 240 #else
 241 /* A target with fused and/not (`bic', `andc2').  Everything is simple. */
 242
 243 #define STATE_INIT(z) do ; while (0)
 244 #define STATE_OUT(z) do ; while (0)
 245
 246 #endif
 247
 248 /*----- Other magic constants ---------------------------------------------*/
 249
 250 /* The rotation constants.  These are systematically named -- see `THETA_RHO'
 251  * below.
 252  */
 253 #define ROT_0_0  0
 254 #define ROT_1_0  1
 255 #define ROT_2_0 62
 256 #define ROT_3_0 28
 257 #define ROT_4_0 27
 258
 259 #define ROT_0_1 36
 260 #define ROT_1_1 44
 261 #define ROT_2_1  6
 262 #define ROT_3_1 55
 263 #define ROT_4_1 20
 264
 265 #define ROT_0_2  3
 266 #define ROT_1_2 10
 267 #define ROT_2_2 43
 268 #define ROT_3_2 25
 269 #define ROT_4_2 39
 270
 271 #define ROT_0_3 41
 272 #define ROT_1_3 45
 273 #define ROT_2_3 15
 274 #define ROT_3_3 21
 275 #define ROT_4_3  8
 276
 277 #define ROT_0_4 18
 278 #define ROT_1_4  2
 279 #define ROT_2_4 61
 280 #define ROT_3_4 56
 281 #define ROT_4_4 14
 282
 283 /*----- Debugging ---------------------------------------------------------*/
 284
 285 #ifdef KECCAK_DEBUG
 286
 287 #include <stdio.h>
 288
 289 static void dump_state(const char *what, unsigned ir,
 290                        const keccak1600_state *x)
 291 {
 292   unsigned i, j;
 293   keccak1600_state y;
 294   kludge64 a;
 295   int sep;
 296
 297   printf(";; %s [round %u]\n", what, ir);
 298   printf(";; raw state...\n");
 299   for (j = 0; j < 5; j++) {
 300     printf(";;");
 301     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 302       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
 303     fputc('\n', stdout);
 304   }
 305   y = *x; STATE_OUT(&y);
 306 #ifdef KECCAK_COMPL
 307   printf(";; uncomplemented state...\n");
 308   for (j = 0; j < 5; j++) {
 309     printf(";;");
 310     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 311       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
 312     fputc('\n', stdout);
 313   }
 314 #endif
 315 #ifdef KECCAK_I32
 316   printf(";; deinterlaced state...\n");
 317   for (j = 0; j < 5; j++) {
 318     printf(";;");
 319     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
 320       a = FROM_LANE(y.S[I(i, j)]);
 321       printf("%c%08lx%08lx", sep,
 322              (unsigned long)HI64(a), (unsigned long)LO64(a));
 323     }
 324     fputc('\n', stdout);
 325   }
 326 #endif
 327   fputc('\n', stdout);
 328 }
 329
 330 #endif
 331
 332 /*----- The Keccak-p[1600, n] permutation ---------------------------------*/
 333
 334 static void keccak1600_round(keccak1600_state *z,
 335                              const keccak1600_state *x, unsigned i)
 336 {
 337   /* Perform a round of Keccak-p[1600, n].  Process the state X and write the
 338    * result to Z.
 339    */
 340
 341   lane c[5], d[5], t;
 342
 343   /* Theta, first step: calculate the column parities. */
 344 #define COLPARITY(j) do {                                               \
 345            d[j] =      x->S[I(j, 0)];                                   \
 346   XOR_LANE(d[j], d[j], x->S[I(j, 1)]);                                  \
 347   XOR_LANE(d[j], d[j], x->S[I(j, 2)]);                                  \
 348   XOR_LANE(d[j], d[j], x->S[I(j, 3)]);                                  \
 349   XOR_LANE(d[j], d[j], x->S[I(j, 4)]);                                  \
 350 } while (0)
 351   COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
 352 #undef COLPARITY
 353
 354   /* Theta, second step: calculate the combined effect. */
 355   ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
 356   ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
 357   ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
 358   ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
 359   ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
 360
 361   /* Now we work plane by plane through the output.  To do this, we must undo
 362    * the pi transposition.  Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
 363    * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
 364    */
 365 #define THETA_RHO(i0, i1, i2, i3, i4) do {                              \
 366                                                                         \
 367   /* First, theta. */                                                   \
 368   XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]);                                \
 369   XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]);                                \
 370   XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]);                                \
 371   XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]);                                \
 372   XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]);                                \
 373                                                                         \
 374   /* Then rho. */                                                       \
 375   ROTL_LANE(d[0], d[0], ROT_##i0##_0);                                  \
 376   ROTL_LANE(d[1], d[1], ROT_##i1##_1);                                  \
 377   ROTL_LANE(d[2], d[2], ROT_##i2##_2);                                  \
 378   ROTL_LANE(d[3], d[3], ROT_##i3##_3);                                  \
 379   ROTL_LANE(d[4], d[4], ROT_##i4##_4);                                  \
 380 } while (0)
 381
 382   /* The basic chi operation is: z = w ^ (~a&b), but this involves an
 383    * inversion which we can mostly avoid by being clever: observe that
 384    *
 385    *            w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
 386    *
 387    * by De Morgan's law.  Furthermore, complementing w or z is basically
 388    * equivalent.  Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
 389    * implementation overview', describe a pattern of lane complementation
 390    * which propagates through theta and pi in exactly the right way to be
 391    * restored easily by chi, here, with exactly one inversion per plane.
 392    *
 393    * Here's the pattern.
 394    *
 395    *                    [ * . * * . ]        [ . * * . . ]
 396    *                    [ * . * . . ]        [ . . . * . ]
 397    *                    [ * . * . . ]   ->   [ . . * . . ]
 398    *                    [ . * . * * ]        [ . . * . . ]
 399    *                    [ * . . * . ]        [ * . . . . ]
 400    *
 401    * where a `.' means that the lane is unchanged, and a `*' means that it
 402    * has been complemented.
 403    *
 404    * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
 405    * inputs w, x, y marked with a `1' are complemented on input, and arrange
 406    * for z to be complemented on output if z is so marked.
 407    *
 408    * The diagrams to the right show the fragment of the complementation
 409    * pattern being handled by the corresponding line of code.  A symbol in
 410    * brackets indicates a deviation from the input pattern forced by explicit
 411    * complementation: there will be exactly one of these for each plane.
 412    */
 413 #ifdef KECCAK_COMPL
 414 #  define CHI_COMPL(z, x) NOT_LANE((z), (x))
 415 #  define CHI_001_1(z, w, x, y)                                         \
 416         (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 417 #  define CHI_010_0(z, w, x, y)                                         \
 418         (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 419 #  define CHI_101_0 CHI_001_1
 420 #  define CHI_110_1 CHI_010_0
 421 #else
 422 #  define CHI(z, w, x, y)                                               \
 423         (NOT_LANE((z), (x)),                                            \
 424          AND_LANE((z), (z), (y)),                                       \
 425          XOR_LANE((z), (z), (w)))
 426 #  define CHI_COMPL(z, x) ((z) = (x))
 427 #  define CHI_001_1 CHI
 428 #  define CHI_010_0 CHI
 429 #  define CHI_101_0 CHI
 430 #  define CHI_110_1 CHI
 431 #endif
 432
 433   /* Let's do the y' = 0 plane first.  Theta and rho are easy with our macro,
 434    * and we've done pi with the coordinate hacking.  That leaves chi next.
 435    * This is hairy because we must worry about complementation.
 436    */
 437   THETA_RHO(0, 1, 2, 3, 4);
 438   CHI_COMPL(t, d[2]);                         /*         [.]               */
 439   CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 440   CHI_001_1(z->S[I(1, 0)], d[1], t,    d[3]); /*      .  [.]  *      ->  * */
 441   CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /*          *   *   .  ->  * */
 442   CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 443   CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 444
 445   /* We'd better do iota before we forget. */
 446   XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
 447
 448   /* That was fun.  Maybe y' = 1 will be as good. */
 449   THETA_RHO(3, 4, 0, 1, 2);
 450   CHI_COMPL(t, d[4]);                         /*                 [*]       */
 451   CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 452   CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 453   CHI_101_0(z->S[I(2, 1)], d[2], d[3], t);    /*          *   .  [*] ->  . */
 454   CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /*  *           .   .  ->  * */
 455   CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 456
 457   /* We're getting the hang of this.  The y' = 2 plane shouldn't be any
 458    * trouble.
 459    */
 460   THETA_RHO(1, 2, 3, 4, 0);
 461   CHI_COMPL(t, d[3]);                         /*             [*]           */
 462   CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 463   CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 464   CHI_110_1(z->S[I(2, 2)], d[2], t,    d[4]); /*          *  [*]  .  ->  * */
 465   CHI_101_0(z->S[I(3, 2)], t,    d[4], d[0]); /*  *          [*]  .  ->  . */
 466   CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 467
 468   /* This isn't as interesting any more.  Let's do y' = 3 before boredom sets
 469    * in.
 470    */
 471   THETA_RHO(4, 0, 1, 2, 3);
 472   CHI_COMPL(t, d[3]);                         /*             [.]           */
 473   CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /*  .   *   .          ->  . */
 474   CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /*      *   .   *      ->  . */
 475   CHI_001_1(z->S[I(2, 3)], d[2], t,    d[4]); /*          .  [.]  *  ->  * */
 476   CHI_010_0(z->S[I(3, 3)], t,    d[4], d[0]); /*  .          [.]  *  ->  . */
 477   CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /*  .   *           *  ->  . */
 478
 479   /* Last plane.  Just y' = 4 to go. */
 480   THETA_RHO(2, 3, 4, 0, 1);
 481   CHI_COMPL(t, d[1]);                         /*     [*]                   */
 482   CHI_110_1(z->S[I(0, 4)], d[0], t,    d[2]); /*  *  [*]  .          ->  * */
 483   CHI_101_0(z->S[I(1, 4)], t,    d[2], d[3]); /*     [*]  .   *      ->  . */
 484   CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /*          .   *   .  ->  . */
 485   CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 486   CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 487
 488   /* And we're done. */
 489 #undef THETA_RHO
 490 #undef CHI_COMPL
 491 #undef CHI_001_1
 492 #undef CHI_010_0
 493 #undef CHI_101_0
 494 #undef CHI_110_1
 495 #undef CHI
 496 }
 497
 498 /* --- @keccak1600_p@ --- *
 499  *
 500  * Arguments:   @keccak1600_state *z@ = where to write the output state
 501  *              @conts keccak1600_state *x@ = input state
 502  *              @unsigned n@ = number of rounds to perform
 503  *
 504  * Returns:     ---
 505  *
 506  * Use:         Implements the %$\Keccak[1600, n]$% permutation at the core
 507  *              of Keccak and the SHA-3 standard.
 508  */
 509
 510 void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
 511 {
 512   keccak1600_state u, v;
 513   unsigned i = 0;
 514
 515 #ifdef KECCAK_DEBUG
 516   dump_state("init", 0, x);
 517 #endif
 518   keccak1600_round(&u, x, i++); n--;
 519   while (n > 8) {
 520     keccak1600_round(&v, &u, i++);
 521     keccak1600_round(&u, &v, i++);
 522     keccak1600_round(&v, &u, i++);
 523     keccak1600_round(&u, &v, i++);
 524     keccak1600_round(&v, &u, i++);
 525     keccak1600_round(&u, &v, i++);
 526     keccak1600_round(&v, &u, i++);
 527     keccak1600_round(&u, &v, i++);
 528     n -= 8;
 529   }
 530   switch (n) {
 531     case 7: keccak1600_round(&v, &u, i++);
 532             keccak1600_round(&u, &v, i++);
 533     case 5: keccak1600_round(&v, &u, i++);
 534             keccak1600_round(&u, &v, i++);
 535     case 3: keccak1600_round(&v, &u, i++);
 536             keccak1600_round(&u, &v, i++);
 537     case 1: keccak1600_round( z, &u, i++);
 538             break;
 539     case 8: keccak1600_round(&v, &u, i++);
 540             keccak1600_round(&u, &v, i++);
 541     case 6: keccak1600_round(&v, &u, i++);
 542             keccak1600_round(&u, &v, i++);
 543     case 4: keccak1600_round(&v, &u, i++);
 544             keccak1600_round(&u, &v, i++);
 545     case 2: keccak1600_round(&v, &u, i++);
 546             keccak1600_round( z, &v, i++);
 547             break;
 548   }
 549 #ifdef KECCAK_DEBUG
 550   dump_state("final", 0, z);
 551 #endif
 552 }
 553
 554 /* --- @keccack1600_init@ --- *
 555  *
 556  * Arguments:   @keccak1600_state *s@ = a state to initialize
 557  *
 558  * Returns:     ---
 559  *
 560  * Use:         Initialize @s@ to the root state.
 561  */
 562
 563 void keccak1600_init(keccak1600_state *s)
 564   { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
 565
 566 /* --- @keccak1600_mix@ --- *
 567  *
 568  * Arguments:   @keccak1600_state *s@ = a state to update
 569  *              @const kludge64 *p@ = pointer to 64-bit words to mix in
 570  *              @size_t n@ = size of the input, in 64-bit words
 571  *
 572  * Returns:     ---
 573  *
 574  * Use:         Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 575  *              it's the caller's responsibility to pass in no more than
 576  *              %$r$% bits of data.
 577  */
 578
 579 void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
 580 {
 581   unsigned i;
 582   lane a;
 583
 584   for (i = 0; i < n; i++)
 585     { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
 586 }
 587
 588 /* --- @keccak1600_extract@ --- *
 589  *
 590  * Arguments:   @const keccak1600_state *s@ = a state to extract output from
 591  *              @kludge64 *p@ = pointer to 64-bit words to write
 592  *              @size_t n@ = size of the output, in 64-bit words
 593  *
 594  * Returns:     ---
 595  *
 596  * Use:         Reads output from a %$\Keccak[r, 1600 - r]$% state.  Note
 597  *              that it's the caller's responsibility to extract no more than
 598  *              %$r$% bits of data.
 599  */
 600
 601 void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
 602 {
 603   unsigned i;
 604   keccak1600_state t;
 605
 606   t = *s; STATE_OUT(&t);
 607   for (i = 0; i < n; i++) p[i] = FROM_LANE(t.S[i]);
 608 }
 609
 610 /*----- Test rig ----------------------------------------------------------*/
 611
 612 #ifdef TEST_RIG
 613
 614 #include <stdio.h>
 615
 616 #include <mLib/quis.h>
 617 #include <mLib/report.h>
 618 #include <mLib/testrig.h>
 619
 620 static int vrf_p(dstr v[])
 621 {
 622   keccak1600_state u;
 623   kludge64 t[25];
 624   dstr d = DSTR_INIT;
 625   int n;
 626   unsigned i;
 627   int ok = 1;
 628
 629   if (v[0].len != 200) die(1, "bad input size");
 630   if (v[2].len != 200) die(1, "bad output size");
 631   n = *(int *)v[1].buf;
 632   dstr_ensure(&d, 200); d.len = 200;
 633
 634   keccak1600_init(&u);
 635   for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i);
 636   keccak1600_mix(&u, t, 25);
 637   keccak1600_p(&u, &u, n);
 638   keccak1600_extract(&u, t, 25);
 639   for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]);
 640   if (memcmp(d.buf, v[2].buf, 200) != 0) {
 641     ok = 0;
 642     fprintf(stderr, "failed!");
 643     fprintf(stderr, "\n\t     input = "); type_hex.dump(&v[0], stderr);
 644     fprintf(stderr, "\n\t    rounds = %d", n);
 645     fprintf(stderr, "\n\t  expected = "); type_hex.dump(&v[2], stderr);
 646     fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr);
 647   }
 648
 649   dstr_destroy(&d);
 650   return (ok);
 651 }
 652
 653 static test_chunk defs[] = {
 654   { "p", vrf_p, { &type_hex, &type_int, &type_hex } },
 655   { 0, 0, { 0 } }
 656 };
 657
 658 int main(int argc, char *argv[])
 659 {
 660   test_run(argc, argv, defs, SRCDIR"/t/keccak1600");
 661   return (0);
 662 }
 663
 664 #endif
 665
 666 /*----- That's all, folks -------------------------------------------------*/