[catacomb] / symm / keccak1600.c

/* -*-c-*-
 *
 * The Keccak-p[1600, n] permutation
 *
 * (c) 2017 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of Catacomb.
 *
 * Catacomb is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * Catacomb is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with Catacomb; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA.
 */

/*----- Header files ------------------------------------------------------*/

#include <limits.h>
#include <string.h>

#include <mLib/bits.h>

#include "keccak1600.h"

/* #define KECCAK_DEBUG */

/*----- Miscellaneous utilities -------------------------------------------*/

#define I(x, y) ((x) + 5*(y))		/* Column-major indexing */

/*----- Interlacing or not ------------------------------------------------*/

/* We should prefer the interlaced representation if the target is really
 * 32-bit and only providing synthetic 64-bit integers.  Alas, the Windows
 * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
 * so detect x86 specifically.
 */
#if (ULONG_MAX >> 31) <= 0xffffffff && \
  !defined(__amd64__) && !defined(_M_AMD64)
#  define KECCAK_I32
#endif

#ifdef KECCAK_I32
/* A 32-bit target with at best weak support for 64-bit shifts.  Maintain a
 * lane as two 32-bit pieces representing the even and odd bits of the lane.
 * There are slightly fiddly transformations to apply on the way in and out
 * of the main permutation.
 */

typedef keccak1600_lane_i32 lane;
#define S si32

static lane interlace(kludge64 x)
{
  /* Given a 64-bit string X, return a lane Z containing the even- and
   * odd-numbered bits of X.
   *
   * This becomes more manageable if we look at what happens to the bit
   * indices: bit i of X becomes bit ROR_6(i, 1) of Z.  We can effectively
   * swap two bits of the indices by swapping the object bits where those
   * index bits differ.  Fortunately, this is fairly easy.
   *
   * We arrange to swap bits between the two halves of X, rather than within
   * a half.
   */

  uint32 x0 = LO64(x), x1 = HI64(x), t;
  lane z;
							    /* 543210 */
  t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
  t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 354210 */
  t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 254310 */
  t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 154320 */
  t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 054321 */
  z.even = x0; z.odd = x1; return (z);
}

static kludge64 deinterlace(lane x)
{
  /* Given a lane X, return the combined 64-bit value.  This is the inverse
   * to `interlace' above, and the principle is the same
   */

  uint32 x0 = x.even, x1 = x.odd, t;
  kludge64 z;
							    /* 054321 */
  t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 154320 */
  t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 254310 */
  t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 354210 */
  t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 453210 */
  t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
  SET64(z, x1, x0); return (z);
}

#define TO_LANE(x) (interlace(x))
#define FROM_LANE(x) (deinterlace(x))

#define PRINTFMT_LANE "%08lx:%08lx"
#define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd

#define BINOP_LANE(z, op, x, y)						\
  ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
#define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
#define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
#define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
#define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)

#define ROTL_LANE(z, x, n) do {						\
  lane _t = (x);							\
  (z).even = (n)%2 ? ROL32(_t.odd,  ((n) + 1)/2)			\
		   : ROL32(_t.even,  (n)/2);				\
  (z).odd  = (n)%2 ? ROL32(_t.even, ((n) - 1)/2)			\
		   : ROL32(_t.odd,   (n)/2);				\
} while (0)

#define LANE_ZERO {	     0,		 0 }
#define LANE_CMPL { 0xffffffff, 0xffffffff }

static const lane rcon[24] = {
  { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
  { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
  { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
  { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
  { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
  { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
  { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
  { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
  { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
  { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
  { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
  { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
};

#else
/* A target with good support for 64-bit shifts.  We store lanes as 64-bit
 * quantities and deal with them in the obvious, natural way.
 */

typedef keccak1600_lane_64 lane;
#define S s64

#define TO_LANE(x) (x)
#define FROM_LANE(x) (x)

#define PRINTFMT_LANE "%08lx%08lx"
#define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)

#define XOR_LANE(z, x, y) XOR64((z), (x), (y))
#define AND_LANE(z, x, y) AND64((z), (x), (y))
#define OR_LANE(z, x, y) OR64((z), (x), (y))
#define NOT_LANE(z, x) CPL64((z), (x))
#define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))

#define LANE_ZERO X64(       0,        0)
#define LANE_CMPL X64(ffffffff, ffffffff)

static const lane rcon[24] = {
  X64(00000000, 00000001), X64(00000000, 00008082),
  X64(80000000, 0000808a), X64(80000000, 80008000),
  X64(00000000, 0000808b), X64(00000000, 80000001),
  X64(80000000, 80008081), X64(80000000, 00008009),
  X64(00000000, 0000008a), X64(00000000, 00000088),
  X64(00000000, 80008009), X64(00000000, 8000000a),
  X64(00000000, 8000808b), X64(80000000, 0000008b),
  X64(80000000, 00008089), X64(80000000, 00008003),
  X64(80000000, 00008002), X64(80000000, 00000080),
  X64(00000000, 0000800a), X64(80000000, 8000000a),
  X64(80000000, 80008081), X64(80000000, 00008080),
  X64(00000000, 80000001), X64(80000000, 80008008)
};

#endif

/*----- Complementing or not ----------------------------------------------*/

/* We should use the complemented representation if the target doesn't have a
 * fused and-not operation.  There doesn't appear to be a principled way to
 * do this, so we'll just have to make do with a big list.  Worse, in my
 * brief survey of the architecture reference manuals I have lying about,
 * they've split close to 50/50 on this question, so I don't have an
 * especially good way to pick a default.  The `no-fused-op' architectures
 * seem generally a bit more modern than the `fused-op' architectures, so I
 * guess I'll make the complemented representation the default.
 *
 *		and-not		No and-not
 *		-------		----------
 *		ARM (`bic')	x86/amd64
 *		Sparc (`andn')	z/Architecture
 *		MMIX (`andn')	MIPS
 *		IA64 (`andcm')	68k
 *		VAX (`bic')	RISC-V
 *		PDP-10 (`andc')
 */
#if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
      defined(_M_ARM) || defined(_M_THUMB)) &&				\
    !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) ||	\
      defined(_M_IA64)) &&						\
    !defined(__mmix__) &&						\
    !(defined(__sparc__) || defined(__sparc)) &&			\
    !defined(__vax__) &&						\
    !defined(__pdp10__)
#  define KECCAK_COMPL
#endif

#ifdef KECCAK_COMPL
/* A target without fused and/not (`bic', `andc2').  We complement some of
 * the lanes in the initial state and undo this on output.  (Absorbing XORs
 * input into the state, so this is unaffected.)  See the handling of chi in
 * `keccak1600_round' below for the details.
 */

#define COMPL_MASK 0x00121106u

#define STATE_INIT(z) do {						\
  lane cmpl = LANE_CMPL;						\
  (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl;			\
  (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl;			\
  (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl;			\
} while (0)

#define STATE_OUT(z) do {						\
  NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]);				\
  NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]);				\
  NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]);				\
  NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]);				\
  NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]);				\
  NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]);				\
} while (0)

#else
/* A target with fused and/not (`bic', `andc2').  Everything is simple. */

#define COMPL_MASK 0u

#define STATE_INIT(z) do ; while (0)
#define STATE_OUT(z) do ; while (0)

#endif

/*----- Other magic constants ---------------------------------------------*/

/* The rotation constants.  These are systematically named -- see `THETA_RHO'
 * below.
 */
#define ROT_0_0  0
#define ROT_1_0  1
#define ROT_2_0 62
#define ROT_3_0 28
#define ROT_4_0 27

#define ROT_0_1 36
#define ROT_1_1 44
#define ROT_2_1  6
#define ROT_3_1 55
#define ROT_4_1 20

#define ROT_0_2  3
#define ROT_1_2 10
#define ROT_2_2 43
#define ROT_3_2 25
#define ROT_4_2 39

#define ROT_0_3 41
#define ROT_1_3 45
#define ROT_2_3 15
#define ROT_3_3 21
#define ROT_4_3  8

#define ROT_0_4 18
#define ROT_1_4  2
#define ROT_2_4 61
#define ROT_3_4 56
#define ROT_4_4 14

/*----- Debugging ---------------------------------------------------------*/

#ifdef KECCAK_DEBUG

#include <stdio.h>

static void dump_state(const char *what, unsigned ir,
		       const keccak1600_state *x)
{
  unsigned i, j;
  keccak1600_state y;
  kludge64 a;
  int sep;

  printf(";; %s [round %u]\n", what, ir);
  printf(";; raw state...\n");
  for (j = 0; j < 5; j++) {
    printf(";;");
    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
      printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
    fputc('\n', stdout);
  }
  y = *x; STATE_OUT(&y);
#ifdef KECCAK_COMPL
  printf(";; uncomplemented state...\n");
  for (j = 0; j < 5; j++) {
    printf(";;");
    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
      printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
    fputc('\n', stdout);
  }
#endif
#ifdef KECCAK_I32
  printf(";; deinterlaced state...\n");
  for (j = 0; j < 5; j++) {
    printf(";;");
    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
      a = FROM_LANE(y.S[I(i, j)]);
      printf("%c%08lx%08lx", sep,
	     (unsigned long)HI64(a), (unsigned long)LO64(a));
    }
    fputc('\n', stdout);
  }
#endif
  fputc('\n', stdout);
}

#endif

/*----- The Keccak-p[1600, n] permutation ---------------------------------*/

static void keccak1600_round(keccak1600_state *z,
			     const keccak1600_state *x, unsigned i)
{
  /* Perform a round of Keccak-p[1600, n].  Process the state X and write the
   * result to Z.
   */

  lane c[5], d[5], t;

  /* Theta, first step: calculate the column parities. */
#define COLPARITY(j) do {						\
	   d[j] =      x->S[I(j, 0)];					\
  XOR_LANE(d[j], d[j], x->S[I(j, 1)]);					\
  XOR_LANE(d[j], d[j], x->S[I(j, 2)]);					\
  XOR_LANE(d[j], d[j], x->S[I(j, 3)]);					\
  XOR_LANE(d[j], d[j], x->S[I(j, 4)]);					\
} while (0)
  COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
#undef COLPARITY

  /* Theta, second step: calculate the combined effect. */
  ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
  ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
  ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
  ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
  ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);

  /* Now we work plane by plane through the output.  To do this, we must undo
   * the pi transposition.  Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
   * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
   */
#define THETA_RHO(i0, i1, i2, i3, i4) do {				\
									\
  /* First, theta. */							\
  XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]);				\
  XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]);				\
  XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]);				\
  XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]);				\
  XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]);				\
									\
  /* Then rho. */							\
  ROTL_LANE(d[0], d[0], ROT_##i0##_0);					\
  ROTL_LANE(d[1], d[1], ROT_##i1##_1);					\
  ROTL_LANE(d[2], d[2], ROT_##i2##_2);					\
  ROTL_LANE(d[3], d[3], ROT_##i3##_3);					\
  ROTL_LANE(d[4], d[4], ROT_##i4##_4);					\
} while (0)

  /* The basic chi operation is: z = w ^ (~a&b), but this involves an
   * inversion which we can mostly avoid by being clever: observe that
   *
   *		w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
   *
   * by De Morgan's law.  Furthermore, complementing w or z is basically
   * equivalent.  Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
   * implementation overview', describe a pattern of lane complementation
   * which propagates through theta and pi in exactly the right way to be
   * restored easily by chi, here, with exactly one inversion per plane.
   *
   * Here's the pattern.
   *
   *			[ * . * * . ]	     [ . * * . . ]
   *			[ * . * . . ]	     [ . . . * . ]
   *			[ * . * . . ]	->   [ . . * . . ]
   *			[ . * . * * ]	     [ . . * . . ]
   *			[ * . . * . ]	     [ * . . . . ]
   *
   * where a `.' means that the lane is unchanged, and a `*' means that it
   * has been complemented.
   *
   * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
   * inputs w, x, y marked with a `1' are complemented on input, and arrange
   * for z to be complemented on output if z is so marked.
   *
   * The diagrams to the right show the fragment of the complementation
   * pattern being handled by the corresponding line of code.  A symbol in
   * brackets indicates a deviation from the input pattern forced by explicit
   * complementation: there will be exactly one of these for each plane.
   */
#ifdef KECCAK_COMPL
#  define CHI_COMPL(z, x) NOT_LANE((z), (x))
#  define CHI_001_1(z, w, x, y)						\
	(OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
#  define CHI_010_0(z, w, x, y)						\
	(AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
#  define CHI_101_0 CHI_001_1
#  define CHI_110_1 CHI_010_0
#else
#  define CHI(z, w, x, y)						\
	(NOT_LANE((z), (x)),						\
	 AND_LANE((z), (z), (y)),					\
	 XOR_LANE((z), (z), (w)))
#  define CHI_COMPL(z, x) ((z) = (x))
#  define CHI_001_1 CHI
#  define CHI_010_0 CHI
#  define CHI_101_0 CHI
#  define CHI_110_1 CHI
#endif

  /* Let's do the y' = 0 plane first.  Theta and rho are easy with our macro,
   * and we've done pi with the coordinate hacking.  That leaves chi next.
   * This is hairy because we must worry about complementation.
   */
  THETA_RHO(0, 1, 2, 3, 4);
  CHI_COMPL(t, d[2]);			      /*	 [.]		   */
  CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
  CHI_001_1(z->S[I(1, 0)], d[1], t,    d[3]); /*      .  [.]  *      ->  * */
  CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /*          *   *   .  ->  * */
  CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
  CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /*  *   .           .  ->  . */

  /* We'd better do iota before we forget. */
  XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);

  /* That was fun.  Maybe y' = 1 will be as good. */
  THETA_RHO(3, 4, 0, 1, 2);
  CHI_COMPL(t, d[4]);			      /*		 [*]	   */
  CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
  CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
  CHI_101_0(z->S[I(2, 1)], d[2], d[3], t);    /*          *   .  [*] ->  . */
  CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /*  *           .   .  ->  * */
  CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /*  *   .           .  ->  . */

  /* We're getting the hang of this.  The y' = 2 plane shouldn't be any
   * trouble.
   */
  THETA_RHO(1, 2, 3, 4, 0);
  CHI_COMPL(t, d[3]);			      /*	     [*]	   */
  CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
  CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
  CHI_110_1(z->S[I(2, 2)], d[2], t,    d[4]); /*          *  [*]  .  ->  * */
  CHI_101_0(z->S[I(3, 2)], t,    d[4], d[0]); /*  *          [*]  .  ->  . */
  CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /*  *   .           .  ->  . */

  /* This isn't as interesting any more.  Let's do y' = 3 before boredom sets
   * in.
   */
  THETA_RHO(4, 0, 1, 2, 3);
  CHI_COMPL(t, d[3]);			      /*	     [.]	   */
  CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /*  .   *   .          ->  . */
  CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /*      *   .   *      ->  . */
  CHI_001_1(z->S[I(2, 3)], d[2], t,    d[4]); /*          .  [.]  *  ->  * */
  CHI_010_0(z->S[I(3, 3)], t,    d[4], d[0]); /*  .          [.]  *  ->  . */
  CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /*  .   *           *  ->  . */

  /* Last plane.  Just y' = 4 to go. */
  THETA_RHO(2, 3, 4, 0, 1);
  CHI_COMPL(t, d[1]);			      /*     [*]		   */
  CHI_110_1(z->S[I(0, 4)], d[0], t,    d[2]); /*  *  [*]  .          ->  * */
  CHI_101_0(z->S[I(1, 4)], t,    d[2], d[3]); /*     [*]  .   *      ->  . */
  CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /*          .   *   .  ->  . */
  CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
  CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /*  *   .           .  ->  . */

  /* And we're done. */
#undef THETA_RHO
#undef CHI_COMPL
#undef CHI_001_1
#undef CHI_010_0
#undef CHI_101_0
#undef CHI_110_1
#undef CHI
}

/* --- @keccak1600_p@ --- *
 *
 * Arguments:	@keccak1600_state *z@ = where to write the output state
 *		@conts keccak1600_state *x@ = input state
 *		@unsigned n@ = number of rounds to perform
 *
 * Returns:	---
 *
 * Use:		Implements the %$\Keccak[1600, n]$% permutation at the core
 *		of Keccak and the SHA-3 standard.
 */

void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
{
  keccak1600_state u, v;
  unsigned i = 0;

#ifdef KECCAK_DEBUG
  dump_state("init", 0, x);
#endif
  keccak1600_round(&u, x, i++); n--;
  while (n > 8) {
    keccak1600_round(&v, &u, i++);
    keccak1600_round(&u, &v, i++);
    keccak1600_round(&v, &u, i++);
    keccak1600_round(&u, &v, i++);
    keccak1600_round(&v, &u, i++);
    keccak1600_round(&u, &v, i++);
    keccak1600_round(&v, &u, i++);
    keccak1600_round(&u, &v, i++);
    n -= 8;
  }
  switch (n) {
    case 7: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 5: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 3: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 1: keccak1600_round( z, &u, i++);
	    break;
    case 8: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 6: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 4: keccak1600_round(&v, &u, i++);
	    keccak1600_round(&u, &v, i++);
    case 2: keccak1600_round(&v, &u, i++);
	    keccak1600_round( z, &v, i++);
	    break;
  }
#ifdef KECCAK_DEBUG
  dump_state("final", 0, z);
#endif
}

/* --- @keccack1600_init@ --- *
 *
 * Arguments:	@keccak1600_state *s@ = a state to initialize
 *
 * Returns:	---
 *
 * Use:		Initialize @s@ to the root state.
 */

void keccak1600_init(keccak1600_state *s)
  { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }

/* --- @keccak1600_mix@ --- *
 *
 * Arguments:	@keccak1600_state *s@ = a state to update
 *		@const kludge64 *p@ = pointer to 64-bit words to mix in
 *		@size_t n@ = size of the input, in 64-bit words
 *
 * Returns:	---
 *
 * Use:		Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 *		it's the caller's responsibility to pass in no more than
 *		%$r$% bits of data.
 */

void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
{
  unsigned i;
  lane a;

  for (i = 0; i < n; i++)
    { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
}

/* --- @keccak1600_set@ --- *
 *
 * Arguments:	@keccak1600_state *s@ = a state to update
 *		@const kludge64 *p@ = pointer to 64-bit words to mix in
 *		@size_t n@ = size of the input, in 64-bit words
 *
 * Returns:	---
 *
 * Use:		Stores data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 *		it's the caller's responsibility to pass in no more than
 *		%$r$% bits of data.
 *
 *		This is not the operation you wanted for ordinary hashing.
 *		It's provided for the use of higher-level protocols which use
 *		duplexing and other fancy sponge features.
 */

void keccak1600_set(keccak1600_state *s, const kludge64 *p, size_t n)
{
  uint32 m = COMPL_MASK;
  unsigned i;
  lane a;

  for (i = 0; i < n; i++) {
    a = TO_LANE(p[i]); if (m&1) NOT_LANE(a, a);
    s->S[i] = a; m >>= 1;
  }
}

/* --- @keccak1600_extract@ --- *
 *
 * Arguments:	@const keccak1600_state *s@ = a state to extract output from
 *		@kludge64 *p@ = pointer to 64-bit words to write
 *		@size_t n@ = size of the output, in 64-bit words
 *
 * Returns:	---
 *
 * Use:		Reads output from a %$\Keccak[r, 1600 - r]$% state.  Note
 *		that it's the caller's responsibility to extract no more than
 *		%$r$% bits of data.
 */

void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
{
  uint32 m = COMPL_MASK;
  unsigned i;
  lane t;

  for (i = 0; i < n; i++) {
    t = s->S[i]; if (m&1) NOT_LANE(t, t);
    *p++ = FROM_LANE(t); m >>= 1;
  }
}

/*----- Test rig ----------------------------------------------------------*/

#ifdef TEST_RIG

#include <stdio.h>

#include <mLib/macros.h>
#include <mLib/quis.h>
#include <mLib/report.h>
#include <mLib/testrig.h>

static int vrf_p(dstr v[])
{
  keccak1600_state u;
  kludge64 t[25];
  dstr d = DSTR_INIT;
  int n;
  unsigned i;
  int ok = 1;

  if (v[0].len != 200) die(1, "bad input size");
  if (v[2].len != 200) die(1, "bad output size");
  n = *(int *)v[1].buf;
  dstr_ensure(&d, 200); d.len = 200;

  keccak1600_init(&u);
  for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i);
  keccak1600_mix(&u, t, 25);
  keccak1600_p(&u, &u, n);
  keccak1600_extract(&u, t, 25);
  for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]);
  if (MEMCMP(d.buf, !=, v[2].buf, 200)) {
    ok = 0;
    fprintf(stderr, "failed!");
    fprintf(stderr, "\n\t     input = "); type_hex.dump(&v[0], stderr);
    fprintf(stderr, "\n\t    rounds = %d", n);
    fprintf(stderr, "\n\t  expected = "); type_hex.dump(&v[2], stderr);
    fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr);
  }

  dstr_destroy(&d);
  return (ok);
}

static test_chunk defs[] = {
  { "p", vrf_p, { &type_hex, &type_int, &type_hex } },
  { 0, 0, { 0 } }
};

int main(int argc, char *argv[])
{
  test_run(argc, argv, defs, SRCDIR"/t/keccak1600");
  return (0);
}

#endif

/*----- That's all, folks -------------------------------------------------*/