From: mdw <mdw>
Date: Sun, 12 Oct 2003 14:43:24 +0000 (+0000)
Subject: Universal hashing.
X-Git-Tag: 2.0.4~65
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/mLib/commitdiff_plain/8fe3c82b561e58c894d9b8392ef4003158f44dd8

Universal hashing.
---

diff --git a/man/unihash.3 b/man/unihash.3
new file mode 100644
index 0000000..49ef2e0
--- /dev/null
+++ b/man/unihash.3
@@ -0,0 +1,182 @@
+.\" -*-nroff-*-
+.de VS
+.sp 1
+.RS
+.nf
+.ft B
+..
+.de VE
+.ft R
+.fi
+.RE
+.sp 1
+..
+.de hP
+.IP
+.ft B
+\h'-\w'\\$1\ 'u'\\$1\ \c
+.ft P
+..
+.ie t \{\
+.  ds ss \s8\u
+.  ds se \d\s0
+.  ds us \s8\d
+.  ds ue \u\s0
+.  ds *d \(*d
+.  ds >= \(>=
+.\}
+.el \{\
+.  ds ss ^
+.  ds se
+.  ds us _
+.  ds ue
+.  ds *d \fIdelta\fP
+.  ds >= >=
+.\}
+.TH unihash 3 "5 July 2003" "Straylight/Edgeware" "mLib utilities library"
+.SH NAME
+unihash \- simple and efficient universal hashing for hashtables
+.\" @unihash_setkey
+.\" @UNIHASH_INIT
+.\" @unihash_hash
+.\" @UNIHASH
+.\" @unihash
+.SH SYNOPSIS
+.nf
+.B "#include <mLib/unihash.h>"
+
+.BI "void unihash_setkey(unihash_info *" i ", uint32 " k );
+.BI "uint32 UNIHASH_INIT(const unihash_info *" i );
+.BI "void unihash_hash(const unihash_info *" st ", uint32 " a ,
+.BI "                  const void *" p ", size_t " sz );
+.BI "uint32 unihash(const unihash_info *" i ", const void *" p ", size_t " sz );
+.BI "uint32 UNIHASH(const unihash_info *" i ", const void *" p ", size_t " sz );
+.fi
+.SH DESCRIPTION
+The
+.B unihash
+system implements a simple and relatively efficient
+.IR "universal hashing family" .
+Using a such a universal hashing family means that it's provably
+difficult for an adversary to choose input data whose hashes collide,
+thus guaranteeing good average performance even on maliciously chosen
+data.
+.PP
+Unlike, say,
+.BR crc32 (3),
+the
+.B unihash
+function is
+.I keyed
+\- in addition to the data to be hashed, the function takes as input a
+32-bit key.  This key should be chosen at random each time the program
+runs.
+.SS "Preprocessing a key"
+Before use, a key must be
+.I preprocessed
+into a large (16K) table which is used by the main hashing functions.
+The preprocessing is done by
+.BR unihash_setkey :
+pass it a pointer to a
+.B unihash_info
+structure and the 32-bit key you've chosen, and it stores the table in
+the structure.
+.PP
+Objects of type
+.B unihash_info
+don't contain any pointers to other data and are safe to free when
+you've finished with them; or you can just allocate them statically or
+on the stack if that's more convenient.
+.SS "Hashing data"
+The function
+.B unihash_hash
+takes as input:
+.TP
+.BI "const unihash_info *" i
+A pointer to the precomputed tables for a key.
+.TP
+.BI "uint32 " a 
+An accumulator value.  This should be
+.BI UNIHASH_INIT( i )
+for the first chunk of a multi-chunk input, or the result of the
+previous
+.B unihash_hash
+call for subsequent chunks.
+.TP
+.BI "const void *" p
+A pointer to the start of a buffer containing this chunk of data.
+.TP
+.BI "size_t " sz
+The length of the chunk.
+.PP
+The function returns a new accumulator value, which is also the hash of
+the data so far. So, to hash multiple chunks of data, do something like
+.VS
+uint32 a = UNIHASH_INIT(i);
+a = unihash_hash(i, a, p_0, sz_0);
+a = unihash_hash(i, a, p_1, sz_1);
+/* ... */
+a = unihash_hash(i, a, p_n, sz_n);
+.VE
+The macro
+.B UNIHASH
+and function
+.B unihash
+are convenient interfaces to
+.B unihash_hash
+if you only wanted to hash one chunk.
+.SS "Theoretical issues"
+The hash function implemented by
+.B unihash
+is
+.RI ( l \ +\ 1)/2\*(ss32\*(se-almost
+XOR-universal, where
+.I l
+is the length (in bytes) of the longest string you hash.  That means
+that, for any pair of strings
+.I x
+and
+.I y
+and any 32-bit value \*(*d, the probability taken over all choices of the
+key
+.I k
+that
+.IR H\*(usk\*(ue ( x )\  \c
+.BR xor \c
+.RI \  H\*(usk\*(ue ( y )\ =\ \*(*d
+is no greater than
+.RI ( l \ +\ 1)/2\*(ss32\*(se.
+.PP
+This fact is proven in the header file, but it requires more
+sophisticated typesetting than is available here.
+.PP
+The function evaluates a polynomial over GF(2\*(ss32\*(se) whose
+coefficients are the bytes of the message and whose variable is the key.
+Details are given in the header file.
+.PP
+For best results, you should choose the key as a random 32-bit number
+each time your program starts.  Choosing a different key for different
+hashtables isn't necessary.  It's probably a good idea to avoid the keys
+0 and 1.  This raises the collision bound to
+.RI ( l \ +\ 1)/(2\*(ss32\*(se\ \-\ 2)
+(which isn't a significant increase) but eliminates keys for which the
+hash's behaviour is particularly poor.
+.PP
+In tests,
+.B unihash
+actually performed better than
+.BR crc32 ,
+so if you want to just use it as a fast-ish hash with good statistical
+properties, choose some fixed key
+.IR k \ \*(>=\ 2.
+.PP
+We emphasize that the proof of this function's collision behaviour is
+.I not
+dependent on any unproven assumptions (unlike many `proofs' of
+cryptographic security, which actually reduce the security of some
+construction to the security of its components).  It's just a fact.
+.SH SEE ALSO
+.BR crc32 (3),
+.BR mLib (3).
+.SH AUTHOR
+Mark Wooding (mdw@nsict.org).
diff --git a/unihash.c b/unihash.c
new file mode 100644
index 0000000..39e2f06
--- /dev/null
+++ b/unihash.c
@@ -0,0 +1,159 @@
+/* -*-c-*-
+ *
+ * $Id: unihash.c,v 1.1 2003/10/12 14:43:24 mdw Exp $
+ *
+ * Simple and efficient universal hashing for hashtables
+ *
+ * (c) 2003 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------* 
+ *
+ * This file is part of the mLib utilities library.
+ *
+ * mLib is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ * 
+ * mLib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Library General Public
+ * License along with mLib; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Revision history --------------------------------------------------* 
+ *
+ * $Log: unihash.c,v $
+ * Revision 1.1  2003/10/12 14:43:24  mdw
+ * Universal hashing.
+ *
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "unihash.h"
+
+/*----- Main code ---------------------------------------------------------*/
+
+/* --- @unihash_setkey@ --- *
+ *
+ * Arguments:	@unihash_info *i@ = where to store the precomputed tables
+ *		@uint32 k@ = the key to set, randomly chosen
+ *
+ * Returns:	---
+ *
+ * Use:		Calculates the tables required for efficient hashing.
+ */
+
+static uint32 mul(uint32 x, uint32 y)
+{
+  uint32 z = 0;
+  while (y) {
+    if (y & 1) z ^= x;
+    if (x & (1 << 31))
+      x = U32(x << 1) ^ UNIHASH_POLY;
+    else
+      x = U32(x << 1);
+    y = U32(y >> 1);
+  }
+  return (z);
+}
+
+void unihash_setkey(unihash_info *i, uint32 k)
+{
+  size_t a;
+  size_t b;
+  uint32 x = 1;
+
+  for (a = 0; a < UNIHASH_NBATCH; a++) {
+    x = mul(x, k);
+    for (b = 0; b < 256; b++) {
+      i->s[a][0][b] = mul(x, b <<  0);
+      i->s[a][1][b] = mul(x, b <<  8);
+      i->s[a][2][b] = mul(x, b << 16);
+      i->s[a][3][b] = mul(x, b << 24);
+    }
+  }
+}
+
+/* --- @unihash_hash@ --- *
+ *
+ * Arguments:	@const unihash_info *i@ = pointer to precomputed table
+ *		@uint32 a@ = @i->[0][0][1]@ or value from previous call
+ *		@const void *p@ = pointer to data to hash
+ *		@size_t sz@ = size of the data
+ *
+ * Returns:	---
+ *
+ * Use:		Hashes data.  Call this as many times as needed.  
+ */
+
+uint32 unihash_hash(const unihash_info *i, uint32 a,
+		    const void *p, size_t sz)
+{
+  const octet *pp = p;
+
+  assert(UNIHASH_NBATCH == 4);
+
+#define FULLMULT(u, x)							\
+  (i->s[u][0][U8((x) >>  0)] ^ i->s[u][1][U8((x) >>  8)] ^		\
+   i->s[u][2][U8((x) >> 16)] ^ i->s[u][3][U8((x) >> 24)]);
+
+#define BYTEMULT(u, x) i->s[u][0][x]
+
+  /* --- Do the main bulk in batches of %$n$% bytes --- *
+   *
+   * We have %$a$% and %$m_{n-1}, \ldots, m_1, m_0$%; we want
+   *
+   * %$a' = (a + m_{n-1}) k^n + m_{n-2} k^{n-1} + \cdots + m_1 k^2 + m_0 k$%
+   */
+
+  while (sz >= UNIHASH_NBATCH) {
+    a ^= *pp++;
+    a = FULLMULT(3, a);
+    a ^= BYTEMULT(2, *pp++);
+    a ^= BYTEMULT(1, *pp++);
+    a ^= BYTEMULT(0, *pp++);
+  }
+
+  /* --- The tail end is a smaller batch --- */
+
+  switch (sz) {
+    case  3: a ^= *pp++; a = FULLMULT(2, a); goto batch_2;
+    case  2: a ^= *pp++; a = FULLMULT(1, a); goto batch_1;
+    case  1: a ^= *pp++; a = FULLMULT(0, a); goto batch_0;
+    batch_2: a ^= BYTEMULT(1, *pp++);
+    batch_1: a ^= BYTEMULT(0, *pp++);
+    batch_0: break;
+  }
+
+  return (a);
+}
+
+/* --- @unihash@ --- *
+ *
+ * Arguments:	@const unihash_info *i@ = precomputed tables
+ *		@const void *p@ = pointer to data to hash
+ *		@size_t sz@ = size of the data
+ *
+ * Returns:	The hash value computed.
+ *
+ * Use:		All-in-one hashing function.  No faster than using the
+ * 		separate calls, but more convenient.
+ */
+
+uint32 unihash(const unihash_info *i, const void *p, size_t sz)
+{
+  return (UNIHASH(i, p, sz));
+}
+
+/*----- That's all, folks -------------------------------------------------*/
diff --git a/unihash.h b/unihash.h
new file mode 100644
index 0000000..5fb5741
--- /dev/null
+++ b/unihash.h
@@ -0,0 +1,192 @@
+/* -*-c-*-
+ *
+ * $Id: unihash.h,v 1.1 2003/10/12 14:43:24 mdw Exp $
+ *
+ * Simple and efficient universal hashing for hashtables
+ *
+ * (c) 2003 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------* 
+ *
+ * This file is part of the mLib utilities library.
+ *
+ * mLib is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ * 
+ * mLib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Library General Public
+ * License along with mLib; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Revision history --------------------------------------------------* 
+ *
+ * $Log: unihash.h,v $
+ * Revision 1.1  2003/10/12 14:43:24  mdw
+ * Universal hashing.
+ *
+ */
+
+#ifndef MLIB_UNIHASH_H
+#define MLIB_UNIHASH_H
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+
+/*----- Concept -----------------------------------------------------------*
+ *
+ * Let %$\gf{q}$% be a finite field.  Choose an arbitrary %$k \inr \gf{q}$%.
+ * Let %$M$% be a message.  Injectively pad %$M$% and split it into blocks
+ * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
+ * Then we compute
+ *
+ *   %$H_k(M) = k^{n+1} \sum_{0\le i<n} m_i k^{i+1}.$%
+ *
+ * Note that %$H_0(M) = 0$% for all messages %$M$%.
+ *
+ * If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
+ * is %$(\ell + 1)/q$%-almost universal.  Moreover, if %$q = 2^f$% then
+ * %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
+ *
+ * Proof.  Let %$A$% and %$B$% be two messages, represented by
+ * %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
+ * choose any %$\delta \in \gf{q}$%.  We must bound the probability that
+ *
+ * %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
+ *   %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
+ *
+ * Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
+ * nonzero coefficient of %$k$%.  For if %$n \ne m$% then, without loss of
+ * generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
+ * nonzero.  Alternatively, if %$n = m$% then there must be some
+ * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
+ * messages would be identical; but then the coefficient of %$k^{i+1}$% is
+ * %$a_i - b_i \ne 0$%.
+ *
+ * Hence we have a polynomial equation with degree at most %$\ell + 1$%;
+ * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
+ * %$k$% at random from a set of %$q$%; so the equation is true with
+ * probability at most %$(\ell + 1)/q$%.
+ *
+ * This function can be used as a simple MAC with provable security against
+ * computationally unbounded adversaries.  Simply XOR the hash with a random
+ * string indexed from a large random pad by some nonce sent with the
+ * message.  The probability of a forgery attempt being successful is then
+ * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$n$% is the longest
+ * message permitted.
+ */
+
+/*----- Practicalities ----------------------------------------------------*
+ *
+ * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
+ * %$\{104c11db7}_x$% (this is the standard CRC-32 polynomial).  Our blocks
+ * are bytes.  We append a big-endian byte length.
+ *
+ * The choice of a 32-bit hash is made for pragmatic reasons: we're never
+ * likely to actually want all 32 bits for a real hashtable anyway.  The
+ * truncation result is needed to keep us afloat with smaller tables.
+ *
+ * We compute hashes using a slightly unrolled version of Horner's rule,
+ * using the recurrence:
+ *
+ *   %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
+ *
+ * which involves one full-width multiply and %$b - 1$% one-byte multiplies;
+ * the latter may be efficiently computed using a table lookup.  Start with
+ * %$a_0 = k$%.
+ *
+ * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
+ *
+ *   %$S[u][v][w] = k^{u+1} x^{8v} w$%
+ *     for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
+ *
+ * A one-byte multiply is one lookup; a full-width multiply is four lookups
+ * and three XORs.  The processing required is then %$b + 3$% lookups and
+ * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
+ * the expense of %$4 b$% kilobytes of tables.  This compares relatively
+ * favorably with CRC32.  Indeed, in tests, this implementation with $b = 4$%
+ * is faster than a 32-bit CRC.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <stddef.h>
+
+#ifndef MLIB_BITS_H
+#  include "bits.h"
+#endif
+
+/*----- Data structures ---------------------------------------------------*/
+
+#define UNIHASH_NBATCH 4
+#define UNIHASH_POLY 0x04c11db7		/* From CRC32 */
+
+typedef struct unihash_info {
+  uint32 s[UNIHASH_NBATCH][4][256];	/* S-tables as described */
+} unihash_info;
+
+/*----- Functions provided ------------------------------------------------*/
+
+/* --- @unihash_setkey@ --- *
+ *
+ * Arguments:	@unihash_info *i@ = where to store the precomputed tables
+ *		@uint32 k@ = the key to set, randomly chosen
+ *
+ * Returns:	---
+ *
+ * Use:		Calculates the tables required for efficient hashing.
+ */
+
+extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/);
+
+/* --- @unihash_hash@ --- *
+ *
+ * Arguments:	@const unihash_info *i@ = pointer to precomputed table
+ *		@uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
+ *		@const void *p@ = pointer to data to hash
+ *		@size_t sz@ = size of the data
+ *
+ * Returns:	---
+ *
+ * Use:		Hashes data.  Call this as many times as needed.  
+ */
+
+#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */
+
+extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/,
+			   const void */*p*/, size_t /*sz*/);
+
+/* --- @unihash@ --- *
+ *
+ * Arguments:	@const unihash_info *i@ = precomputed tables
+ *		@const void *p@ = pointer to data to hash
+ *		@size_t sz@ = size of the data
+ *
+ * Returns:	The hash value computed.
+ *
+ * Use:		All-in-one hashing function.  No faster than using the
+ * 		separate calls, but more convenient.
+ */
+
+#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
+
+extern uint32 unihash(const unihash_info */*i*/,
+		      const void */*p*/, size_t /*sz*/);
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif