[mLib] / hash / unihash.h

/* -*-c-*-
 *
 * Simple and efficient universal hashing for hashtables
 *
 * (c) 2003 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA.
 */

#ifndef MLIB_UNIHASH_H
#define MLIB_UNIHASH_H

#ifdef __cplusplus
  extern "C" {
#endif


/*----- Concept -----------------------------------------------------------*
 *
 * Let %$\gf{q}$% be a finite field.  Choose an arbitrary %$k \inr \gf{q}$%.
 * Let %$M$% be a message.  Injectively pad %$M$% and split it into blocks
 * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
 * Then we compute
 *
 *   %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$%
 *
 * Note that %$H_0(M) = 0$% for all messages %$M$%.
 *
 * If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
 * is %$(\ell + 1)/q$%-almost universal.  Moreover, if %$q = 2^f$% then
 * %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
 *
 * Proof.  Let %$A$% and %$B$% be two messages, represented by
 * %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
 * choose any %$\delta \in \gf{q}$%.  We must bound the probability that
 *
 * %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
 *   %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
 *
 * Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
 * nonzero coefficient of %$k$%.  For if %$n \ne m$% then, without loss of
 * generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
 * nonzero.  Alternatively, if %$n = m$% then there must be some
 * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
 * messages would be identical; but then the coefficient of %$k^{i+1}$% is
 * %$a_i - b_i \ne 0$%.
 *
 * Hence we have a polynomial equation with degree at most %$\ell + 1$%;
 * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
 * %$k$% at random from a set of %$q$%; so the equation is true with
 * probability at most %$(\ell + 1)/q$%.
 *
 * This function can be used as a simple MAC with provable security against
 * computationally unbounded adversaries.  Simply XOR the hash with a random
 * string indexed from a large random pad by some nonce sent with the
 * message.  The probability of a forgery attempt being successful is then
 * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the
 * longest message permitted.
 */

/*----- Practicalities ----------------------------------------------------*
 *
 * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
 * %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial).  Our
 * blocks are bytes.
 *
 * The choice of a 32-bit hash is made for pragmatic reasons: we're never
 * likely to actually want all 32 bits for a real hashtable anyway.  The
 * truncation result is needed to keep us afloat with smaller tables.
 *
 * We compute hashes using a slightly unrolled version of Horner's rule,
 * using the recurrence:
 *
 *   %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
 *
 * which involves one full-width multiply and %$b - 1$% one-byte multiplies;
 * the latter may be efficiently computed using a table lookup.  Start with
 * %$a_0 = k$%.
 *
 * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
 *
 *   %$S[u][v][w] = k^{u+1} x^{8v} w$%
 *     for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
 *
 * A one-byte multiply is one lookup; a full-width multiply is four lookups
 * and three XORs.  The processing required is then %$b + 3$% lookups and
 * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
 * the expense of %$4 b$% kilobytes of tables.  This compares relatively
 * favorably with CRC32.  Indeed, in tests, this implementation with $b = 4$%
 * is faster than a 32-bit CRC.
 */

/*----- Header files ------------------------------------------------------*/

#include <stddef.h>

#ifndef MLIB_BITS_H
#  include "bits.h"
#endif

/*----- Data structures ---------------------------------------------------*/

#define UNIHASH_NBATCH 4
#define UNIHASH_POLY 0x04c11db7		/* From CRC32 */

typedef struct unihash_info {
  uint32 s[UNIHASH_NBATCH][4][256];	/* S-tables as described */
} unihash_info;

/*----- A global hash-info table ------------------------------------------*/

extern unihash_info unihash_global;	/* Key this if you like */

/*----- Functions provided ------------------------------------------------*/

/* --- @unihash_setkey@ --- *
 *
 * Arguments:	@unihash_info *i@ = where to store the precomputed tables
 *		@uint32 k@ = the key to set, randomly chosen
 *
 * Returns:	---
 *
 * Use:		Calculates the tables required for efficient hashing.
 */

extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/);

/* --- @unihash_hash@ --- *
 *
 * Arguments:	@const unihash_info *i@ = pointer to precomputed table
 *		@uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
 *		@const void *p@ = pointer to data to hash
 *		@size_t sz@ = size of the data
 *
 * Returns:	Hash of data so far.
 *
 * Use:		Hashes data.  Call this as many times as needed.
 */

#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */

extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/,
			   const void */*p*/, size_t /*sz*/);

/* --- @unihash@ --- *
 *
 * Arguments:	@const unihash_info *i@ = precomputed tables
 *		@const void *p@ = pointer to data to hash
 *		@size_t sz@ = size of the data
 *
 * Returns:	The hash value computed.
 *
 * Use:		All-in-one hashing function.  No faster than using the
 *		separate calls, but more convenient.
 */

#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))

extern uint32 unihash(const unihash_info */*i*/,
		      const void */*p*/, size_t /*sz*/);

/*----- That's all, folks -------------------------------------------------*/

#ifdef __cplusplus
  }
#endif

#endif
Commit	Line	Data
8fe3c82b	1	/* --c--
8fe3c82b	2	*
	3	* Simple and efficient universal hashing for hashtables
	4	*
	5	* (c) 2003 Straylight/Edgeware
	6	*/
	7
d4efbcd9	8	/----- Licensing notice --------------------------------------------------
8fe3c82b	9	*
	10	* This file is part of the mLib utilities library.
	11	*
	12	* mLib is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
d4efbcd9	16	*
8fe3c82b	17	* mLib is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
d4efbcd9	21	*
8fe3c82b	22	* You should have received a copy of the GNU Library General Public
	23	* License along with mLib; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
8fe3c82b	28	#ifndef MLIB_UNIHASH_H
	29	#define MLIB_UNIHASH_H
	30
	31	#ifdef __cplusplus
	32	extern "C" {
	33	#endif
	34
	35
	36	/----- Concept -----------------------------------------------------------
	37	*
	38	* Let %$\gf{q}$% be a finite field. Choose an arbitrary %$k \inr \gf{q}$%.
	39	* Let %$M$% be a message. Injectively pad %$M$% and split it into blocks
	40	* $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
	41	* Then we compute
	42	*
573eadb5	43	* %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$%
8fe3c82b	44	*
	45	* Note that %$H_0(M) = 0$% for all messages %$M$%.
	46	*
	47	* If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
	48	* is %$(\ell + 1)/q$%-almost universal. Moreover, if %$q = 2^f$% then
	49	* %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
	50	*
	51	* Proof. Let %$A$% and %$B$% be two messages, represented by
	52	* %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
	53	* choose any %$\delta \in \gf{q}$%. We must bound the probability that
	54	*
	55	* %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
	56	* %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
	57	*
	58	* Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
	59	* nonzero coefficient of %$k$%. For if %$n \ne m$% then, without loss of
	60	* generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
	61	* nonzero. Alternatively, if %$n = m$% then there must be some
	62	* %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
	63	* messages would be identical; but then the coefficient of %$k^{i+1}$% is
	64	* %$a_i - b_i \ne 0$%.
	65	*
	66	* Hence we have a polynomial equation with degree at most %$\ell + 1$%;
	67	* there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
	68	* %$k$% at random from a set of %$q$%; so the equation is true with
	69	* probability at most %$(\ell + 1)/q$%.
	70	*
	71	* This function can be used as a simple MAC with provable security against
	72	* computationally unbounded adversaries. Simply XOR the hash with a random
	73	* string indexed from a large random pad by some nonce sent with the
	74	* message. The probability of a forgery attempt being successful is then
573eadb5	75	* %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the
573eadb5	76	* longest message permitted.
8fe3c82b	77	*/
	78
	79	/----- Practicalities ----------------------------------------------------
	80	*
	81	* We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
573eadb5	82	* %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial). Our
573eadb5	83	* blocks are bytes.
8fe3c82b	84	*
	85	* The choice of a 32-bit hash is made for pragmatic reasons: we're never
	86	* likely to actually want all 32 bits for a real hashtable anyway. The
	87	* truncation result is needed to keep us afloat with smaller tables.
	88	*
	89	* We compute hashes using a slightly unrolled version of Horner's rule,
	90	* using the recurrence:
	91	*
	92	* %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
	93	*
	94	* which involves one full-width multiply and %$b - 1$% one-byte multiplies;
	95	* the latter may be efficiently computed using a table lookup. Start with
	96	* %$a_0 = k$%.
	97	*
	98	* We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
	99	*
	100	* %$S[u][v][w] = k^{u+1} x^{8v} w$%
	101	* for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
	102	*
	103	* A one-byte multiply is one lookup; a full-width multiply is four lookups
	104	* and three XORs. The processing required is then %$b + 3$% lookups and
	105	* %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
	106	* the expense of %$4 b$% kilobytes of tables. This compares relatively
	107	* favorably with CRC32. Indeed, in tests, this implementation with $b = 4$%
	108	* is faster than a 32-bit CRC.
	109	*/
	110
	111	/----- Header files ------------------------------------------------------/
	112
	113	#include <stddef.h>
	114
	115	#ifndef MLIB_BITS_H
	116	# include "bits.h"
	117	#endif
	118
	119	/----- Data structures ---------------------------------------------------/
	120
	121	#define UNIHASH_NBATCH 4
	122	#define UNIHASH_POLY 0x04c11db7 /* From CRC32 */
	123
	124	typedef struct unihash_info {
	125	uint32 s[UNIHASH_NBATCH][4][256]; /* S-tables as described */
	126	} unihash_info;
	127
6f444bda	128	/----- A global hash-info table ------------------------------------------/
	129
	130	extern unihash_info unihash_global; /* Key this if you like */
	131
8fe3c82b	132	/----- Functions provided ------------------------------------------------/
	133
	134	/* --- @unihash_setkey@ --- *
	135	*
	136	* Arguments: @unihash_info *i@ = where to store the precomputed tables
	137	* @uint32 k@ = the key to set, randomly chosen
	138	*
	139	* Returns: ---
	140	*
	141	* Use: Calculates the tables required for efficient hashing.
	142	*/
	143
	144	extern void unihash_setkey(unihash_info /i/, uint32 /k*/);
	145
	146	/* --- @unihash_hash@ --- *
	147	*
	148	* Arguments: @const unihash_info *i@ = pointer to precomputed table
	149	* @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
	150	* @const void *p@ = pointer to data to hash
	151	* @size_t sz@ = size of the data
	152	*
573eadb5	153	* Returns: Hash of data so far.
8fe3c82b	154	*
d4efbcd9	155	* Use: Hashes data. Call this as many times as needed.
8fe3c82b	156	*/
	157
	158	#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */
	159
	160	extern uint32 unihash_hash(const unihash_info /i/, uint32 /a*/,
	161	const void /p/, size_t /sz*/);
	162
	163	/* --- @unihash@ --- *
	164	*
	165	* Arguments: @const unihash_info *i@ = precomputed tables
	166	* @const void *p@ = pointer to data to hash
	167	* @size_t sz@ = size of the data
	168	*
	169	* Returns: The hash value computed.
	170	*
	171	* Use: All-in-one hashing function. No faster than using the
d4efbcd9	172	* separate calls, but more convenient.
8fe3c82b	173	*/
	174
	175	#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
	176
	177	extern uint32 unihash(const unihash_info /i*/,
	178	const void /p/, size_t /sz*/);
	179
	180	/----- That's all, folks -------------------------------------------------/
	181
	182	#ifdef __cplusplus
	183	}
	184	#endif
	185
	186	#endif