From: mdw Date: Sun, 12 Oct 2003 14:43:24 +0000 (+0000) Subject: Universal hashing. X-Git-Tag: 2.0.4~65 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/mLib/commitdiff_plain/8fe3c82b561e58c894d9b8392ef4003158f44dd8 Universal hashing. --- diff --git a/man/unihash.3 b/man/unihash.3 new file mode 100644 index 0000000..49ef2e0 --- /dev/null +++ b/man/unihash.3 @@ -0,0 +1,182 @@ +.\" -*-nroff-*- +.de VS +.sp 1 +.RS +.nf +.ft B +.. +.de VE +.ft R +.fi +.RE +.sp 1 +.. +.de hP +.IP +.ft B +\h'-\w'\\$1\ 'u'\\$1\ \c +.ft P +.. +.ie t \{\ +. ds ss \s8\u +. ds se \d\s0 +. ds us \s8\d +. ds ue \u\s0 +. ds *d \(*d +. ds >= \(>= +.\} +.el \{\ +. ds ss ^ +. ds se +. ds us _ +. ds ue +. ds *d \fIdelta\fP +. ds >= >= +.\} +.TH unihash 3 "5 July 2003" "Straylight/Edgeware" "mLib utilities library" +.SH NAME +unihash \- simple and efficient universal hashing for hashtables +.\" @unihash_setkey +.\" @UNIHASH_INIT +.\" @unihash_hash +.\" @UNIHASH +.\" @unihash +.SH SYNOPSIS +.nf +.B "#include " + +.BI "void unihash_setkey(unihash_info *" i ", uint32 " k ); +.BI "uint32 UNIHASH_INIT(const unihash_info *" i ); +.BI "void unihash_hash(const unihash_info *" st ", uint32 " a , +.BI " const void *" p ", size_t " sz ); +.BI "uint32 unihash(const unihash_info *" i ", const void *" p ", size_t " sz ); +.BI "uint32 UNIHASH(const unihash_info *" i ", const void *" p ", size_t " sz ); +.fi +.SH DESCRIPTION +The +.B unihash +system implements a simple and relatively efficient +.IR "universal hashing family" . +Using a such a universal hashing family means that it's provably +difficult for an adversary to choose input data whose hashes collide, +thus guaranteeing good average performance even on maliciously chosen +data. +.PP +Unlike, say, +.BR crc32 (3), +the +.B unihash +function is +.I keyed +\- in addition to the data to be hashed, the function takes as input a +32-bit key. This key should be chosen at random each time the program +runs. +.SS "Preprocessing a key" +Before use, a key must be +.I preprocessed +into a large (16K) table which is used by the main hashing functions. +The preprocessing is done by +.BR unihash_setkey : +pass it a pointer to a +.B unihash_info +structure and the 32-bit key you've chosen, and it stores the table in +the structure. +.PP +Objects of type +.B unihash_info +don't contain any pointers to other data and are safe to free when +you've finished with them; or you can just allocate them statically or +on the stack if that's more convenient. +.SS "Hashing data" +The function +.B unihash_hash +takes as input: +.TP +.BI "const unihash_info *" i +A pointer to the precomputed tables for a key. +.TP +.BI "uint32 " a +An accumulator value. This should be +.BI UNIHASH_INIT( i ) +for the first chunk of a multi-chunk input, or the result of the +previous +.B unihash_hash +call for subsequent chunks. +.TP +.BI "const void *" p +A pointer to the start of a buffer containing this chunk of data. +.TP +.BI "size_t " sz +The length of the chunk. +.PP +The function returns a new accumulator value, which is also the hash of +the data so far. So, to hash multiple chunks of data, do something like +.VS +uint32 a = UNIHASH_INIT(i); +a = unihash_hash(i, a, p_0, sz_0); +a = unihash_hash(i, a, p_1, sz_1); +/* ... */ +a = unihash_hash(i, a, p_n, sz_n); +.VE +The macro +.B UNIHASH +and function +.B unihash +are convenient interfaces to +.B unihash_hash +if you only wanted to hash one chunk. +.SS "Theoretical issues" +The hash function implemented by +.B unihash +is +.RI ( l \ +\ 1)/2\*(ss32\*(se-almost +XOR-universal, where +.I l +is the length (in bytes) of the longest string you hash. That means +that, for any pair of strings +.I x +and +.I y +and any 32-bit value \*(*d, the probability taken over all choices of the +key +.I k +that +.IR H\*(usk\*(ue ( x )\ \c +.BR xor \c +.RI \ H\*(usk\*(ue ( y )\ =\ \*(*d +is no greater than +.RI ( l \ +\ 1)/2\*(ss32\*(se. +.PP +This fact is proven in the header file, but it requires more +sophisticated typesetting than is available here. +.PP +The function evaluates a polynomial over GF(2\*(ss32\*(se) whose +coefficients are the bytes of the message and whose variable is the key. +Details are given in the header file. +.PP +For best results, you should choose the key as a random 32-bit number +each time your program starts. Choosing a different key for different +hashtables isn't necessary. It's probably a good idea to avoid the keys +0 and 1. This raises the collision bound to +.RI ( l \ +\ 1)/(2\*(ss32\*(se\ \-\ 2) +(which isn't a significant increase) but eliminates keys for which the +hash's behaviour is particularly poor. +.PP +In tests, +.B unihash +actually performed better than +.BR crc32 , +so if you want to just use it as a fast-ish hash with good statistical +properties, choose some fixed key +.IR k \ \*(>=\ 2. +.PP +We emphasize that the proof of this function's collision behaviour is +.I not +dependent on any unproven assumptions (unlike many `proofs' of +cryptographic security, which actually reduce the security of some +construction to the security of its components). It's just a fact. +.SH SEE ALSO +.BR crc32 (3), +.BR mLib (3). +.SH AUTHOR +Mark Wooding (mdw@nsict.org). diff --git a/unihash.c b/unihash.c new file mode 100644 index 0000000..39e2f06 --- /dev/null +++ b/unihash.c @@ -0,0 +1,159 @@ +/* -*-c-*- + * + * $Id: unihash.c,v 1.1 2003/10/12 14:43:24 mdw Exp $ + * + * Simple and efficient universal hashing for hashtables + * + * (c) 2003 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of the mLib utilities library. + * + * mLib is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * mLib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with mLib; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + */ + +/*----- Revision history --------------------------------------------------* + * + * $Log: unihash.c,v $ + * Revision 1.1 2003/10/12 14:43:24 mdw + * Universal hashing. + * + */ + +/*----- Header files ------------------------------------------------------*/ + +#include +#include + +#include "unihash.h" + +/*----- Main code ---------------------------------------------------------*/ + +/* --- @unihash_setkey@ --- * + * + * Arguments: @unihash_info *i@ = where to store the precomputed tables + * @uint32 k@ = the key to set, randomly chosen + * + * Returns: --- + * + * Use: Calculates the tables required for efficient hashing. + */ + +static uint32 mul(uint32 x, uint32 y) +{ + uint32 z = 0; + while (y) { + if (y & 1) z ^= x; + if (x & (1 << 31)) + x = U32(x << 1) ^ UNIHASH_POLY; + else + x = U32(x << 1); + y = U32(y >> 1); + } + return (z); +} + +void unihash_setkey(unihash_info *i, uint32 k) +{ + size_t a; + size_t b; + uint32 x = 1; + + for (a = 0; a < UNIHASH_NBATCH; a++) { + x = mul(x, k); + for (b = 0; b < 256; b++) { + i->s[a][0][b] = mul(x, b << 0); + i->s[a][1][b] = mul(x, b << 8); + i->s[a][2][b] = mul(x, b << 16); + i->s[a][3][b] = mul(x, b << 24); + } + } +} + +/* --- @unihash_hash@ --- * + * + * Arguments: @const unihash_info *i@ = pointer to precomputed table + * @uint32 a@ = @i->[0][0][1]@ or value from previous call + * @const void *p@ = pointer to data to hash + * @size_t sz@ = size of the data + * + * Returns: --- + * + * Use: Hashes data. Call this as many times as needed. + */ + +uint32 unihash_hash(const unihash_info *i, uint32 a, + const void *p, size_t sz) +{ + const octet *pp = p; + + assert(UNIHASH_NBATCH == 4); + +#define FULLMULT(u, x) \ + (i->s[u][0][U8((x) >> 0)] ^ i->s[u][1][U8((x) >> 8)] ^ \ + i->s[u][2][U8((x) >> 16)] ^ i->s[u][3][U8((x) >> 24)]); + +#define BYTEMULT(u, x) i->s[u][0][x] + + /* --- Do the main bulk in batches of %$n$% bytes --- * + * + * We have %$a$% and %$m_{n-1}, \ldots, m_1, m_0$%; we want + * + * %$a' = (a + m_{n-1}) k^n + m_{n-2} k^{n-1} + \cdots + m_1 k^2 + m_0 k$% + */ + + while (sz >= UNIHASH_NBATCH) { + a ^= *pp++; + a = FULLMULT(3, a); + a ^= BYTEMULT(2, *pp++); + a ^= BYTEMULT(1, *pp++); + a ^= BYTEMULT(0, *pp++); + } + + /* --- The tail end is a smaller batch --- */ + + switch (sz) { + case 3: a ^= *pp++; a = FULLMULT(2, a); goto batch_2; + case 2: a ^= *pp++; a = FULLMULT(1, a); goto batch_1; + case 1: a ^= *pp++; a = FULLMULT(0, a); goto batch_0; + batch_2: a ^= BYTEMULT(1, *pp++); + batch_1: a ^= BYTEMULT(0, *pp++); + batch_0: break; + } + + return (a); +} + +/* --- @unihash@ --- * + * + * Arguments: @const unihash_info *i@ = precomputed tables + * @const void *p@ = pointer to data to hash + * @size_t sz@ = size of the data + * + * Returns: The hash value computed. + * + * Use: All-in-one hashing function. No faster than using the + * separate calls, but more convenient. + */ + +uint32 unihash(const unihash_info *i, const void *p, size_t sz) +{ + return (UNIHASH(i, p, sz)); +} + +/*----- That's all, folks -------------------------------------------------*/ diff --git a/unihash.h b/unihash.h new file mode 100644 index 0000000..5fb5741 --- /dev/null +++ b/unihash.h @@ -0,0 +1,192 @@ +/* -*-c-*- + * + * $Id: unihash.h,v 1.1 2003/10/12 14:43:24 mdw Exp $ + * + * Simple and efficient universal hashing for hashtables + * + * (c) 2003 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of the mLib utilities library. + * + * mLib is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * mLib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with mLib; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + */ + +/*----- Revision history --------------------------------------------------* + * + * $Log: unihash.h,v $ + * Revision 1.1 2003/10/12 14:43:24 mdw + * Universal hashing. + * + */ + +#ifndef MLIB_UNIHASH_H +#define MLIB_UNIHASH_H + +#ifdef __cplusplus + extern "C" { +#endif + + +/*----- Concept -----------------------------------------------------------* + * + * Let %$\gf{q}$% be a finite field. Choose an arbitrary %$k \inr \gf{q}$%. + * Let %$M$% be a message. Injectively pad %$M$% and split it into blocks + * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%. + * Then we compute + * + * %$H_k(M) = k^{n+1} \sum_{0\le i m$%, and hence the coefficient of %$k_n$% is + * nonzero. Alternatively, if %$n = m$% then there must be some + * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the + * messages would be identical; but then the coefficient of %$k^{i+1}$% is + * %$a_i - b_i \ne 0$%. + * + * Hence we have a polynomial equation with degree at most %$\ell + 1$%; + * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose + * %$k$% at random from a set of %$q$%; so the equation is true with + * probability at most %$(\ell + 1)/q$%. + * + * This function can be used as a simple MAC with provable security against + * computationally unbounded adversaries. Simply XOR the hash with a random + * string indexed from a large random pad by some nonce sent with the + * message. The probability of a forgery attempt being successful is then + * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$n$% is the longest + * message permitted. + */ + +/*----- Practicalities ----------------------------------------------------* + * + * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo + * %$\{104c11db7}_x$% (this is the standard CRC-32 polynomial). Our blocks + * are bytes. We append a big-endian byte length. + * + * The choice of a 32-bit hash is made for pragmatic reasons: we're never + * likely to actually want all 32 bits for a real hashtable anyway. The + * truncation result is needed to keep us afloat with smaller tables. + * + * We compute hashes using a slightly unrolled version of Horner's rule, + * using the recurrence: + * + * %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$% + * + * which involves one full-width multiply and %$b - 1$% one-byte multiplies; + * the latter may be efficiently computed using a table lookup. Start with + * %$a_0 = k$%. + * + * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where + * + * %$S[u][v][w] = k^{u+1} x^{8v} w$% + * for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%. + * + * A one-byte multiply is one lookup; a full-width multiply is four lookups + * and three XORs. The processing required is then %$b + 3$% lookups and + * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at + * the expense of %$4 b$% kilobytes of tables. This compares relatively + * favorably with CRC32. Indeed, in tests, this implementation with $b = 4$% + * is faster than a 32-bit CRC. + */ + +/*----- Header files ------------------------------------------------------*/ + +#include + +#ifndef MLIB_BITS_H +# include "bits.h" +#endif + +/*----- Data structures ---------------------------------------------------*/ + +#define UNIHASH_NBATCH 4 +#define UNIHASH_POLY 0x04c11db7 /* From CRC32 */ + +typedef struct unihash_info { + uint32 s[UNIHASH_NBATCH][4][256]; /* S-tables as described */ +} unihash_info; + +/*----- Functions provided ------------------------------------------------*/ + +/* --- @unihash_setkey@ --- * + * + * Arguments: @unihash_info *i@ = where to store the precomputed tables + * @uint32 k@ = the key to set, randomly chosen + * + * Returns: --- + * + * Use: Calculates the tables required for efficient hashing. + */ + +extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/); + +/* --- @unihash_hash@ --- * + * + * Arguments: @const unihash_info *i@ = pointer to precomputed table + * @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call + * @const void *p@ = pointer to data to hash + * @size_t sz@ = size of the data + * + * Returns: --- + * + * Use: Hashes data. Call this as many times as needed. + */ + +#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */ + +extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/, + const void */*p*/, size_t /*sz*/); + +/* --- @unihash@ --- * + * + * Arguments: @const unihash_info *i@ = precomputed tables + * @const void *p@ = pointer to data to hash + * @size_t sz@ = size of the data + * + * Returns: The hash value computed. + * + * Use: All-in-one hashing function. No faster than using the + * separate calls, but more convenient. + */ + +#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz))) + +extern uint32 unihash(const unihash_info */*i*/, + const void */*p*/, size_t /*sz*/); + +/*----- That's all, folks -------------------------------------------------*/ + +#ifdef __cplusplus + } +#endif + +#endif