chiark / gitweb /
7ea30b10da24c2c2df977d3c17d955286acbdb8b
1 /* -*-c-*-
2  *
3  * Simple and efficient universal hashing for hashtables
4  *
5  * (c) 2003 Straylight/Edgeware
6  */
8 /*----- Licensing notice --------------------------------------------------*
9  *
10  * This file is part of the mLib utilities library.
11  *
12  * mLib is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Library General Public License as
14  * published by the Free Software Foundation; either version 2 of the
15  * License, or (at your option) any later version.
16  *
17  * mLib is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU Library General Public License for more details.
21  *
22  * You should have received a copy of the GNU Library General Public
23  * License along with mLib; if not, write to the Free
24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25  * MA 02111-1307, USA.
26  */
28 #ifndef MLIB_UNIHASH_H
29 #define MLIB_UNIHASH_H
31 #ifdef __cplusplus
32   extern "C" {
33 #endif
36 /*----- Concept -----------------------------------------------------------*
37  *
38  * Let %$\gf{q}$% be a finite field.  Choose an arbitrary %$k \inr \gf{q}$%.
39  * Let %$M$% be a message.  Injectively pad %$M$% and split it into blocks
40  * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%. 41 * Then we compute 42 * 43 * %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$% 44 * 45 * Note that %$H_0(M) = 0$% for all messages %$M$%. 46 * 47 * If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$% 48 * is %$(\ell + 1)/q$%-almost universal. Moreover, if %$q = 2^f$% then 49 * %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal. 50 * 51 * Proof. Let %$A$% and %$B$% be two messages, represented by 52 * %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and 53 * choose any %$\delta \in \gf{q}$%. We must bound the probability that 54 * 55 * %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$% 56 * %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%. 57 * 58 * Firstly, we claim that if %$A$% and %$B$% are distinct, there is some 59 * nonzero coefficient of %$k$%. For if %$n \ne m$% then, without loss of 60 * generality, let %$n > m$%, and hence the coefficient of %$k_n$% is 61 * nonzero. Alternatively, if %$n = m$% then there must be some 62 * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the 63 * messages would be identical; but then the coefficient of %$k^{i+1}$% is 64 * %$a_i - b_i \ne 0$%. 65 * 66 * Hence we have a polynomial equation with degree at most %$\ell + 1$%; 67 * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose 68 * %$k$% at random from a set of %$q$%; so the equation is true with 69 * probability at most %$(\ell + 1)/q$%. 70 * 71 * This function can be used as a simple MAC with provable security against 72 * computationally unbounded adversaries. Simply XOR the hash with a random 73 * string indexed from a large random pad by some nonce sent with the 74 * message. The probability of a forgery attempt being successful is then 75 * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the 76 * longest message permitted. 77 */ 79 /*----- Practicalities ----------------------------------------------------* 80 * 81 * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo 82 * %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial). Our 83 * blocks are bytes. 84 * 85 * The choice of a 32-bit hash is made for pragmatic reasons: we're never 86 * likely to actually want all 32 bits for a real hashtable anyway. The 87 * truncation result is needed to keep us afloat with smaller tables. 88 * 89 * We compute hashes using a slightly unrolled version of Horner's rule, 90 * using the recurrence: 91 * 92 * %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$% 93 * 94 * which involves one full-width multiply and %$b - 1$% one-byte multiplies; 95 * the latter may be efficiently computed using a table lookup. Start with 96 * %$a_0 = k$%. 97 * 98 * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where 99 * 100 * %$S[u][v][w] = k^{u+1} x^{8v} w$% 101 * for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%. 102 * 103 * A one-byte multiply is one lookup; a full-width multiply is four lookups 104 * and three XORs. The processing required is then %$b + 3$% lookups and 105 * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at 106 * the expense of %$4 b$% kilobytes of tables. This compares relatively 107 * favorably with CRC32. Indeed, in tests, this implementation with$b = 4$% 108 * is faster than a 32-bit CRC. 109 */ 111 /*----- Header files ------------------------------------------------------*/ 113 #include <stddef.h> 115 #ifndef MLIB_BITS_H 116 # include "bits.h" 117 #endif 119 /*----- Data structures ---------------------------------------------------*/ 121 #define UNIHASH_NBATCH 4 122 #define UNIHASH_POLY 0x04c11db7 /* From CRC32 */ 124 typedef struct unihash_info { 125 uint32 s[UNIHASH_NBATCH]; /* S-tables as described */ 126 } unihash_info; 128 /*----- A global hash-info table ------------------------------------------*/ 130 extern unihash_info unihash_global; /* Key this if you like */ 132 /*----- Functions provided ------------------------------------------------*/ 134 /* --- @unihash_setkey@ --- * 135 * 136 * Arguments: @unihash_info *i@ = where to store the precomputed tables 137 * @uint32 k@ = the key to set, randomly chosen 138 * 139 * Returns: --- 140 * 141 * Use: Calculates the tables required for efficient hashing. 142 */ 144 extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/); 146 /* --- @unihash_hash@ --- * 147 * 148 * Arguments: @const unihash_info *i@ = pointer to precomputed table 149 * @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call 150 * @const void *p@ = pointer to data to hash 151 * @size_t sz@ = size of the data 152 * 153 * Returns: Hash of data so far. 154 * 155 * Use: Hashes data. Call this as many times as needed. 156 */ 158 #define UNIHASH_INIT(i) ((i)->s) /* %$k\$% */
160 extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/,
161                            const void */*p*/, size_t /*sz*/);
163 /* --- @unihash@ --- *
164  *
165  * Arguments:   @const unihash_info *i@ = precomputed tables
166  *              @const void *p@ = pointer to data to hash
167  *              @size_t sz@ = size of the data
168  *
169  * Returns:     The hash value computed.
170  *
171  * Use:         All-in-one hashing function.  No faster than using the
172  *              separate calls, but more convenient.
173  */
175 #define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
177 extern uint32 unihash(const unihash_info */*i*/,
178                       const void */*p*/, size_t /*sz*/);
180 /*----- That's all, folks -------------------------------------------------*/
182 #ifdef __cplusplus
183   }
184 #endif
186 #endif