chiark / gitweb /
Add global unihash table; use universal hashing instead of CRC.
[mLib] / unihash.h
CommitLineData
8fe3c82b 1/* -*-c-*-
2 *
6f444bda 3 * $Id: unihash.h,v 1.3 2003/12/15 20:53:47 mdw Exp $
8fe3c82b 4 *
5 * Simple and efficient universal hashing for hashtables
6 *
7 * (c) 2003 Straylight/Edgeware
8 */
9
10/*----- Licensing notice --------------------------------------------------*
11 *
12 * This file is part of the mLib utilities library.
13 *
14 * mLib is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU Library General Public License as
16 * published by the Free Software Foundation; either version 2 of the
17 * License, or (at your option) any later version.
18 *
19 * mLib is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU Library General Public License for more details.
23 *
24 * You should have received a copy of the GNU Library General Public
25 * License along with mLib; if not, write to the Free
26 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
27 * MA 02111-1307, USA.
28 */
29
30/*----- Revision history --------------------------------------------------*
31 *
32 * $Log: unihash.h,v $
6f444bda 33 * Revision 1.3 2003/12/15 20:53:47 mdw
34 * Add global unihash table; use universal hashing instead of CRC.
35 *
573eadb5 36 * Revision 1.2 2003/12/14 14:45:30 mdw
37 * Test universal hashing and fix bugs.
38 *
8fe3c82b 39 * Revision 1.1 2003/10/12 14:43:24 mdw
40 * Universal hashing.
41 *
42 */
43
44#ifndef MLIB_UNIHASH_H
45#define MLIB_UNIHASH_H
46
47#ifdef __cplusplus
48 extern "C" {
49#endif
50
51
52/*----- Concept -----------------------------------------------------------*
53 *
54 * Let %$\gf{q}$% be a finite field. Choose an arbitrary %$k \inr \gf{q}$%.
55 * Let %$M$% be a message. Injectively pad %$M$% and split it into blocks
56 * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
57 * Then we compute
58 *
573eadb5 59 * %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$%
8fe3c82b 60 *
61 * Note that %$H_0(M) = 0$% for all messages %$M$%.
62 *
63 * If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
64 * is %$(\ell + 1)/q$%-almost universal. Moreover, if %$q = 2^f$% then
65 * %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
66 *
67 * Proof. Let %$A$% and %$B$% be two messages, represented by
68 * %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
69 * choose any %$\delta \in \gf{q}$%. We must bound the probability that
70 *
71 * %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
72 * %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
73 *
74 * Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
75 * nonzero coefficient of %$k$%. For if %$n \ne m$% then, without loss of
76 * generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
77 * nonzero. Alternatively, if %$n = m$% then there must be some
78 * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
79 * messages would be identical; but then the coefficient of %$k^{i+1}$% is
80 * %$a_i - b_i \ne 0$%.
81 *
82 * Hence we have a polynomial equation with degree at most %$\ell + 1$%;
83 * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
84 * %$k$% at random from a set of %$q$%; so the equation is true with
85 * probability at most %$(\ell + 1)/q$%.
86 *
87 * This function can be used as a simple MAC with provable security against
88 * computationally unbounded adversaries. Simply XOR the hash with a random
89 * string indexed from a large random pad by some nonce sent with the
90 * message. The probability of a forgery attempt being successful is then
573eadb5 91 * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the
92 * longest message permitted.
8fe3c82b 93 */
94
95/*----- Practicalities ----------------------------------------------------*
96 *
97 * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
573eadb5 98 * %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial). Our
99 * blocks are bytes.
8fe3c82b 100 *
101 * The choice of a 32-bit hash is made for pragmatic reasons: we're never
102 * likely to actually want all 32 bits for a real hashtable anyway. The
103 * truncation result is needed to keep us afloat with smaller tables.
104 *
105 * We compute hashes using a slightly unrolled version of Horner's rule,
106 * using the recurrence:
107 *
108 * %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
109 *
110 * which involves one full-width multiply and %$b - 1$% one-byte multiplies;
111 * the latter may be efficiently computed using a table lookup. Start with
112 * %$a_0 = k$%.
113 *
114 * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
115 *
116 * %$S[u][v][w] = k^{u+1} x^{8v} w$%
117 * for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
118 *
119 * A one-byte multiply is one lookup; a full-width multiply is four lookups
120 * and three XORs. The processing required is then %$b + 3$% lookups and
121 * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
122 * the expense of %$4 b$% kilobytes of tables. This compares relatively
123 * favorably with CRC32. Indeed, in tests, this implementation with $b = 4$%
124 * is faster than a 32-bit CRC.
125 */
126
127/*----- Header files ------------------------------------------------------*/
128
129#include <stddef.h>
130
131#ifndef MLIB_BITS_H
132# include "bits.h"
133#endif
134
135/*----- Data structures ---------------------------------------------------*/
136
137#define UNIHASH_NBATCH 4
138#define UNIHASH_POLY 0x04c11db7 /* From CRC32 */
139
140typedef struct unihash_info {
141 uint32 s[UNIHASH_NBATCH][4][256]; /* S-tables as described */
142} unihash_info;
143
6f444bda 144/*----- A global hash-info table ------------------------------------------*/
145
146extern unihash_info unihash_global; /* Key this if you like */
147
8fe3c82b 148/*----- Functions provided ------------------------------------------------*/
149
150/* --- @unihash_setkey@ --- *
151 *
152 * Arguments: @unihash_info *i@ = where to store the precomputed tables
153 * @uint32 k@ = the key to set, randomly chosen
154 *
155 * Returns: ---
156 *
157 * Use: Calculates the tables required for efficient hashing.
158 */
159
160extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/);
161
162/* --- @unihash_hash@ --- *
163 *
164 * Arguments: @const unihash_info *i@ = pointer to precomputed table
165 * @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
166 * @const void *p@ = pointer to data to hash
167 * @size_t sz@ = size of the data
168 *
573eadb5 169 * Returns: Hash of data so far.
8fe3c82b 170 *
171 * Use: Hashes data. Call this as many times as needed.
172 */
173
174#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */
175
176extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/,
177 const void */*p*/, size_t /*sz*/);
178
179/* --- @unihash@ --- *
180 *
181 * Arguments: @const unihash_info *i@ = precomputed tables
182 * @const void *p@ = pointer to data to hash
183 * @size_t sz@ = size of the data
184 *
185 * Returns: The hash value computed.
186 *
187 * Use: All-in-one hashing function. No faster than using the
188 * separate calls, but more convenient.
189 */
190
191#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
192
193extern uint32 unihash(const unihash_info */*i*/,
194 const void */*p*/, size_t /*sz*/);
195
196/*----- That's all, folks -------------------------------------------------*/
197
198#ifdef __cplusplus
199 }
200#endif
201
202#endif