chiark - git - mdw - mLib/blob - unihash.h

   1 /* -*-c-*-
   2  *
   3  * $Id: unihash.h,v 1.3 2003/12/15 20:53:47 mdw Exp $
   4  *
   5  * Simple and efficient universal hashing for hashtables
   6  *
   7  * (c) 2003 Straylight/Edgeware
   8  */
   9
  10 /*----- Licensing notice --------------------------------------------------*
  11  *
  12  * This file is part of the mLib utilities library.
  13  *
  14  * mLib is free software; you can redistribute it and/or modify
  15  * it under the terms of the GNU Library General Public License as
  16  * published by the Free Software Foundation; either version 2 of the
  17  * License, or (at your option) any later version.
  18  *
  19  * mLib is distributed in the hope that it will be useful,
  20  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22  * GNU Library General Public License for more details.
  23  *
  24  * You should have received a copy of the GNU Library General Public
  25  * License along with mLib; if not, write to the Free
  26  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  27  * MA 02111-1307, USA.
  28  */
  29
  30 /*----- Revision history --------------------------------------------------*
  31  *
  32  * $Log: unihash.h,v $
  33  * Revision 1.3  2003/12/15 20:53:47  mdw
  34  * Add global unihash table; use universal hashing instead of CRC.
  35  *
  36  * Revision 1.2  2003/12/14 14:45:30  mdw
  37  * Test universal hashing and fix bugs.
  38  *
  39  * Revision 1.1  2003/10/12 14:43:24  mdw
  40  * Universal hashing.
  41  *
  42  */
  43
  44 #ifndef MLIB_UNIHASH_H
  45 #define MLIB_UNIHASH_H
  46
  47 #ifdef __cplusplus
  48   extern "C" {
  49 #endif
  50
  51
  52 /*----- Concept -----------------------------------------------------------*
  53  *
  54  * Let %$\gf{q}$% be a finite field.  Choose an arbitrary %$k \inr \gf{q}$%.
  55  * Let %$M$% be a message.  Injectively pad %$M$% and split it into blocks
  56  * $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
  57  * Then we compute
  58  *
  59  *   %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$%
  60  *
  61  * Note that %$H_0(M) = 0$% for all messages %$M$%.
  62  *
  63  * If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
  64  * is %$(\ell + 1)/q$%-almost universal.  Moreover, if %$q = 2^f$% then
  65  * %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
  66  *
  67  * Proof.  Let %$A$% and %$B$% be two messages, represented by
  68  * %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
  69  * choose any %$\delta \in \gf{q}$%.  We must bound the probability that
  70  *
  71  * %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
  72  *   %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
  73  *
  74  * Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
  75  * nonzero coefficient of %$k$%.  For if %$n \ne m$% then, without loss of
  76  * generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
  77  * nonzero.  Alternatively, if %$n = m$% then there must be some
  78  * %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
  79  * messages would be identical; but then the coefficient of %$k^{i+1}$% is
  80  * %$a_i - b_i \ne 0$%.
  81  *
  82  * Hence we have a polynomial equation with degree at most %$\ell + 1$%;
  83  * there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
  84  * %$k$% at random from a set of %$q$%; so the equation is true with
  85  * probability at most %$(\ell + 1)/q$%.
  86  *
  87  * This function can be used as a simple MAC with provable security against
  88  * computationally unbounded adversaries.  Simply XOR the hash with a random
  89  * string indexed from a large random pad by some nonce sent with the
  90  * message.  The probability of a forgery attempt being successful is then
  91  * %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the
  92  * longest message permitted.
  93  */
  94
  95 /*----- Practicalities ----------------------------------------------------*
  96  *
  97  * We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
  98  * %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial).  Our
  99  * blocks are bytes.
 100  *
 101  * The choice of a 32-bit hash is made for pragmatic reasons: we're never
 102  * likely to actually want all 32 bits for a real hashtable anyway.  The
 103  * truncation result is needed to keep us afloat with smaller tables.
 104  *
 105  * We compute hashes using a slightly unrolled version of Horner's rule,
 106  * using the recurrence:
 107  *
 108  *   %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
 109  *
 110  * which involves one full-width multiply and %$b - 1$% one-byte multiplies;
 111  * the latter may be efficiently computed using a table lookup.  Start with
 112  * %$a_0 = k$%.
 113  *
 114  * We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
 115  *
 116  *   %$S[u][v][w] = k^{u+1} x^{8v} w$%
 117  *     for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
 118  *
 119  * A one-byte multiply is one lookup; a full-width multiply is four lookups
 120  * and three XORs.  The processing required is then %$b + 3$% lookups and
 121  * %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
 122  * the expense of %$4 b$% kilobytes of tables.  This compares relatively
 123  * favorably with CRC32.  Indeed, in tests, this implementation with $b = 4$%
 124  * is faster than a 32-bit CRC.
 125  */
 126
 127 /*----- Header files ------------------------------------------------------*/
 128
 129 #include <stddef.h>
 130
 131 #ifndef MLIB_BITS_H
 132 #  include "bits.h"
 133 #endif
 134
 135 /*----- Data structures ---------------------------------------------------*/
 136
 137 #define UNIHASH_NBATCH 4
 138 #define UNIHASH_POLY 0x04c11db7         /* From CRC32 */
 139
 140 typedef struct unihash_info {
 141   uint32 s[UNIHASH_NBATCH][4][256];     /* S-tables as described */
 142 } unihash_info;
 143
 144 /*----- A global hash-info table ------------------------------------------*/
 145
 146 extern unihash_info unihash_global;     /* Key this if you like */
 147
 148 /*----- Functions provided ------------------------------------------------*/
 149
 150 /* --- @unihash_setkey@ --- *
 151  *
 152  * Arguments:   @unihash_info *i@ = where to store the precomputed tables
 153  *              @uint32 k@ = the key to set, randomly chosen
 154  *
 155  * Returns:     ---
 156  *
 157  * Use:         Calculates the tables required for efficient hashing.
 158  */
 159
 160 extern void unihash_setkey(unihash_info */*i*/, uint32 /*k*/);
 161
 162 /* --- @unihash_hash@ --- *
 163  *
 164  * Arguments:   @const unihash_info *i@ = pointer to precomputed table
 165  *              @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
 166  *              @const void *p@ = pointer to data to hash
 167  *              @size_t sz@ = size of the data
 168  *
 169  * Returns:     Hash of data so far.
 170  *
 171  * Use:         Hashes data.  Call this as many times as needed.
 172  */
 173
 174 #define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */
 175
 176 extern uint32 unihash_hash(const unihash_info */*i*/, uint32 /*a*/,
 177                            const void */*p*/, size_t /*sz*/);
 178
 179 /* --- @unihash@ --- *
 180  *
 181  * Arguments:   @const unihash_info *i@ = precomputed tables
 182  *              @const void *p@ = pointer to data to hash
 183  *              @size_t sz@ = size of the data
 184  *
 185  * Returns:     The hash value computed.
 186  *
 187  * Use:         All-in-one hashing function.  No faster than using the
 188  *              separate calls, but more convenient.
 189  */
 190
 191 #define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
 192
 193 extern uint32 unihash(const unihash_info */*i*/,
 194                       const void */*p*/, size_t /*sz*/);
 195
 196 /*----- That's all, folks -------------------------------------------------*/
 197
 198 #ifdef __cplusplus
 199   }
 200 #endif
 201
 202 #endif