lib/hashtab.c

   1 /*  $Id: hashtab.c 6135 2003-01-19 01:15:40Z rra $
   2 **
   3 **  Generic hash table implementation.
   4 **
   5 **  Written by Russ Allbery <rra@stanford.edu>
   6 **  This work is hereby placed in the public domain by its author.
   7 **
   8 **  This is a generic hash table implementation with linear probing.  It
   9 **  takes a comparison function and a hashing function and stores void *.
  10 **
  11 **  Included for the use of callers is the hash function LOOKUP2 by Bob
  12 **  Jenkins, taken from <http://burtleburtle.net/bob/hash/>; see that web
  13 **  page for analysis and performance comparisons.  The performance of this
  14 **  hash is slightly worse than the standard sum and modulus hash function
  15 **  seen in many places but it produces fewer collisions.
  16 */
  17
  18 #include "config.h"
  19 #include "clibrary.h"
  20 #include "inn/hashtab.h"
  21 #include "libinn.h"
  22
  23 /* Magic values for empty and deleted hash table slots. */
  24 #define HASH_EMPTY      ((void *) 0)
  25 #define HASH_DELETED    ((void *) 1)
  26
  27 struct hash {
  28     size_t size;                /* Allocated size. */
  29     size_t mask;                /* Used to resolve a hash to an index. */
  30     size_t nelements;           /* Total elements, including deleted. */
  31     size_t ndeleted;            /* Number of deleted elements. */
  32
  33     unsigned long searches;     /* Count of lookups (for debugging). */
  34     unsigned long collisions;   /* Count of collisions (for debugging). */
  35     unsigned long expansions;   /* Count of hash resizes needed. */
  36
  37     hash_func hash;             /* Return hash of a key. */
  38     hash_key_func key;          /* Given an element, returns its key. */
  39     hash_equal_func equal;      /* Whether a key matches an element. */
  40     hash_delete_func delete;    /* Called when a hash element is deleted. */
  41
  42     void **table;               /* The actual elements. */
  43 };
  44
  45
  46 /*
  47 **  Given a target table size, return the nearest power of two that's
  48 **  greater than or equal to that size, with a minimum size of four.  The
  49 **  minimum must be at least four to ensure that there is always at least
  50 **  one empty slot in the table given hash_find_slot's resizing of the table
  51 **  if it as least 75% full.  Otherwise, it would be possible for
  52 **  hash_find_slot to go into an infinite loop.
  53 */
  54 static size_t
  55 hash_size(size_t target)
  56 {
  57     int n;
  58     size_t size;
  59
  60     size = target - 1;
  61     for (n = 0; size > 0; n++)
  62         size >>= 1;
  63     size = 1 << n;
  64     return (size < 4) ? 4 : size;
  65 }
  66
  67
  68 /*
  69 **  Create a new hash table.  The given size is rounded up to the nearest
  70 **  power of two for speed reasons (it greatly simplifies the use of the
  71 **  hash function).
  72 */
  73 struct hash *
  74 hash_create(size_t size, hash_func hash_f, hash_key_func key_f,
  75             hash_equal_func equal_f, hash_delete_func delete_f)
  76 {
  77     struct hash *hash;
  78
  79     hash = xcalloc(1, sizeof(struct hash));
  80     hash->hash = hash_f;
  81     hash->key = key_f;
  82     hash->equal = equal_f;
  83     hash->delete = delete_f;
  84     hash->size = hash_size(size);
  85     hash->mask = hash->size - 1;
  86     hash->table = xcalloc(hash->size, sizeof(void *));
  87     return hash;
  88 }
  89
  90
  91 /*
  92 **  Free a hash and all resources used by it, and call the delete function
  93 **  on every element.
  94 */
  95 void
  96 hash_free(struct hash *hash)
  97 {
  98     size_t i;
  99     void *entry;
 100
 101     for (i = 0; i < hash->size; i++) {
 102         entry = hash->table[i];
 103         if (entry != HASH_EMPTY && entry != HASH_DELETED)
 104             (*hash->delete)(entry);
 105     }
 106     free(hash->table);
 107     free(hash);
 108 }
 109
 110
 111 /*
 112 **  Internal search function used by hash_expand.  This is an optimized
 113 **  version of hash_find_slot that returns a pointer to the first empty
 114 **  slot, not trying to call the equality function on non-empty slots and
 115 **  assuming there are no HASH_DELETED slots.
 116 */
 117 static void **
 118 hash_find_empty(struct hash *hash, const void *key)
 119 {
 120     size_t slot;
 121
 122     slot = (*hash->hash)(key) & hash->mask;
 123     while (1) {
 124         if (hash->table[slot] == HASH_EMPTY)
 125             return &hash->table[slot];
 126
 127         slot++;
 128         if (slot >= hash->size)
 129             slot -= hash->size;
 130     }
 131 }
 132
 133
 134 /*
 135 **  Expand the hash table to be approximately 50% empty based on the number
 136 **  of elements in the hash.  This is done by allocating a new table and
 137 **  then calling hash_find_empty for each element in the previous table,
 138 **  recovering the key by calling hash->key on the element.
 139 */
 140 static void
 141 hash_expand(struct hash *hash)
 142 {
 143     void **old, **slot;
 144     size_t i, size;
 145
 146     old = hash->table;
 147     size = hash->size;
 148     hash->size = hash_size((hash->nelements - hash->ndeleted) * 2);
 149     hash->mask = hash->size - 1;
 150     hash->table = xcalloc(hash->size, sizeof(void *));
 151
 152     hash->nelements = 0;
 153     hash->ndeleted = 0;
 154     for (i = 0; i < size; i++)
 155         if (old[i] != HASH_EMPTY && old[i] != HASH_DELETED) {
 156             slot = hash_find_empty(hash, (*hash->key)(old[i]));
 157             *slot = old[i];
 158             hash->nelements++;
 159         }
 160
 161     hash->expansions++;
 162     free(old);
 163 }
 164
 165
 166 /*
 167 **  Find a slot in the hash for a given key.  This is used both for
 168 **  inserting and deleting elements from the hash, as well as looking up
 169 **  entries.  Returns a pointer to the slot.  If insert is true, return the
 170 **  first empty or deleted slot.  If insert is false, return NULL if the
 171 **  element could not be found.
 172 **
 173 **  This function assumes that there is at least one empty slot in the
 174 **  hash; otherwise, it can loop infinitely.  It attempts to ensure this by
 175 **  always expanding the hash if it is at least 75% full; this will ensure
 176 **  that property for any hash size of 4 or higher.
 177 */
 178 static void **
 179 hash_find_slot(struct hash *hash, const void *key, bool insert)
 180 {
 181     void **deleted_slot = NULL;
 182     void *entry;
 183     size_t slot;
 184
 185     if (insert && hash->nelements * 4 >= hash->size * 3)
 186         hash_expand(hash);
 187
 188     hash->searches++;
 189
 190     slot = (*hash->hash)(key) & hash->mask;
 191     while (1) {
 192         entry = hash->table[slot];
 193         if (entry == HASH_EMPTY) {
 194             if (!insert)
 195                 return NULL;
 196
 197             if (deleted_slot != NULL) {
 198                 *deleted_slot = HASH_EMPTY;
 199                 hash->ndeleted--;
 200                 return deleted_slot;
 201             }
 202             hash->nelements++;
 203             return &hash->table[slot];
 204         } else if (entry == HASH_DELETED) {
 205             if (insert)
 206                 deleted_slot = &hash->table[slot];
 207         } else if ((*hash->equal)(key, entry)) {
 208             return &hash->table[slot];
 209         }
 210
 211         hash->collisions++;
 212         slot++;
 213         if (slot >= hash->size)
 214             slot -= hash->size;
 215     }
 216 }
 217
 218
 219 /*
 220 **  Given a key, return the entry corresponding to that key or NULL if that
 221 **  key isn't present in the hash table.
 222 */
 223 void *
 224 hash_lookup(struct hash *hash, const void *key)
 225 {
 226     void **slot;
 227
 228     slot = hash_find_slot(hash, key, false);
 229     return (slot == NULL) ? NULL : *slot;
 230 }
 231
 232
 233 /*
 234 **  Insert a new key/value pair into the hash, returning true if the
 235 **  insertion was successful and false if there is already a value in the
 236 **  hash with that key.
 237 */
 238 bool
 239 hash_insert(struct hash *hash, const void *key, void *datum)
 240 {
 241     void **slot;
 242
 243     slot = hash_find_slot(hash, key, true);
 244     if (*slot != HASH_EMPTY)
 245         return false;
 246     *slot = datum;
 247     return true;
 248 }
 249
 250
 251 /*
 252 **  Replace an existing hash value with a new data value, calling the delete
 253 **  function for the old data.  Returns true if the replacement was
 254 **  successful or false (without changing the hash) if the key whose value
 255 **  should be replaced was not found in the hash.
 256 */
 257 bool
 258 hash_replace(struct hash *hash, const void *key, void *datum)
 259 {
 260     void **slot;
 261
 262     slot = hash_find_slot(hash, key, false);
 263     if (slot == NULL)
 264         return false;
 265     (*hash->delete)(*slot);
 266     *slot = datum;
 267     return true;
 268 }
 269
 270
 271 /*
 272 **  Delete a key out of the hash.  Returns true if the deletion was
 273 **  successful, false if the key could not be found in the hash.
 274 */
 275 bool
 276 hash_delete(struct hash *hash, const void *key)
 277 {
 278     bool result;
 279
 280     result = hash_replace(hash, key, HASH_DELETED);
 281     if (result)
 282         hash->ndeleted++;
 283     return result;
 284 }
 285
 286
 287 /*
 288 **  For each element in the hash table, call the provided function, passing
 289 **  it the element and the opaque token that's passed to this function.
 290 */
 291 void
 292 hash_traverse(struct hash *hash, hash_traverse_func callback, void *data)
 293 {
 294     size_t i;
 295     void *entry;
 296
 297     for (i = 0; i < hash->size; i++) {
 298         entry = hash->table[i];
 299         if (entry != HASH_EMPTY && entry != HASH_DELETED)
 300             (*callback)(entry, data);
 301     }
 302 }
 303
 304
 305 /*
 306 **  Returns a count of undeleted elements in the hash.
 307 */
 308 unsigned long
 309 hash_count(struct hash *hash)
 310 {
 311     return hash->nelements - hash->ndeleted;
 312 }
 313
 314
 315 /*
 316 **  Accessor functions for the debugging statistics.
 317 */
 318 unsigned long
 319 hash_searches(struct hash *hash)
 320 {
 321     return hash->searches;
 322 }
 323
 324 unsigned long
 325 hash_collisions(struct hash *hash)
 326 {
 327     return hash->collisions;
 328 }
 329
 330 unsigned long
 331 hash_expansions(struct hash *hash)
 332 {
 333     return hash->expansions;
 334 }
 335
 336
 337 /*
 338 **  Mix three 32-bit values reversibly.  This is the internal mixing
 339 **  function for the hash function.
 340 **
 341 **  For every delta with one or two bit set, and the deltas of all three
 342 **  high bits or all three low bits, whether the original value of a,b,c
 343 **  is almost all zero or is uniformly distributed,
 344 **
 345 **   * If mix() is run forward or backward, at least 32 bits in a,b,c
 346 **     have at least 1/4 probability of changing.
 347 **
 348 **   * If mix() is run forward, every bit of c will change between 1/3 and
 349 **     2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
 350 **
 351 **  mix() takes 36 machine instructions, but only 18 cycles on a superscalar
 352 **  machine (like a Pentium or a Sparc).  No faster mixer seems to work,
 353 **  that's the result of my brute-force search.  There were about 2^68
 354 **  hashes to choose from.  I (Bob Jenkins) only tested about a billion of
 355 **  those.
 356 */
 357 #define MIX(a, b, c)                                    \
 358     {                                                   \
 359         (a) -= (b); (a) -= (c); (a) ^= ((c) >> 13);     \
 360         (b) -= (c); (b) -= (a); (b) ^= ((a) << 8);      \
 361         (c) -= (a); (c) -= (b); (c) ^= ((b) >> 13);     \
 362         (a) -= (b); (a) -= (c); (a) ^= ((c) >> 12);     \
 363         (b) -= (c); (b) -= (a); (b) ^= ((a) << 16);     \
 364         (c) -= (a); (c) -= (b); (c) ^= ((b) >> 5);      \
 365         (a) -= (b); (a) -= (c); (a) ^= ((c) >> 3);      \
 366         (b) -= (c); (b) -= (a); (b) ^= ((a) << 10);     \
 367         (c) -= (a); (c) -= (b); (c) ^= ((b) >> 15);     \
 368     }
 369
 370
 371 /*
 372 **  Hash a variable-length key into a 32-bit value.
 373 **
 374 **  Takes byte sequence to hash and returns a 32-bit value.  A partial
 375 **  result can be passed as the third parameter so that large amounts of
 376 **  data can be hashed by subsequent calls, passing in the result of the
 377 **  previous call each time.  Every bit of the key affects every bit of the
 378 **  return value.  Every 1-bit and 2-bit delta achieves avalanche.  About
 379 **  (36 + 6n) instructions.
 380 **
 381 **  The best hash table sizes are powers of 2.  There is no need to mod with
 382 **  a prime (mod is sooo slow!).  If you need less than 32 bits, use a
 383 **  bitmask.  For example, if you need only 10 bits, do:
 384 **
 385 **      h = h & ((1 << 10) - 1);
 386 **
 387 **  In which case, the hash table should have 2^10 elements.
 388 **
 389 **  Based on code by Bob Jenkins <bob_jenkins@burtleburtle.net>, originally
 390 **  written in 1996.  The original license was:
 391 **
 392 **      By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use
 393 **      this code any way you wish, private, educational, or commercial.
 394 **      It's free.
 395 **
 396 **  See <http://burlteburtle.net/bob/hash/evahash.html> for discussion of
 397 **  this hash function.  Use for hash table lookup, or anything where one
 398 **  collision in 2^32 is acceptable.  Do NOT use for cryptographic purposes.
 399 */
 400 unsigned long
 401 hash_lookup2(const char *key, size_t length, unsigned long partial)
 402 {
 403     uint32_t a, b, c, len;
 404
 405     /* Set up the internal state.  a and b are initialized to a golden
 406        ratio, an arbitrary value intended to avoid mapping all zeroes to all
 407        zeroes. */
 408     len = length;
 409     a = b = 0x9e3779b9;
 410     c = partial;
 411
 412 #define S0(c)   ((uint32_t)(c))
 413 #define S1(c)   ((uint32_t)(c) << 8)
 414 #define S2(c)   ((uint32_t)(c) << 16)
 415 #define S3(c)   ((uint32_t)(c) << 24)
 416
 417     /* Handle most of the key. */
 418     while (len >= 12) {
 419         a += S0(key[0]) + S1(key[1]) + S2(key[2])  + S3(key[3]);
 420         b += S0(key[4]) + S1(key[5]) + S2(key[6])  + S3(key[7]);
 421         c += S0(key[8]) + S1(key[9]) + S2(key[10]) + S3(key[11]);
 422         MIX(a, b, c);
 423         key += 12;
 424         len -= 12;
 425    }
 426
 427     /* Handle the last 11 bytes.  All of the cases fall through. */
 428     c += length;
 429     switch (len) {
 430     case 11: c += S3(key[10]);
 431     case 10: c += S2(key[9]);
 432     case  9: c += S1(key[8]);
 433         /* The first byte of c is reserved for the length. */
 434     case  8: b += S3(key[7]);
 435     case  7: b += S2(key[6]);
 436     case  6: b += S1(key[5]);
 437     case  5: b += S0(key[4]);
 438     case  4: a += S3(key[3]);
 439     case  3: a += S2(key[2]);
 440     case  2: a += S1(key[1]);
 441     case  1: a += S0(key[0]);
 442         /* case 0: nothing left to add. */
 443     }
 444     MIX(a, b, c);
 445     return c;
 446 }
 447
 448
 449 /*
 450 **  A hash function for nul-terminated strings using hash_lookup2, suitable
 451 **  for passing to hash_create.
 452 */
 453 unsigned long
 454 hash_string(const void *key)
 455 {
 456     return hash_lookup2(key, strlen(key), 0);
 457 }