chiark - git - mdw - mLib/blob - hash.3

   1 .\" -*-nroff-*-
   2 .de VS
   3 .sp 1
   4 .RS
   5 .nf
   6 .ft B
   7 ..
   8 .de VE
   9 .ft R
  10 .fi
  11 .RE
  12 .sp 1
  13 ..
  14 .de hP
  15 .IP
  16 .ft B
  17 \h'-\w'\\$1\ 'u'\\$1\ \c
  18 .ft P
  19 ..
  20 .ie t .ds o \(bu
  21 .el .ds o o
  22 .TH hash 3 "2 August 1999" "Straylight/Edgeware" "mLib utilities library"
  23 .SH "NAME"
  24 hash \- low-level hashtable implementation
  25 .\" @hash_create
  26 .\" @hash_destroy
  27 .\" @hash_bin
  28 .\" @hash_extend
  29 .\" @hash_remove
  30 .\" @hash_mkiter
  31 .\" @hash_next
  32 .\"
  33 .\" @HASH_BIN
  34 .\" @HASH_MKITER
  35 .\" @HASH_NEXT
  36 .\"
  37 .SH "SYNOPSIS"
  38 .nf
  39 .B "#include <mLib/hash.h>"
  40
  41 .BI "void hash_create(hash_table *" t ", size_t " n );
  42 .BI "void hash_destroy(hash_table *" t );
  43 .BI "hash_base **hash_bin(hash_table *" t ", uint32 " hash );
  44 .BI "int hash_extend(hash_table *" t );
  45 .BI "void hash_remove(hash_table *" t ", hash_base * " b );
  46 .BI "void hash_mkiter(hash_iter *" i ", hash_table *" t );
  47 .BI "hash_base *hash_next(hash_iter *" i );
  48
  49 .BI "hash_base **HASH_BIN(hash_table *" t ", uint32 " hash );
  50 .BI "void HASH_MKITER(hash_iter *" i ", hash_table *" t );
  51 .BI "void HASH_NEXT(hash_iter *" i ", " b );
  52 .fi
  53 .SH "OVERVIEW"
  54 The
  55 .B hash
  56 functions provide the basis for an extensible hashtable implementation.
  57 The implementation is not complete.  Many decisions have been left to
  58 the user, including:
  59 .hP \*o
  60 how keys should be represented, hashed and compared;
  61 .hP \*o
  62 how objects contained within the table should be allocated; and
  63 .hP \*o
  64 when the hashtable should be extended.
  65 .PP
  66 A complete hashtable implementation will need to take the above
  67 decisions.  If you just want a prepackaged solution, see
  68 .BR sym (3)
  69 which provides one.
  70 .SH "IMPLEMENTATION DETAILS"
  71 Each item in the hashtable is assigned a 32-bit integer
  72 .IR hash :
  73 a number computed somehow from the item's data such that two items which
  74 are considered equal will yield the same hash.  Ideally, different items
  75 will yield different hashes.  It is important for this implementation
  76 that all bits of the hash are similarly good.
  77 .PP
  78 In order to look up an item, the high bits of the hash are masked off
  79 and the remainder used as an index into a vector of
  80 .IR "bin lists" .
  81 Each bin contains a list of items with identical low-order bits of their
  82 hashes.
  83 .PP
  84 A table expansion involves doubling the size of the bin vector.  Each
  85 bin list is then split into two, items being placed into a new bin
  86 depending on the setting of the next lowest hash bit.  By expanding the
  87 hashtable as needed, lookups remain constant-time.
  88 .PP
  89 A low-level hashtable is represented by a
  90 .B hash_table
  91 structure.  It contains two members:
  92 .TP
  93 .B "uint32 mask"
  94 The current bitmask to be applied to hashes.  This is one less than the
  95 current number of bins in the hashtable, and is applied to hash values
  96 in order to decide which bin an item should be in.
  97 .TP
  98 .B "hash_base **v"
  99 The bin vector.  It is an array of pointers to hashtable items.
 100 .PP
 101 A hashtable item consists of a
 102 .B hash_base
 103 structure.  A full hashtable implementation will need to extend this
 104 structure by adding keying information and other data; the
 105 .B hash_base
 106 only contains the bare minimum of information needed to maintain the
 107 hashtable at a low level.  It contains the following members:
 108 .TP
 109 .B "hash_base *next"
 110 Pointer to the next item in the bin list.  The final item has a null
 111 .B next
 112 pointer.  The entry in the bin vector is null if the bin list is empty.
 113 It is up to the high-level implementation to insert items into the list.
 114 .TP
 115 .B "uint32 hash"
 116 The hash for this item.  This must be the full 32-bit hash for the
 117 current item.  It is used during hashtable expansion to determine which
 118 bin an item should be moved to.
 119 .SH "FUNCTIONALITY PROVIDED"
 120 This section describes the functions and macros provided for building
 121 hashtables.  Code examples are given throughout.  They assume the
 122 following definitions:
 123 .VS
 124 /* --- A table of items --- */
 125
 126 typedef struct item_table {
 127   hash_table t;
 128   size_t load;
 129 };
 130
 131 /* --- An item --- */
 132
 133 typedef struct item {
 134   hash_base b;
 135   const char *k;
 136   /* ... */
 137 } item;
 138 .VE
 139 The implementation presented here is simple but relatively bad.  The
 140 source file
 141 .B sym.c
 142 presents a more realistic example, but is rather more complex.
 143 .SS "Initialization and finalization"
 144 An empty hashtable is initialized by calling
 145 .B hash_create
 146 with the address of a
 147 .B hash_table
 148 structure to be filled in and the initial number of hash bins to create.
 149 .PP
 150 For example, an item table might be initialized like this:
 151 .VS
 152 void item_createtab(item_table *t)
 153 {
 154   hash_create(&t->t, ITEM_INITSZ);
 155   t->load = ITEM_INITLOAD;
 156 }
 157 .VE
 158 A hashtable can be destroyed by calling
 159 .B hash_destroy
 160 with the address of the
 161 .B hash_table
 162 structure.  This does not attempt to deallocate the individual items;
 163 that must be done beforehand.
 164 .PP
 165 The usual way to deallocate the individual hashtable items is using the
 166 iteration constructs described below.
 167 .VS
 168 void item_destroytab(item_table *t)
 169 {
 170   hash_iter i;
 171   hash_base *b;
 172   for (hash_mkiter(&i, &t->t); (b = hash_next(&i)) != 0; ) {
 173     item *ii = (item *)b;
 174     free(ii->k);
 175     /* ... */
 176     DESTROY(ii);
 177   }
 178   hash_destroy(&t->t);
 179 }
 180 .VE
 181 .sp -1
 182 .SS "Searching, adding and removing"
 183 Items must be searched for and added by hand.
 184 .PP
 185 The macro
 186 .B HASH_BIN
 187 returns the address of the bin list haed for a particular hashtable and
 188 hash value.  The function
 189 .B hash_bin
 190 works the same way and provides the same result, but since the macro is
 191 very simple its use is preferred.  However, it will evaluate its
 192 arguments multiple times.
 193 .PP
 194 Once the bin list has been found, it's fairly easy to search for an
 195 exact match.  A simple search might look something like this:
 196 .VS
 197 item *lookup(item_table *t, const char *k)
 198 {
 199   uint32 h = hash(k);           /* Hash @k@ somehow */
 200   hash_base **bin = HASH_BIN(&t->t, h);
 201   hash_base *b;
 202   for (b = *bin; b; b = b->next) {
 203     item *i = (item *)b;
 204     if (h == i->b.hash && strcmp(k, i->k) == 0)
 205       return (i);
 206   }
 207   return (0);
 208 }
 209 .VE
 210 Insertion is also relatively trivial given the bin list head.  Insertion
 211 may make the hashtable too large, so it might be necessary to extend
 212 it.  Extension is performed by
 213 .BR hash_extend ,
 214 which is passed only the address of the hashtable.  It returns nonzero
 215 if extension was successful.
 216 .VS
 217 item *add(item_table *t, const char *k, /* ... */)
 218 {
 219   item *i;
 220   uint32 h;
 221   hash_base **bin;
 222
 223   /* --- See if the item is already there --- */
 224
 225   if ((i =  = lookup(t, k)) != 0)
 226     return (i);
 227
 228   /* --- Make a new hashtable item --- */
 229
 230   i = CREATE(item);
 231   i->k = xstrdup(k);
 232   /* ... */
 233
 234   /* --- Link it into the bin list --- */
 235
 236   h = i->b.hash = hash(k);
 237   bin = HASH_BIN(&t->t, h);
 238   i->b.next = *bin;
 239   *bin = &i->b.next;
 240
 241   /* --- Maybe extend the hashtable --- */
 242
 243   if (t->load)
 244     t->load--;
 245   else if (hash_extend(&t->t))
 246     t->load = recalc_load(t);
 247
 248   /* --- Done --- */
 249
 250   return (i);
 251 }
 252 .VE
 253 The
 254 .B sym
 255 implementation is rather more sophisticated in its approach but the idea
 256 is the same.  In particular,
 257 .B sym
 258 provides a single interface for lookup and insertion which prevents the
 259 unnecessary rehashing performed by the above code.
 260 .PP
 261 Removal of items is more straightforward.  The function
 262 .B hash_remove
 263 will unlink a given item from its bin list, after which point it is safe
 264 to remove.
 265 .SS "Iteration"
 266 Iteration allows code to be performed on all the items in a hashtable.
 267 This is done using an
 268 .I iterator
 269 object, represented by a
 270 .B hash_iter
 271 structure.  An iterator is initialized by calling
 272 .BR hash_mkiter .
 273 Each call to
 274 .B hash_next
 275 yields a different item from the hashtable until there are none left, a
 276 condition signified by a null return value.
 277 .PP
 278 The macros
 279 .B HASH_MKITER
 280 and
 281 .B HASH_NEXT
 282 do the same jobs as the above functions.  However,
 283 .B HASH_NEXT
 284 has a slightly bizarre argument passing convention: its second argument
 285 is an
 286 .I lvalue
 287 which is updated to contain the address of the next item.
 288 .PP
 289 The finalization code above contained an example of iteration.
 290 .SH "SEE ALSO"
 291 .BR sym (3),
 292 .BR mLib (3).
 293 .SH "AUTHOR"
 294 Mark Wooding, <mdw@distorted.org.uk>