chiark - git - mdw - mLib/blob - struct/hash.3

   1 .\" -*-nroff-*-
   2 .de VS
   3 .sp 1
   4 .RS
   5 .nf
   6 .ft B
   7 ..
   8 .de VE
   9 .ft R
  10 .fi
  11 .RE
  12 .sp 1
  13 ..
  14 .de hP
  15 .IP
  16 .ft B
  17 \h'-\w'\\$1\ 'u'\\$1\ \c
  18 .ft P
  19 ..
  20 .ie t .ds o \(bu
  21 .el .ds o o
  22 .TH hash 3 "2 August 1999" "Straylight/Edgeware" "mLib utilities library"
  23 .SH "NAME"
  24 hash \- low-level hashtable implementation
  25 .\" @hash_create
  26 .\" @hash_destroy
  27 .\" @hash_bin
  28 .\" @hash_extend
  29 .\" @hash_remove
  30 .\" @hash_mkiter
  31 .\" @hash_next
  32 .\"
  33 .\" @HASH_BIN
  34 .\" @HASH_MKITER
  35 .\" @HASH_NEXT
  36 .\"
  37 .SH "SYNOPSIS"
  38 .nf
  39 .B "#include <mLib/hash.h>"
  40
  41 .ta 2n
  42 .B "typedef struct {"
  43 .B "    uint32 mask;"
  44 .B "    hash_base **v;"
  45 .B "    arena *a;"
  46 .B "} hash_table;"
  47
  48 .B "typedef struct {"
  49 .B "    hash_base *next;"
  50 .B "    uint32 hash;"
  51 .B "} hash_base;"
  52
  53 .B "typedef struct { ...\& } hash_iter;"
  54
  55 .BI "void hash_create(hash_table *" t ", size_t " n );
  56 .BI "void hash_destroy(hash_table *" t );
  57 .BI "hash_base **hash_bin(hash_table *" t ", uint32 " hash );
  58 .BI "int hash_extend(hash_table *" t );
  59 .BI "void hash_remove(hash_table *" t ", hash_base *" b );
  60 .BI "void hash_mkiter(hash_iter *" i ", hash_table *" t );
  61 .BI "hash_base *hash_next(hash_iter *" i );
  62
  63 .BI "hash_base **HASH_BIN(hash_table *" t ", uint32 " hash );
  64 .BI "void HASH_MKITER(hash_iter *" i ", hash_table *" t );
  65 .BI "void HASH_NEXT(hash_iter *" i ", " b );
  66 .fi
  67 .SH "OVERVIEW"
  68 The
  69 .B hash
  70 functions provide the basis for an extensible hashtable implementation.
  71 The implementation is not complete.  Many decisions have been left to
  72 the user, including:
  73 .hP \*o
  74 how keys should be represented, hashed and compared;
  75 .hP \*o
  76 how objects contained within the table should be allocated; and
  77 .hP \*o
  78 when the hashtable should be extended.
  79 .PP
  80 A complete hashtable implementation will need to take the above
  81 decisions.  If you just want a prepackaged solution, see
  82 .BR sym (3)
  83 which provides one.
  84 .SH "IMPLEMENTATION DETAILS"
  85 Each item in the hashtable is assigned a 32-bit integer
  86 .IR hash :
  87 a number computed somehow from the item's data such that two items which
  88 are considered equal will yield the same hash.  Ideally, different items
  89 will yield different hashes.  It is important for this implementation
  90 that all bits of the hash are similarly good.
  91 .PP
  92 In order to look up an item, the high bits of the hash are masked off
  93 and the remainder used as an index into a vector of
  94 .IR "bin lists" .
  95 Each bin contains a list of items with identical low-order bits of their
  96 hashes.
  97 .PP
  98 A table expansion involves doubling the size of the bin vector.  Each
  99 bin list is then split into two, items being placed into a new bin
 100 depending on the setting of the next lowest hash bit.  By expanding the
 101 hashtable as needed, lookups remain constant-time.
 102 .PP
 103 A low-level hashtable is represented by a
 104 .B hash_table
 105 structure.  It contains two members:
 106 .TP
 107 .B "uint32 mask"
 108 The current bitmask to be applied to hashes.  This is one less than the
 109 current number of bins in the hashtable, and is applied to hash values
 110 in order to decide which bin an item should be in.
 111 .TP
 112 .B "hash_base **v"
 113 The bin vector.  It is an array of pointers to hashtable items.
 114 .PP
 115 A hashtable item consists of a
 116 .B hash_base
 117 structure.  A full hashtable implementation will need to extend this
 118 structure by adding keying information and other data; the
 119 .B hash_base
 120 only contains the bare minimum of information needed to maintain the
 121 hashtable at a low level.  It contains the following members:
 122 .TP
 123 .B "hash_base *next"
 124 Pointer to the next item in the bin list.  The final item has a null
 125 .B next
 126 pointer.  The entry in the bin vector is null if the bin list is empty.
 127 It is up to the high-level implementation to insert items into the list.
 128 .TP
 129 .B "uint32 hash"
 130 The hash for this item.  This must be the full 32-bit hash for the
 131 current item.  It is used during hashtable expansion to determine which
 132 bin an item should be moved to.
 133 .SH "FUNCTIONALITY PROVIDED"
 134 This section describes the functions and macros provided for building
 135 hashtables.  Code examples are given throughout.  They assume the
 136 following definitions:
 137 .VS
 138 /* --- A table of items --- */
 139
 140 typedef struct item_table {
 141   hash_table t;
 142   size_t load;
 143 };
 144
 145 /* --- An item --- */
 146
 147 typedef struct item {
 148   hash_base b;
 149   const char *k;
 150   /* ... */
 151 } item;
 152 .VE
 153 The implementation presented here is simple but relatively bad.  The
 154 source file
 155 .B sym.c
 156 presents a more realistic example, but is rather more complex.
 157 .SS "Initialization and finalization"
 158 An empty hashtable is initialized by calling
 159 .B hash_create
 160 with the address of a
 161 .B hash_table
 162 structure to be filled in and the initial number of hash bins to create.
 163 .PP
 164 For example, an item table might be initialized like this:
 165 .VS
 166 void item_createtab(item_table *t)
 167 {
 168   hash_create(&t->t, ITEM_INITSZ);
 169   t->load = ITEM_INITLOAD;
 170 }
 171 .VE
 172 A hashtable can be destroyed by calling
 173 .B hash_destroy
 174 with the address of the
 175 .B hash_table
 176 structure.  This does not attempt to deallocate the individual items;
 177 that must be done beforehand.
 178 .PP
 179 The usual way to deallocate the individual hashtable items is using the
 180 iteration constructs described below.
 181 .VS
 182 void item_destroytab(item_table *t)
 183 {
 184   hash_iter i;
 185   hash_base *b;
 186   for (hash_mkiter(&i, &t->t); (b = hash_next(&i)) != 0; ) {
 187     item *ii = (item *)b;
 188     free(ii->k);
 189     /* ... */
 190     DESTROY(ii);
 191   }
 192   hash_destroy(&t->t);
 193 }
 194 .VE
 195 .sp -1
 196 .SS "Searching, adding and removing"
 197 Items must be searched for and added by hand.
 198 .PP
 199 The macro
 200 .B HASH_BIN
 201 returns the address of the bin list haed for a particular hashtable and
 202 hash value.  The function
 203 .B hash_bin
 204 works the same way and provides the same result, but since the macro is
 205 very simple its use is preferred.  However, it will evaluate its
 206 arguments multiple times.
 207 .PP
 208 Once the bin list has been found, it's fairly easy to search for an
 209 exact match.  A simple search might look something like this:
 210 .VS
 211 item *lookup(item_table *t, const char *k)
 212 {
 213   uint32 h = hash(k);           /* Hash @k@ somehow */
 214   hash_base **bin = HASH_BIN(&t->t, h);
 215   hash_base *b;
 216   for (b = *bin; b; b = b->next) {
 217     item *i = (item *)b;
 218     if (h == i->b.hash && strcmp(k, i->k) == 0)
 219       return (i);
 220   }
 221   return (0);
 222 }
 223 .VE
 224 Insertion is also relatively trivial given the bin list head.  Insertion
 225 may make the hashtable too large, so it might be necessary to extend
 226 it.  Extension is performed by
 227 .BR hash_extend ,
 228 which is passed only the address of the hashtable.  It returns nonzero
 229 if extension was successful.
 230 .VS
 231 item *add(item_table *t, const char *k, /* ... */)
 232 {
 233   item *i;
 234   uint32 h;
 235   hash_base **bin;
 236
 237   /* --- See if the item is already there --- */
 238
 239   if ((i =  = lookup(t, k)) != 0)
 240     return (i);
 241
 242   /* --- Make a new hashtable item --- */
 243
 244   i = CREATE(item);
 245   i->k = xstrdup(k);
 246   /* ... */
 247
 248   /* --- Link it into the bin list --- */
 249
 250   h = i->b.hash = hash(k);
 251   bin = HASH_BIN(&t->t, h);
 252   i->b.next = *bin;
 253   *bin = &i->b.next;
 254
 255   /* --- Maybe extend the hashtable --- */
 256
 257   if (t->load)
 258     t->load--;
 259   else if (hash_extend(&t->t))
 260     t->load = recalc_load(t);
 261
 262   /* --- Done --- */
 263
 264   return (i);
 265 }
 266 .VE
 267 The
 268 .B sym
 269 implementation is rather more sophisticated in its approach but the idea
 270 is the same.  In particular,
 271 .B sym
 272 provides a single interface for lookup and insertion which prevents the
 273 unnecessary rehashing performed by the above code.
 274 .PP
 275 Removal of items is more straightforward.  The function
 276 .B hash_remove
 277 will unlink a given item from its bin list, after which point it is safe
 278 to remove.
 279 .SS "Iteration"
 280 Iteration allows code to be performed on all the items in a hashtable.
 281 This is done using an
 282 .I iterator
 283 object, represented by a
 284 .B hash_iter
 285 structure.  An iterator is initialized by calling
 286 .BR hash_mkiter .
 287 Each call to
 288 .B hash_next
 289 yields a different item from the hashtable until there are none left, a
 290 condition signified by a null return value.
 291 .PP
 292 The macros
 293 .B HASH_MKITER
 294 and
 295 .B HASH_NEXT
 296 do the same jobs as the above functions.  However,
 297 .B HASH_NEXT
 298 has a slightly bizarre argument passing convention: its second argument
 299 is an
 300 .I lvalue
 301 which is updated to contain the address of the next item.
 302 .PP
 303 The finalization code above contained an example of iteration.
 304 .SH "SEE ALSO"
 305 .BR sym (3),
 306 .BR mLib (3).
 307 .SH "AUTHOR"
 308 Mark Wooding, <mdw@distorted.org.uk>