-%%% -*-cweb-*-
+%%% -*- mode: latex; mmm-classes: cweb -*-
\documentclass[baseclass = strayman]{cweb}
+\usepackage{enumerate}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[palatino, helvetica, courier, maths = cmr]{mdwfonts}
+\let\saveCwebN=\N
+\usepackage{mdwmath}
+\usepackage{crypto}
\usepackage{tikz}
+ \usetikzlibrary{arrows.meta}
+ \usetikzlibrary{shapes.geometric}
+ \usetikzlibrary{shapes.misc}
+\let\N=\saveCwebN
\def\bitor#1!{\begingroup\let\MOD=\OR#1\endgroup}
\def\email#1{\href{mailto:#1}{\nolinkurl{#1}}}
+\defop\hwt{hwt}
+\defbrk\parens{(}{)}
+\defbrk\index{[}{]}
+
+\def\card{\#}
+\let\NULL=\Lambda
+
+\def\fixme#1{\marginpar{\textbf{FIXME!}}\fbox{\textbf{FIXME:} #1}}
\makeatletter
\def\CwebNumber#1{% % octal, hex, or decimal constant
\def\CwebSectLayout{\normalfont\sffamily\bfseries}
\def\CwebPointer{\mathord{\rightarrow}}
+\tikzset
+ {> = {Computer Modern Rightarrow[length = 5pt, width = 7pt]},
+ algstep/.style = {draw, align = center, inner sep = 6pt},
+ algcond/.style = {algstep, shape = rounded rectangle, aspect = 2}}
+
\title{QP-tries}
\author{Tony Finch \and Mark Wooding}
\tableofcontents
%% hack formatting for various identifiers
-@f uint64_t int
-@f uint32_t int
-@f xor anything
+@s uint64_t int
+@s uint32_t int
+@s xor anything
%%%--------------------------------------------------------------------------
@* Introduction.
jurisdictions allow authors to abandon their copyrights explicitly; others
don't.
-@<Copyright notice@> =
-/* Written by Tony Finch \email{dot@@dotat.at}
- You may do anything with this. It has no warranty.
+@<Copyright notice@>=
+/* Written by Tony Finch \email{dot@@dotat.at} \\
+ You may do anything with this. It has no warranty. \\
\url{http://creativecommons.org/publicdomain/zero/1.0/}
*/
The interface is defined in a header file \texttt{Tbl.h}.
-@(Tbl.h@> =
+@(Tbl.h@>=
// Tbl.h: an abstract API for tables with string keys.
@<Copyright ...@>
@#
implemented in terms of other functions, rather than by each individual table
data structure. These common implementations are given in \texttt{Tbl.c}.
-@(Tbl.c@> =
+@(Tbl.c@>=
// Tbl.c: simpler wrappers for core table functions
@<Copyright ...@>
@#
represented as a null pointer. Table pointers are passed by value; functions
such as |Tset| which update the table return the new pointer.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
typedef struct Tbl Tbl;
@ The |Tget|\ldots\ functions retrieve the information assoicated with a key.
call |Tgetl| instead, passing the length along. The length does not include
the null terminator, which must still be present.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
void *Tgetl(Tbl *tbl, const char *key, size_t klen);
void *Tget(Tbl *tbl, const char *key);
|*rval|, respectively, and returns |true|; otherwise, it returns |false|
leaving |*rkey| and |*rval| unchanged.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
bool Tgetkv(Tbl *tbl, const char *key, size_t klen, const char **rkey, void **rval);
@ The implementations of |Tget| and |Tgetl| are trivial.
-@<Common table utilities@> =
+@<Common table utilities@>=
void *
Tgetl(Tbl *tbl, const char *key, size_t len) {
const char *rkey = NULL;
empty. This is a successful outcome; |errno| is not changed.
\end{itemize}
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
Tbl *Tsetl(Tbl *tbl, const char *key, size_t klen, void *value);
Tbl *Tset(Tbl *tbl, const char *key, void *value);
@ The |Tset| function is implemented in terms of |Tsetl| in the obvious way.
The |Tsetl| function is primitive, implemented by the data structure.
-@<Common table utilities@> =
+@<Common table utilities@>=
Tbl *
Tset(Tbl *tbl, const char *key, void *value) {
return(Tsetl(tbl, key, strlen(key), value));
null-terminated key string; the |Tdell| function is additionally passed the
length of the string.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
Tbl *Tdell(Tbl *tbl, const char *key, size_t klen);
Tbl *Tdel(Tbl *tbl, const char *key);
@ The |Tdel| and |Tdell| functions are implemented in terms of the primitive
-|Tdelkv|, which additionally stores pointers to the entry's original key and
-value in |*rkey| and |*rvalue|.
+|Tdelkv|, which additionally saves pointers to the entry's original key and
+value in |*rkey| and |*rvalue| so that their storage can be reclaimed.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
Tbl *Tdelkv(Tbl *tbl, const char *key, size_t klen, const char **rkey, void **rval);
@ The implementations of |Tdel| and |Tdell| are straightforward.
-@<Common table utilities@> =
+@<Common table utilities@>=
Tbl *
Tdell(Tbl *tbl, const char *key, size_t len) {
const char *rkey = NULL;
key string present in the table, or null if no such string exists; it doesn't
report the associated value pointer at all.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
bool Tnextl(Tbl *tbl, const char **pkey, size_t *pklen, void **pvalue);
bool Tnext(Tbl *tbl, const char **pkey, void **pvalue);
const char *Tnxt(Tbl *tbl, const char *key);
@ The |Tnext| and |Tnxt| functions are implemented in terms of |Tnextl| in
the obvious way.
-@<Common table utilities@> =
+@<Common table utilities@>=
bool
Tnext(Tbl *tbl, const char **pkey, void **pvalue) {
size_t len = *pkey == NULL ? 0 : strlen(*pkey);
@ Finally, we define some diagnostic tools.
-The |Tsize| function reports statistics about a table
+The |Tsize| function reports statistics about a table. Specifically, it
+stores information in the addresses given as its arguments:
+\begin{description}
+\item[|*rtype|] A statically allocated string describing the table
+ implementation.
+\item[|*size|] The total memory allocated by the table. This doesn't include
+ keys or values, because they're allocated by the caller.
+\item[|*rdepth|] The total depth of the search structure.
+ \fixme{why is this an interesting thing to know?}
+\item[|*rbranch|] The number of branch nodes in the search structure.
+\item[|*rleaves|] The number of leaf nodes in the search structure.
+\end{description}
+
+The |Tdump| function prints a diagnostic dump of a table to standard output.
+
+Both of these functions are primitive, implemented separately for each data
+structure.
-@<\texttt{Tbl.h} declarations@> =
+@<\texttt{Tbl.h} declarations@>=
void Tdump(Tbl *tbl);
-void Tsize(Tbl *tbl, const char **rtype,
+void Tsize(Tbl *tbl, const char **rtype, @|
size_t *rsize, size_t *rdepth, size_t *rbranches, size_t *rleaves);
+
+@ \relax
+
+%%%--------------------------------------------------------------------------
+@* QP-Tries.
+
+@*1 Theory lesson.
+
+QP-tries are a data structure for keeping track of a dynamic set of
+\emph{strings}. It'll be best if we don't think of these strings as being
+made up of bytes or characters: those units are too large to be convenient.
+Instead, we'll work with a smaller set~$\Sigma$ of \emph{symbols}, which will
+usually be small fixed-size groups of bits, say four or five. The limiting
+factor will be that the number of symbols $\card\Sigma$ must be at most equal
+to the machine's word size $w$.
+
+There's another constraint, which is that the set of strings that we work
+with must be \emph{prefix-free}. This usually most easily arranged by ending
+each string with a \emph{terminating symbol}. For C~strings, the obvious
+choice is a null character, since C has a preference for null-terminated
+strings anyway.
+
+@ The trie \emph{root} points at a node. The trie structure consists of two
+kinds of nodes called \emph{branch} nodes and \emph{leaf} nodes. Both kinds
+of nodes are the same size. Both kinds of notes have a \emph{tag bit} which
+is clear in leaf nodes and set in branch nodes. (The opposite convention is
+possible, but doesn't work as well in practice.)
+
+In broad outline, searching QP-trie for a key proceeds as follows. We start
+at the root. A branch node has children corresponding to the symbols in
+$\Sigma$: if we encounter one, we move to the child selected by one of the
+symbols of the search key. A leaf node holds a key and its associated value:
+if we encounter one, then we compare the search key against the key in the
+leaf, and return the associated value if keys match, or report failure if
+they don't.
+
+The trie is compressed in two ways.
+\begin{itemize}
+\item Branch nodes which would have fewer than two children are elided.
+ Branch nodes contain an \emph{offset} to indicate which of the search key's
+ symbols to check.
+\item Children which don't have leaves are omitted. A branch node contains a
+ \emph{bitmap}, indexed by symbols in some convenient ordering: the bit
+ corresponding to a symbol is set if the node has a corresponding child. A
+ branch node also holds a pointer to a \emph{twigs} vector, which simply
+ holds the node's remaining children, in order.
+\end{itemize}
+The size of a branch node's twigs vector is equal to the Hamming weight --
+i.e., the number of set bits -- of its bitmap~$B$. Moreover, the index of
+the child corresponding to a symbol $s \in \Sigma$, if one exists, is equal
+to the number of present children corresponding to symbols preceding~$s$ in
+the ordering, which can conveniently be computed by
+\[ \hwt\bigparens{B \bitand \bigparens{(1 \lsl s) - 1}} \mpunct, \]
+where $\hwt(\cdot)$ denotes Hamming weight.
+
+This leads to the first algorithm, for lookup in a QP-trie.
+
+\begin{list}
+ {\textbf{Algorithm L} \emph{(Lookup in a QP-trie.)}}
+ {\leftmargin=0pt \rightmargin=0pt \labelwidth=-\labelsep}
+\item
+Given a QP-trie root pointer $\texttt{ROOT}$ and a key string~$k$ of
+length~$n$, find the value associated with~$k$ in the trie.
+
+Every node~$\texttt{N}$ has a tag bit $\texttt{TAG(N)}$. If
+$\texttt{TAG(N)} = 0$ then $\texttt{N}$ points to a leaf node, which has
+$\texttt{KEY}$ and $\texttt{VALUE}$ fields. If $\texttt{TAG(N)} = 1$ then
+$\texttt{N}$ points to a branch node, which has $\texttt{OFFSET}$,
+$\texttt{BITMAP}$, and $\texttt{TWIGS}$ fields.
+
+\begin{enumerate}[\bfseries L1.]
+\item{} [Empty tree?]
+ If $\texttt{ROOT} = \NULL$, then return failure.
+\item{} [Initialize.]
+ Set $\texttt{N} \gets \texttt{ROOT}$.
+\item{} [Check tag.] \label{en:qp.lookup.chk-tag}
+ If $\texttt{TAG(N)} = 0$ then then $\texttt{N}$ points to a leaf node: go to
+ step~L\ref{en:qp.lookup.cmp-key}. Otherwise, it points to a branch node: go
+ to step~L\ref{en:qp.lookup.chk-len}.
+\item{} [Check length.] \label{en:qp.lookup.chk-len}
+ If $\texttt{OFFSET(N)} \ge n$ then return failure. Otherwise, continue
+ with the next step.
+\item{} [Check symbol.]
+ Let $s \gets k[\texttt{OFFSET(N)}]$. If $\texttt{BITMAP(N)} \bitand (1 \lsl
+ s) = 0$ then return failure. Otherwise, set $\texttt{N}$ to be the address
+ of $\texttt{TWIGS(N)}[\hwt(\texttt{BITMAP(N)} \bitand ((1 \lsl s) - 1))]$
+ and go to step~L\ref{en:qp.lookup.chk-tag}.
+\item{} [Compare key.] \label{en:qp.lookup.cmp-key}
+ If the strings $k$ and $\texttt{KEY(N)}$ are equal then return
+ $\texttt{VALUE(N)}$ successfully; otherwise return failure.
+ \qed
+\end{enumerate}
+\end{list}
+
+\begin{figure}
+ \centering
+ \begin{tikzpicture}
+ [x = 48mm, y = 10mm]
+ \node[algcond] (emptyp) at (0, 8) {L1. Empty tree?};
+ \node[algstep] (init) at (0, 6) {L2. Initialize};
+ \node[algcond] (leafp) at (0, 4) {L3. Check tag};
+ \node[algcond] (len) at (1, 5.5) {L4. Check length};
+ \node[algcond] (sym) at (1, 3) {L5. Check symbol};
+ \node[algcond] (key) at (0, 2) {L6. Compare key};
+ \node (lose) at (2, 4) {\texttt{FAILURE}};
+ \node (win) at (0, 0) {\texttt{SUCCESS}};
+ \path[->]
+ (emptyp)
+ edge node[left] {$\texttt{ROOT} \ne \NULL$} (init)
+ edge [out = 0, in = 140]
+ node[above, sloped] {$\texttt{ROOT} = \NULL$} (lose)
+ (init)
+ edge (leafp)
+ (leafp)
+ edge node[left] {$\texttt{TAG(N)} = 0$} (key)
+ edge node[sloped, above] {$\texttt{TAG(N)} = 1$} (len)
+ (len)
+ edge node[left] {$\texttt{OFFSET(N)} < n$} (sym)
+ edge node[above, sloped] {$\texttt{OFFSET(N)} \ge n$} (lose)
+ (sym)
+ edge node[below, sloped, align = center]
+ {bit $s$ set in \\ $\texttt{BITMAP(N)}$} (leafp)
+ edge node[below, sloped, align = center]
+ {bit $s$ clear in \\ $\texttt{BITMAP(N)}$} (lose)
+ (key)
+ edge node[left] {$k$ matches $\texttt{KEY(N)}$} (win)
+ edge [out = 0, in = -125]
+ node[below] {$k$ doesn't match $\texttt{KEY(N)}$} (lose);
+ \end{tikzpicture}
+ \caption{QP-trie lookup algorithm} \label{fig:qp.lookup}
+\end{figure}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@* {qp.h}.
-@(qp.h@> =
+@(qp.h@>=
// qp.h: tables implemented with quadbit popcount patricia tries.
//
// Written by Tony Finch <dot@@dotat.at>
@* {qp.c}.
-@(qp.c@> =
+@(qp.c@>=
// qp.c: tables implemented with quadbit popcount patricia tries.
//
// Written by Tony Finch <dot@@dotat.at>
@* {fn.h}.
-@(fn.h@> =
+@(fn.h@>=
// fn.h: quintet bit popcount patricia tries, new version
//
// This version uses somewhat different terminology than older
@* {fn.c}.
-@(fn.c@> =
+@(fn.c@>=
// fn.h: quintet bit popcount patricia tries, new version
//
// Written by Tony Finch <dot@@dotat.at>
@* {qp-debug.c}.
-@(qp-debug.c@> =
+@(qp-debug.c@>=
// qp-debug.c: qp trie debug support
//
// Written by Tony Finch <dot@@dotat.at>
@* {fn-debug.c}.
-@(fn-debug.c@> =
+@(fn-debug.c@>=
// fn-debug.c: fn trie debug support
//
// Written by Tony Finch <dot@@dotat.at>
@* {toy.c}.
-@(toy.c@> =
+@(toy.c@>=
#include <assert.h>
#include <errno.h>
#include <stdarg.h>
p = argv[i];
switch (*p++) {
case '+':
- v = malloc(sizeof(*v)); if (!v) bail("out of memory");
- v->seq = seq; v->k = p;
- tt = Tset(t, p, v);
+ len = strlen(p) + 1; size = (len + sizeof(*v) + 15)&-16;
+ v = malloc(size); if (!v) bail("out of memory");
+ memcpy((char *)v + size - len, p, len);
+ v->k = (char *)v + size - len; v->seq = seq++;
+ tt = Tset(t, v->k, v);
if (!tt) bail("Tset failed: %s", strerror(errno));
t = tt; break;
case '-':
v = Tget(t, p);
if (!v) puts("(not found)");
else printf("%lu\n", v->seq);
+ assert(strcmp(p, v->k) == 0);
break;
case 'D':
Tdump(t);
return (0);
}
-@
-\end{document} \iftrue % gobble following `\fi'
+@ \end{document} \iftrue % gobble following `\fi'