From: Mark Wooding Date: Fri, 30 Aug 2024 19:25:47 +0000 (+0100) Subject: Progress. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~mdw/git/qpweb/commitdiff_plain Progress. --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..896e43a --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +*.aux +*.d +*.idx +*.log +*.out +*.scn +*.toc +*.o +/local.mk +/Tbl.c +/Tbl.h +/fn.c +/fn-debug.c +/fn.h +/fntoy +/qp.c +/qp-debug.c +/qp.h +/qp.pdf +/qp.tex +/qptoy +/toy.c diff --git a/Makefile b/Makefile index 084e6dd..3a5f645 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +### -*-makefile-*- + all: clean:: .PHONY: all clean @@ -5,13 +7,6 @@ clean:: TARGETS = CLEAN = -V = 0 -V_AT = $(V_AT.$V) -V_AT.0 = @ - -v-tag = $(call v-tag.$V,$1,$2) -v-tag.0 = @printf " %-8s %s\n" '$1' '$2'; - CTANGLE = ctangle CWEAVE = cweave LATEX = pdflatex @@ -21,6 +16,14 @@ WARNINGS = -pedantic -Wall CFLAGS = -std=gnu99 $(OPTIMIZE) $(WARNINGS) LD = gcc LDFLAGS = +-include local.mk + +V = 0 +V_AT = $(V_AT.$V) +V_AT.0 = @ + +v-tag = $(call v-tag.$V,$1,$2) +v-tag.0 = @printf " %-8s %s\n" '$1' '$2'; SRCEXT = objify = $(addsuffix .o, \ @@ -68,6 +71,7 @@ $$(firstword $$($1_WEBSRCS)): $1.w $$(call v-tag,CTANGLE,$$<)$$(CTANGLE) $$< endef $(foreach w,$(WEBS:.w=), $(eval $(call tangle-web,$w))) +%.c: %.w # don't use implicit rule TARGETS += qptoy fntoy diff --git a/qp.w b/qp.w index 3ba3414..4227526 100644 --- a/qp.w +++ b/qp.w @@ -1,14 +1,30 @@ -%%% -*-cweb-*- +%%% -*- mode: latex; mmm-classes: cweb -*- \documentclass[baseclass = strayman]{cweb} +\usepackage{enumerate} \usepackage[T1]{fontenc} \usepackage[utf8]{inputenc} \usepackage[palatino, helvetica, courier, maths = cmr]{mdwfonts} +\let\saveCwebN=\N +\usepackage{mdwmath} +\usepackage{crypto} \usepackage{tikz} + \usetikzlibrary{arrows.meta} + \usetikzlibrary{shapes.geometric} + \usetikzlibrary{shapes.misc} +\let\N=\saveCwebN \def\bitor#1!{\begingroup\let\MOD=\OR#1\endgroup} \def\email#1{\href{mailto:#1}{\nolinkurl{#1}}} +\defop\hwt{hwt} +\defbrk\parens{(}{)} +\defbrk\index{[}{]} + +\def\card{\#} +\let\NULL=\Lambda + +\def\fixme#1{\marginpar{\textbf{FIXME!}}\fbox{\textbf{FIXME:} #1}} \makeatletter \def\CwebNumber#1{% % octal, hex, or decimal constant @@ -24,6 +40,11 @@ \def\CwebSectLayout{\normalfont\sffamily\bfseries} \def\CwebPointer{\mathord{\rightarrow}} +\tikzset + {> = {Computer Modern Rightarrow[length = 5pt, width = 7pt]}, + algstep/.style = {draw, align = center, inner sep = 6pt}, + algcond/.style = {algstep, shape = rounded rectangle, aspect = 2}} + \title{QP-tries} \author{Tony Finch \and Mark Wooding} @@ -33,9 +54,9 @@ \tableofcontents %% hack formatting for various identifiers -@f uint64_t int -@f uint32_t int -@f xor anything +@s uint64_t int +@s uint32_t int +@s xor anything %%%-------------------------------------------------------------------------- @* Introduction. @@ -55,9 +76,9 @@ is, to the maximum extent permitted, not protected by copyright. Some jurisdictions allow authors to abandon their copyrights explicitly; others don't. -@ = -/* Written by Tony Finch \email{dot@@dotat.at} - You may do anything with this. It has no warranty. +@= +/* Written by Tony Finch \email{dot@@dotat.at} \\ + You may do anything with this. It has no warranty. \\ \url{http://creativecommons.org/publicdomain/zero/1.0/} */ @@ -76,7 +97,7 @@ bits. The interface is defined in a header file \texttt{Tbl.h}. -@(Tbl.h@> = +@(Tbl.h@>= // Tbl.h: an abstract API for tables with string keys. @ @# @@ -91,7 +112,7 @@ The interface is defined in a header file \texttt{Tbl.h}. implemented in terms of other functions, rather than by each individual table data structure. These common implementations are given in \texttt{Tbl.c}. -@(Tbl.c@> = +@(Tbl.c@>= // Tbl.c: simpler wrappers for core table functions @ @# @@ -108,7 +129,7 @@ There's no function to initialize a table: an empty table is simply represented as a null pointer. Table pointers are passed by value; functions such as |Tset| which update the table return the new pointer. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= typedef struct Tbl Tbl; @ The |Tget|\ldots\ functions retrieve the information assoicated with a key. @@ -121,7 +142,7 @@ If you already have the string's length lying around for some reason, you can call |Tgetl| instead, passing the length along. The length does not include the null terminator, which must still be present. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= void *Tgetl(Tbl *tbl, const char *key, size_t klen); void *Tget(Tbl *tbl, const char *key); @@ -131,11 +152,11 @@ pointers to the original key string and the associated value in |*rkey| and |*rval|, respectively, and returns |true|; otherwise, it returns |false| leaving |*rkey| and |*rval| unchanged. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= bool Tgetkv(Tbl *tbl, const char *key, size_t klen, const char **rkey, void **rval); @ The implementations of |Tget| and |Tgetl| are trivial. -@ = +@= void * Tgetl(Tbl *tbl, const char *key, size_t len) { const char *rkey = NULL; @@ -182,14 +203,14 @@ cases. empty. This is a successful outcome; |errno| is not changed. \end{itemize} -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= Tbl *Tsetl(Tbl *tbl, const char *key, size_t klen, void *value); Tbl *Tset(Tbl *tbl, const char *key, void *value); @ The |Tset| function is implemented in terms of |Tsetl| in the obvious way. The |Tsetl| function is primitive, implemented by the data structure. -@ = +@= Tbl * Tset(Tbl *tbl, const char *key, void *value) { return(Tsetl(tbl, key, strlen(key), value)); @@ -201,20 +222,20 @@ the now-familiar pattern, the |Tdel| function takes pointers to a table and a null-terminated key string; the |Tdell| function is additionally passed the length of the string. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= Tbl *Tdell(Tbl *tbl, const char *key, size_t klen); Tbl *Tdel(Tbl *tbl, const char *key); @ The |Tdel| and |Tdell| functions are implemented in terms of the primitive -|Tdelkv|, which additionally stores pointers to the entry's original key and -value in |*rkey| and |*rvalue|. +|Tdelkv|, which additionally saves pointers to the entry's original key and +value in |*rkey| and |*rvalue| so that their storage can be reclaimed. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= Tbl *Tdelkv(Tbl *tbl, const char *key, size_t klen, const char **rkey, void **rval); @ The implementations of |Tdel| and |Tdell| are straightforward. -@ = +@= Tbl * Tdell(Tbl *tbl, const char *key, size_t len) { const char *rkey = NULL; @@ -246,7 +267,7 @@ address, as an argument, and returns a pointer to the lexicographically next key string present in the table, or null if no such string exists; it doesn't report the associated value pointer at all. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= bool Tnextl(Tbl *tbl, const char **pkey, size_t *pklen, void **pvalue); bool Tnext(Tbl *tbl, const char **pkey, void **pvalue); const char *Tnxt(Tbl *tbl, const char *key); @@ -254,7 +275,7 @@ const char *Tnxt(Tbl *tbl, const char *key); @ The |Tnext| and |Tnxt| functions are implemented in terms of |Tnextl| in the obvious way. -@ = +@= bool Tnext(Tbl *tbl, const char **pkey, void **pvalue) { size_t len = *pkey == NULL ? 0 : strlen(*pkey); @@ -270,17 +291,186 @@ Tnxt(Tbl *tbl, const char *key) { @ Finally, we define some diagnostic tools. -The |Tsize| function reports statistics about a table +The |Tsize| function reports statistics about a table. Specifically, it +stores information in the addresses given as its arguments: +\begin{description} +\item[|*rtype|] A statically allocated string describing the table + implementation. +\item[|*size|] The total memory allocated by the table. This doesn't include + keys or values, because they're allocated by the caller. +\item[|*rdepth|] The total depth of the search structure. + \fixme{why is this an interesting thing to know?} +\item[|*rbranch|] The number of branch nodes in the search structure. +\item[|*rleaves|] The number of leaf nodes in the search structure. +\end{description} + +The |Tdump| function prints a diagnostic dump of a table to standard output. + +Both of these functions are primitive, implemented separately for each data +structure. -@<\texttt{Tbl.h} declarations@> = +@<\texttt{Tbl.h} declarations@>= void Tdump(Tbl *tbl); -void Tsize(Tbl *tbl, const char **rtype, +void Tsize(Tbl *tbl, const char **rtype, @| size_t *rsize, size_t *rdepth, size_t *rbranches, size_t *rleaves); + +@ \relax + +%%%-------------------------------------------------------------------------- +@* QP-Tries. + +@*1 Theory lesson. + +QP-tries are a data structure for keeping track of a dynamic set of +\emph{strings}. It'll be best if we don't think of these strings as being +made up of bytes or characters: those units are too large to be convenient. +Instead, we'll work with a smaller set~$\Sigma$ of \emph{symbols}, which will +usually be small fixed-size groups of bits, say four or five. The limiting +factor will be that the number of symbols $\card\Sigma$ must be at most equal +to the machine's word size $w$. + +There's another constraint, which is that the set of strings that we work +with must be \emph{prefix-free}. This usually most easily arranged by ending +each string with a \emph{terminating symbol}. For C~strings, the obvious +choice is a null character, since C has a preference for null-terminated +strings anyway. + +@ The trie \emph{root} points at a node. The trie structure consists of two +kinds of nodes called \emph{branch} nodes and \emph{leaf} nodes. Both kinds +of nodes are the same size. Both kinds of notes have a \emph{tag bit} which +is clear in leaf nodes and set in branch nodes. (The opposite convention is +possible, but doesn't work as well in practice.) + +In broad outline, searching QP-trie for a key proceeds as follows. We start +at the root. A branch node has children corresponding to the symbols in +$\Sigma$: if we encounter one, we move to the child selected by one of the +symbols of the search key. A leaf node holds a key and its associated value: +if we encounter one, then we compare the search key against the key in the +leaf, and return the associated value if keys match, or report failure if +they don't. + +The trie is compressed in two ways. +\begin{itemize} +\item Branch nodes which would have fewer than two children are elided. + Branch nodes contain an \emph{offset} to indicate which of the search key's + symbols to check. +\item Children which don't have leaves are omitted. A branch node contains a + \emph{bitmap}, indexed by symbols in some convenient ordering: the bit + corresponding to a symbol is set if the node has a corresponding child. A + branch node also holds a pointer to a \emph{twigs} vector, which simply + holds the node's remaining children, in order. +\end{itemize} +The size of a branch node's twigs vector is equal to the Hamming weight -- +i.e., the number of set bits -- of its bitmap~$B$. Moreover, the index of +the child corresponding to a symbol $s \in \Sigma$, if one exists, is equal +to the number of present children corresponding to symbols preceding~$s$ in +the ordering, which can conveniently be computed by +\[ \hwt\bigparens{B \bitand \bigparens{(1 \lsl s) - 1}} \mpunct, \] +where $\hwt(\cdot)$ denotes Hamming weight. + +This leads to the first algorithm, for lookup in a QP-trie. + +\begin{list} + {\textbf{Algorithm L} \emph{(Lookup in a QP-trie.)}} + {\leftmargin=0pt \rightmargin=0pt \labelwidth=-\labelsep} +\item +Given a QP-trie root pointer $\texttt{ROOT}$ and a key string~$k$ of +length~$n$, find the value associated with~$k$ in the trie. + +Every node~$\texttt{N}$ has a tag bit $\texttt{TAG(N)}$. If +$\texttt{TAG(N)} = 0$ then $\texttt{N}$ points to a leaf node, which has +$\texttt{KEY}$ and $\texttt{VALUE}$ fields. If $\texttt{TAG(N)} = 1$ then +$\texttt{N}$ points to a branch node, which has $\texttt{OFFSET}$, +$\texttt{BITMAP}$, and $\texttt{TWIGS}$ fields. + +\begin{enumerate}[\bfseries L1.] +\item{} [Empty tree?] + If $\texttt{ROOT} = \NULL$, then return failure. +\item{} [Initialize.] + Set $\texttt{N} \gets \texttt{ROOT}$. +\item{} [Check tag.] \label{en:qp.lookup.chk-tag} + If $\texttt{TAG(N)} = 0$ then then $\texttt{N}$ points to a leaf node: go to + step~L\ref{en:qp.lookup.cmp-key}. Otherwise, it points to a branch node: go + to step~L\ref{en:qp.lookup.chk-len}. +\item{} [Check length.] \label{en:qp.lookup.chk-len} + If $\texttt{OFFSET(N)} \ge n$ then return failure. Otherwise, continue + with the next step. +\item{} [Check symbol.] + Let $s \gets k[\texttt{OFFSET(N)}]$. If $\texttt{BITMAP(N)} \bitand (1 \lsl + s) = 0$ then return failure. Otherwise, set $\texttt{N}$ to be the address + of $\texttt{TWIGS(N)}[\hwt(\texttt{BITMAP(N)} \bitand ((1 \lsl s) - 1))]$ + and go to step~L\ref{en:qp.lookup.chk-tag}. +\item{} [Compare key.] \label{en:qp.lookup.cmp-key} + If the strings $k$ and $\texttt{KEY(N)}$ are equal then return + $\texttt{VALUE(N)}$ successfully; otherwise return failure. + \qed +\end{enumerate} +\end{list} + +\begin{figure} + \centering + \begin{tikzpicture} + [x = 48mm, y = 10mm] + \node[algcond] (emptyp) at (0, 8) {L1. Empty tree?}; + \node[algstep] (init) at (0, 6) {L2. Initialize}; + \node[algcond] (leafp) at (0, 4) {L3. Check tag}; + \node[algcond] (len) at (1, 5.5) {L4. Check length}; + \node[algcond] (sym) at (1, 3) {L5. Check symbol}; + \node[algcond] (key) at (0, 2) {L6. Compare key}; + \node (lose) at (2, 4) {\texttt{FAILURE}}; + \node (win) at (0, 0) {\texttt{SUCCESS}}; + \path[->] + (emptyp) + edge node[left] {$\texttt{ROOT} \ne \NULL$} (init) + edge [out = 0, in = 140] + node[above, sloped] {$\texttt{ROOT} = \NULL$} (lose) + (init) + edge (leafp) + (leafp) + edge node[left] {$\texttt{TAG(N)} = 0$} (key) + edge node[sloped, above] {$\texttt{TAG(N)} = 1$} (len) + (len) + edge node[left] {$\texttt{OFFSET(N)} < n$} (sym) + edge node[above, sloped] {$\texttt{OFFSET(N)} \ge n$} (lose) + (sym) + edge node[below, sloped, align = center] + {bit $s$ set in \\ $\texttt{BITMAP(N)}$} (leafp) + edge node[below, sloped, align = center] + {bit $s$ clear in \\ $\texttt{BITMAP(N)}$} (lose) + (key) + edge node[left] {$k$ matches $\texttt{KEY(N)}$} (win) + edge [out = 0, in = -125] + node[below] {$k$ doesn't match $\texttt{KEY(N)}$} (lose); + \end{tikzpicture} + \caption{QP-trie lookup algorithm} \label{fig:qp.lookup} +\end{figure} + + + + + + + + + + + + + + + + + + + + + + @* {qp.h}. -@(qp.h@> = +@(qp.h@>= // qp.h: tables implemented with quadbit popcount patricia tries. // // Written by Tony Finch @@ -517,7 +707,7 @@ twig(Trie *t, uint i) { @* {qp.c}. -@(qp.c@> = +@(qp.c@>= // qp.c: tables implemented with quadbit popcount patricia tries. // // Written by Tony Finch @@ -720,7 +910,7 @@ growbranch:; @* {fn.h}. -@(fn.h@> = +@(fn.h@>= // fn.h: quintet bit popcount patricia tries, new version // // This version uses somewhat different terminology than older @@ -945,7 +1135,7 @@ twigoff(Tindex i, Tbitmap bit) { @* {fn.c}. -@(fn.c@> = +@(fn.c@>= // fn.h: quintet bit popcount patricia tries, new version // // Written by Tony Finch @@ -1156,7 +1346,7 @@ growbranch:; @* {qp-debug.c}. -@(qp-debug.c@> = +@(qp-debug.c@>= // qp-debug.c: qp trie debug support // // Written by Tony Finch @@ -1231,7 +1421,7 @@ Tsize(Tbl *tbl, const char **rtype, @* {fn-debug.c}. -@(fn-debug.c@> = +@(fn-debug.c@>= // fn-debug.c: fn trie debug support // // Written by Tony Finch @@ -1325,7 +1515,7 @@ Tsize(Tbl *tbl, const char **rtype, @* {toy.c}. -@(toy.c@> = +@(toy.c@>= #include #include #include @@ -1366,9 +1556,11 @@ int main(int argc, char *argv[]) p = argv[i]; switch (*p++) { case '+': - v = malloc(sizeof(*v)); if (!v) bail("out of memory"); - v->seq = seq; v->k = p; - tt = Tset(t, p, v); + len = strlen(p) + 1; size = (len + sizeof(*v) + 15)&-16; + v = malloc(size); if (!v) bail("out of memory"); + memcpy((char *)v + size - len, p, len); + v->k = (char *)v + size - len; v->seq = seq++; + tt = Tset(t, v->k, v); if (!tt) bail("Tset failed: %s", strerror(errno)); t = tt; break; case '-': @@ -1379,6 +1571,7 @@ int main(int argc, char *argv[]) v = Tget(t, p); if (!v) puts("(not found)"); else printf("%lu\n", v->seq); + assert(strcmp(p, v->k) == 0); break; case 'D': Tdump(t); @@ -1402,5 +1595,4 @@ int main(int argc, char *argv[]) return (0); } -@ -\end{document} \iftrue % gobble following `\fi' +@ \end{document} \iftrue % gobble following `\fi'