From: Vladimír Vondruš
Date: Sat, 3 Feb 2018 11:53:58 +0000 (+0100)
Subject: doxygen: save result count instead of node size to search binary.
X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=cee83243c39b08fb8b4ee227179450f2d9ca2d37;p=blog.git

doxygen: save result count instead of node size to search binary.

Makes the code much simpler and allows for up to 256 children and 256
results in each node, instead of limiting its size to 512 bytes (thus
e.g. 64 children and 127 results or some other combination). I like it
when things get better, more flexible and simpler by removing a bunch
of complicated code.
---
diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py
index 5adbb1ee..fbc0d238 100755
--- a/doxygen/dox2html5.py
+++ b/doxygen/dox2html5.py
@@ -54,22 +54,22 @@ import m.math
 import ansilexer
 
 class Trie:
-    # root   |     |  header  | values  | child 1 | child 1 | child 1 |
-    # offset | ... | size/2 | value # | ... | char | barrier | offset | ...
-    # 32b    |     | 8b | 8b | n*16b | 8b | 1b | 23b |
+    # root   |     |  header  | results | child 1 | child 1 | child 1 |
+    # offset | ... | result # | value # | ... | char | barrier | offset | ...
+    # 32b    |     | 8b | 8b | n*16b | 8b | 1b | 23b |
     root_offset_struct = struct.Struct('<I')
@@ -93,11 +93,10 @@
             child_offsets += [(char, child[0], offset)]
 
         # Serialize this node
-        size = int(2 + 2*len(self.values) + 4*len(child_offsets))
         serialized = bytearray()
-        serialized += self.header_struct.pack(int(size/2), len(self.values))
-        for v in self.values:
-            serialized += self.value_struct.pack(v)
+        serialized += self.header_struct.pack(len(self.results), len(self.children))
+        for v in self.results:
+            serialized += self.result_struct.pack(v)
 
         # Serialize child offsets
         for char, lookahead_barrier, abs_offset in child_offsets:
@@ -109,8 +108,6 @@
             serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
             self.child_char_struct.pack_into(serialized, offset + 3, char)
 
-        assert size == len(serialized)
-
         # Subtree merging: if this exact tree is already in the table, return
         # its offset. Otherwise add it and return the new offset.
         # TODO: why hashable = bytes(output[base_offset:] + serialized) didn't work?
diff --git a/doxygen/search.js b/doxygen/search.js
index 0bac7a51..06062610 100644
--- a/doxygen/search.js
+++ b/doxygen/search.js
@@ -188,9 +188,8 @@ var Search = {
         for(; foundPrefix != searchString.length; ++foundPrefix) {
             /* Calculate offset and count of children */
             let offset = this.searchStack[this.searchStack.length - 1];
-            let nodeSize = this.trie.getUint8(offset)*2;
-            let relChildOffset = 2 + this.trie.getUint8(offset + 1)*2;
-            let childCount = (nodeSize - relChildOffset)/4;
+            let relChildOffset = 2 + this.trie.getUint8(offset)*2;
+            let childCount = this.trie.getUint8(offset + 1);
 
             /* Go through all children and find the next offset */
             let childOffset = offset + relChildOffset;
@@ -230,10 +229,10 @@ var Search = {
     },
 
     gatherResults: function(offset, suffixLength, results) {
-        let valueCount = this.trie.getUint8(offset + 1);
+        let resultCount = this.trie.getUint8(offset);
 
         /* Populate the results with all values associated with this node */
-        for(let i = 0; i != valueCount; ++i) {
+        for(let i = 0; i != resultCount; ++i) {
            let index = this.trie.getUint16(offset + (i + 1)*2, true);
            let flags = this.map.getUint8(index*4 + 3);
            let resultOffset = this.map.getUint32(index*4, true) & 0x00ffffff;
@@ -279,9 +278,8 @@ var Search = {
 
             /* Dig deeper. If the child already has enough, return. */
             /* TODO: hmmm. this is helluvalot duplicated code. hmm. */
-            let nodeSize = this.trie.getUint8(offset)*2;
-            let relChildOffset = 2 + this.trie.getUint8(offset + 1)*2;
-            let childCount = (nodeSize - relChildOffset)/4;
+            let relChildOffset = 2 + this.trie.getUint8(offset)*2;
+            let childCount = this.trie.getUint8(offset + 1);
             let childOffset = offset + relChildOffset;
             for(let j = 0; j != childCount; ++j) {
                 let offsetBarrier = this.trie.getUint32(childOffset + j*4, true);
diff --git a/doxygen/test/js-test-data/empty.bin b/doxygen/test/js-test-data/empty.bin
index 053f032f..6d194001 100644
Binary files a/doxygen/test/js-test-data/empty.bin and b/doxygen/test/js-test-data/empty.bin differ
diff --git a/doxygen/test/js-test-data/searchdata.b85 b/doxygen/test/js-test-data/searchdata.b85
index 2d1c1614..a9f602a1 100644
--- a/doxygen/test/js-test-data/searchdata.b85
+++ b/doxygen/test/js-test-data/searchdata.b85
@@ -1 +1 @@
-O+!-vL;(N*Dggih0s#R40{{d704W0i2mk;m0{{*H0B!>S6aWBe0s#X60{|cZ04W0iBme*?0{|)j0B!>SFaQ8)0{}Jv0Br*RJOBVX1OWm7LI8j|0{}<>0CEEWPyhgL0{~V40CWQYTmS%L0{~(G0A&IJ1pos8ZU6u&0|0UW04M_hcmM!y0|0&i0BHjNga80-0|1Hu06GK#1OSi#fI0&JmH+@{0|1@?0A~XLqyPYJ0|2T30AU9J8UO%oXaE3qumAvZ0|2%F06GK#006`QfI0&J$^Zap0|3$h0CWTc0RRI41pos8-T(k80|4d#04M_h>;M361pwFp0Aca~0BHgN1^@#90s#PJ0{{jA0A~XL3;_UP0{{{M0B{2U7y$rc0{|WY0Cfof_y7QHXaE3qumAvZBmn?(AOHXmHvj-(VgLXlhX4R!z5oCq;Q#<-76AajG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
+O+!-vL;(N*Dggih0RRC2009I504V?g2mk;m009mF0B!&Q6aWBe0RRI400AHX04V?gBme*?00Alh0B!&QFaQ8)00A}t0BryPJOBVX0RaL4LI8j|00Bq<0CE5UPyhgL00CA20CWHWTmS%L00CkE0A&FH1poj6ZU6u&00D9U04M+fcmM!y00Djg0BHaLga80-00D{s06GBy1OSi#fI0vHmH+@{00Eu=0A~OJqyPYJ00F810AT;M3600P(m0Aca~0BHdL1^@s70s#PJ009O80A~OJ3;_UP009yK0B`^S7y$rc00ABW0CfNa_y7QHXaE3qumAvZBmn?(AOHXmHvj-(VgLXlhX4R!z5oCq;Q#<-76AajG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
diff --git a/doxygen/test/js-test-data/searchdata.bin b/doxygen/test/js-test-data/searchdata.bin
index 12fe6ea1..3c44891c 100644
Binary files a/doxygen/test/js-test-data/searchdata.bin and b/doxygen/test/js-test-data/searchdata.bin differ
diff --git a/doxygen/test/js-test-data/unicode.bin b/doxygen/test/js-test-data/unicode.bin
index ed2bd505..23aa5fc9 100644
Binary files a/doxygen/test/js-test-data/unicode.bin and b/doxygen/test/js-test-data/unicode.bin differ
diff --git a/doxygen/test/test_search.py b/doxygen/test/test_search.py
index 4c1a98dd..e915e135 100755
--- a/doxygen/test/test_search.py
+++ b/doxygen/test/test_search.py
@@ -34,7 +34,7 @@ from dox2html5 import Trie, ResultMap, ResultFlag, serialize_search_data, search
 
 from test import IntegrationTestCase
 
-def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, draw_pipe, show_merged, show_lookahead_barriers, color_map) -> str:
+def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, show_merged, show_lookahead_barriers, color_map) -> str:
     # Visualize where the trees were merged
     if show_merged and base_offset in hashtable:
         return color_map['red'] + '#' + color_map['reset']
@@ -42,27 +42,25 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
     stats.node_count += 1
 
     out = ''
-    size, value_count = Trie.header_struct.unpack_from(serialized, base_offset)
-    stats.max_node_size = max(size, stats.max_node_size)
-    stats.max_node_values = max(value_count, stats.max_node_values)
+    result_count, child_count = Trie.header_struct.unpack_from(serialized, base_offset)
+    stats.max_node_results = max(result_count, stats.max_node_results)
+    stats.max_node_children = max(child_count, stats.max_node_children)
     offset = base_offset + Trie.header_struct.size
 
-    # print values, if any
-    if value_count:
+    # print results, if any
+    if result_count:
         out += color_map['blue'] + ' ['
-        for i in range(value_count):
+        for i in range(result_count):
             if i: out += color_map['blue']+', '
-            value = Trie.value_struct.unpack_from(serialized, offset)[0]
-            stats.max_node_value_index = max(value, stats.max_node_value_index)
-            out += color_map['cyan'] + str(value)
-            offset += Trie.value_struct.size
+            result = Trie.result_struct.unpack_from(serialized, offset)[0]
+            stats.max_node_result_index = max(result, stats.max_node_result_index)
+            out += color_map['cyan'] + str(result)
+            offset += Trie.result_struct.size
         out += color_map['blue'] + ']'
 
-    # print children
-    if base_offset + size*2 - offset > 4: draw_pipe = True
-    child_count = 0
-    while offset < base_offset + size*2:
-        if child_count or value_count:
+    # print children, if any
+    for i in range(child_count):
+        if result_count or i:
             out += color_map['reset'] + '\n'
             out += color_map['blue'] + indent + color_map['white']
         char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
@@ -77,11 +75,9 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
         child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
         stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
         offset += Trie.child_struct.size
-        out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if draw_pipe else ' '), draw_pipe=False, show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
+        out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if child_count > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
         child_count += 1
 
-    stats.max_node_children = max(child_count, stats.max_node_children)
-
     hashtable[base_offset] = True
 
     return out
@@ -108,21 +104,19 @@ def pretty_print_trie(serialized: bytes, show_merged=False, show_lookahead_barri
 
     stats = Empty()
    stats.node_count = 0
-    stats.max_node_size = 0
-    stats.max_node_values = 0
+    stats.max_node_results = 0
     stats.max_node_children = 0
-    stats.max_node_value_index = 0
+    stats.max_node_result_index = 0
     stats.max_node_child_offset = 0
 
-    out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', draw_pipe=False, show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
+    out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
     if out: out = color_map['white'] + out
     stats = """
 node count: {}
-max node size: {} bytes
-max node values: {}
+max node results: {}
 max node children: {}
-max node value index: {}
-max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_size*2, stats.max_node_values, stats.max_node_children, stats.max_node_value_index, stats.max_node_child_offset)
+max node result index: {}
+max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_results, stats.max_node_children, stats.max_node_result_index, stats.max_node_child_offset)
     return out, stats
 
 def pretty_print_map(serialized: bytes, colors=False):
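
For reference, a minimal standalone sketch of the node layout after this change. It is not part of the patch: the '<BB'/'<H'/'<I' struct formats and the helper names below are assumptions inferred from the header comment in dox2html5.py and from the getUint8/getUint16/getUint32 reads in search.js. The header is now a result-count byte followed by a child-count byte, results are 16-bit little-endian indices, and each child record packs the character into the top byte (equivalent to the patch's child_char_struct.pack_into at offset + 3), a lookahead-barrier flag into bit 23 and a 23-bit child offset below that.

    import struct

    # Assumed layouts, mirroring the header comment and the JavaScript reader.
    header_struct = struct.Struct('<BB')  # result count, child count
    result_struct = struct.Struct('<H')   # one result map index
    child_struct = struct.Struct('<I')    # char (top byte) | barrier (bit 23) | offset (23 bits)

    def pack_node(results, children):
        # children is a list of (char, lookahead_barrier, absolute_offset) tuples
        out = bytearray(header_struct.pack(len(results), len(children)))
        for r in results:
            out += result_struct.pack(r)
        for char, barrier, offset in children:
            assert offset < 2**23
            out += child_struct.pack(offset | ((1 if barrier else 0) << 23) | (char << 24))
        return out

    def unpack_node(data, base=0):
        # The first two bytes are the counts themselves, so nothing has to be
        # derived from a node size anymore.
        result_count, child_count = header_struct.unpack_from(data, base)
        results = [result_struct.unpack_from(data, base + 2 + i*2)[0]
                   for i in range(result_count)]
        children_base = base + 2 + result_count*2
        children = [(packed >> 24, bool(packed & 0x00800000), packed & 0x007fffff)
                    for packed in (child_struct.unpack_from(data, children_base + j*4)[0]
                                   for j in range(child_count))]
        return results, children

    node = pack_node([3, 7], [(ord('a'), False, 100), (ord('b'), True, 200)])
    assert unpack_node(node) == ([3, 7], [(97, False, 100), (98, True, 200)])

Because both counts are stored as full bytes, the number of results and children in a node is no longer coupled to a node size field, which is exactly what lets the old 512-byte limit and the related offset arithmetic go away.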