From: Vladimír Vondruš
Date: Sat, 3 Feb 2018 11:53:58 +0000 (+0100)
Subject: doxygen: save result count instead of node size to search binary.
X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=cee83243c39b08fb8b4ee227179450f2d9ca2d37;p=blog.git

doxygen: save result count instead of node size to search binary.

Makes the code much simpler and allows for up to 256 children and 256
results in each node, instead of limiting its size to 512 bytes (thus
e.g. 64 children and 127 results or some other combination). I like it
when things get better, more flexible and simpler by removing a bunch
of complicated code.
---
diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py
index 5adbb1ee..fbc0d238 100755
--- a/doxygen/dox2html5.py
+++ b/doxygen/dox2html5.py
@@ -54,22 +54,22 @@ import m.math
 import ansilexer
 
 class Trie:
-    # root   |     |  header  | values  | child 1 | child 1 | child 1 |
-    # offset | ... | size/2 | value # | ... | char | barrier | offset | ...
-    # 32b    |     | 8b | 8b | n*16b | 8b | 1b | 23b |
+    # root   |     |  header  | results | child 1 | child 1 | child 1 |
+    # offset | ... | result # | value # | ... | char | barrier | offset | ...
+    # 32b    |     | 8b | 8b | n*16b | 8b | 1b | 23b |
     root_offset_struct = struct.Struct('<I')
@@ -93,11 +93,10 @@
             child_offsets += [(char, child[0], offset)]
 
         # Serialize this node
-        size = int(2 + 2*len(self.values) + 4*len(child_offsets))
         serialized = bytearray()
-        serialized += self.header_struct.pack(int(size/2), len(self.values))
-        for v in self.values:
-            serialized += self.value_struct.pack(v)
+        serialized += self.header_struct.pack(len(self.results), len(self.children))
+        for v in self.results:
+            serialized += self.result_struct.pack(v)
 
         # Serialize child offsets
         for char, lookahead_barrier, abs_offset in child_offsets:
@@ -109,8 +108,6 @@
             serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
             self.child_char_struct.pack_into(serialized, offset + 3, char)
 
-        assert size == len(serialized)
-
         # Subtree merging: if this exact tree is already in the table, return
         # its offset. Otherwise add it and return the new offset.
         # TODO: why hashable = bytes(output[base_offset:] + serialized) didn't work?
diff --git a/doxygen/search.js b/doxygen/search.js
index 0bac7a51..06062610 100644
--- a/doxygen/search.js
+++ b/doxygen/search.js
@@ -188,9 +188,8 @@ var Search = {
         for(; foundPrefix != searchString.length; ++foundPrefix) {
             /* Calculate offset and count of children */
             let offset = this.searchStack[this.searchStack.length - 1];
-            let nodeSize = this.trie.getUint8(offset)*2;
-            let relChildOffset = 2 + this.trie.getUint8(offset + 1)*2;
-            let childCount = (nodeSize - relChildOffset)/4;
+            let relChildOffset = 2 + this.trie.getUint8(offset)*2;
+            let childCount = this.trie.getUint8(offset + 1);
 
             /* Go through all children and find the next offset */
             let childOffset = offset + relChildOffset;
@@ -230,10 +229,10 @@ var Search = {
     },
 
     gatherResults: function(offset, suffixLength, results) {
-        let valueCount = this.trie.getUint8(offset + 1);
+        let resultCount = this.trie.getUint8(offset);
 
         /* Populate the results with all values associated with this node */
-        for(let i = 0; i != valueCount; ++i) {
+        for(let i = 0; i != resultCount; ++i) {
            let index = this.trie.getUint16(offset + (i + 1)*2, true);
            let flags = this.map.getUint8(index*4 + 3);
            let resultOffset = this.map.getUint32(index*4, true) & 0x00ffffff;
@@ -279,9 +278,8 @@ var Search = {
 
             /* Dig deeper. If the child already has enough, return. */
             /* TODO: hmmm. this is helluvalot duplicated code. hmm. */
-            let nodeSize = this.trie.getUint8(offset)*2;
-            let relChildOffset = 2 + this.trie.getUint8(offset + 1)*2;
-            let childCount = (nodeSize - relChildOffset)/4;
+            let relChildOffset = 2 + this.trie.getUint8(offset)*2;
+            let childCount = this.trie.getUint8(offset + 1);
             let childOffset = offset + relChildOffset;
             for(let j = 0; j != childCount; ++j) {
                 let offsetBarrier = this.trie.getUint32(childOffset + j*4, true);
diff --git a/doxygen/test/js-test-data/empty.bin b/doxygen/test/js-test-data/empty.bin
index 053f032f..6d194001 100644
Binary files a/doxygen/test/js-test-data/empty.bin and b/doxygen/test/js-test-data/empty.bin differ
diff --git a/doxygen/test/js-test-data/searchdata.b85 b/doxygen/test/js-test-data/searchdata.b85
index 2d1c1614..a9f602a1 100644
--- a/doxygen/test/js-test-data/searchdata.b85
+++ b/doxygen/test/js-test-data/searchdata.b85
@@ -1 +1 @@
-O+!-vL;(N*Dggih0s#R40{{d704W0i2mk;m0{{*H0B!>S6aWBe0s#X60{|cZ04W0iBme*?0{|)j0B!>SFaQ8)0{}Jv0Br*RJOBVX1OWm7LI8j|0{}<>0CEEWPyhgL0{~V40CWQYTmS%L0{~(G0A&IJ1pos8ZU6u&0|0UW04M_hcmM!y0|0&i0BHjNga80-0|1Hu06GK#1OSi#fI0&JmH+@{0|1@?0A~XLqyPYJ0|2T30AU9J8UO%oXaE3qumAvZ0|2%F06GK#006`QfI0&J$^Zap0|3$h0CWTc0RRI41pos8-T(k80|4d#04M_h>;M361pwFp0Aca~0BHgN1^@#90s#PJ0{{jA0A~XL3;_UP0{{{M0B{2U7y$rc0{|WY0Cfof_y7QHXaE3qumAvZBmn?(AOHXmHvj-(VgLXlhX4R!z5oCq;Q#<-76AajG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
+O+!-vL;(N*Dggih0RRC2009I504V?g2mk;m009mF0B!&Q6aWBe0RRI400AHX04V?gBme*?00Alh0B!&QFaQ8)00A}t0BryPJOBVX0RaL4LI8j|00Bq<0CE5UPyhgL00CA20CWHWTmS%L00CkE0A&FH1poj6ZU6u&00D9U04M+fcmM!y00Djg0BHaLga80-00D{s06GBy1OSi#fI0vHmH+@{00Eu=0A~OJqyPYJ00F810AT;M3600P(m0Aca~0BHdL1^@s70s#PJ009O80A~OJ3;_UP009yK0B`^S7y$rc00ABW0CfNa_y7QHXaE3qumAvZBmn?(AOHXmHvj-(VgLXlhX4R!z5oCq;Q#<-76AajG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
diff --git a/doxygen/test/js-test-data/searchdata.bin b/doxygen/test/js-test-data/searchdata.bin
index 12fe6ea1..3c44891c 100644
Binary files a/doxygen/test/js-test-data/searchdata.bin and b/doxygen/test/js-test-data/searchdata.bin differ
diff --git a/doxygen/test/js-test-data/unicode.bin b/doxygen/test/js-test-data/unicode.bin
index ed2bd505..23aa5fc9 100644
Binary files a/doxygen/test/js-test-data/unicode.bin and b/doxygen/test/js-test-data/unicode.bin differ
diff --git a/doxygen/test/test_search.py b/doxygen/test/test_search.py
index 4c1a98dd..e915e135 100755
--- a/doxygen/test/test_search.py
+++ b/doxygen/test/test_search.py
@@ -34,7 +34,7 @@ from dox2html5 import Trie, ResultMap, ResultFlag, serialize_search_data, search
 
 from test import IntegrationTestCase
 
-def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, draw_pipe, show_merged, show_lookahead_barriers, color_map) -> str:
+def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, show_merged, show_lookahead_barriers, color_map) -> str:
     # Visualize where the trees were merged
     if show_merged and base_offset in hashtable:
         return color_map['red'] + '#' + color_map['reset']
@@ -42,27 +42,25 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
     stats.node_count += 1
 
     out = ''
-    size, value_count = Trie.header_struct.unpack_from(serialized, base_offset)
-    stats.max_node_size = max(size, stats.max_node_size)
-    stats.max_node_values = max(value_count, stats.max_node_values)
+    result_count, child_count = Trie.header_struct.unpack_from(serialized, base_offset)
+    stats.max_node_results = max(result_count, stats.max_node_results)
+    stats.max_node_children = max(child_count, stats.max_node_children)
     offset = base_offset + Trie.header_struct.size
 
-    # print values, if any
-    if value_count:
+    # print results, if any
+    if result_count:
         out += color_map['blue'] + ' ['
-        for i in range(value_count):
+        for i in range(result_count):
             if i: out += color_map['blue']+', '
-            value = Trie.value_struct.unpack_from(serialized, offset)[0]
-            stats.max_node_value_index = max(value, stats.max_node_value_index)
-            out += color_map['cyan'] + str(value)
-            offset += Trie.value_struct.size
+            result = Trie.result_struct.unpack_from(serialized, offset)[0]
+            stats.max_node_result_index = max(result, stats.max_node_result_index)
+            out += color_map['cyan'] + str(result)
+            offset += Trie.result_struct.size
         out += color_map['blue'] + ']'
 
-    # print children
-    if base_offset + size*2 - offset > 4: draw_pipe = True
-    child_count = 0
-    while offset < base_offset + size*2:
-        if child_count or value_count:
+    # print children, if any
+    for i in range(child_count):
+        if result_count or i:
             out += color_map['reset'] + '\n'
             out += color_map['blue'] + indent + color_map['white']
         char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
@@ -77,11 +75,9 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
         child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
         stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
         offset += Trie.child_struct.size
-        out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if draw_pipe else ' '), draw_pipe=False, show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
+        out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if child_count > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
         child_count += 1
 
-    stats.max_node_children = max(child_count, stats.max_node_children)
-
     hashtable[base_offset] = True
 
     return out
@@ -108,21 +104,19 @@ def pretty_print_trie(serialized: bytes, show_merged=False, show_lookahead_barri
 
     stats = Empty()
    stats.node_count = 0
-    stats.max_node_size = 0
-    stats.max_node_values = 0
+    stats.max_node_results = 0
     stats.max_node_children = 0
-    stats.max_node_value_index = 0
+    stats.max_node_result_index = 0
     stats.max_node_child_offset = 0
 
-    out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', draw_pipe=False, show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
+    out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
     if out: out = color_map['white'] + out
     stats = """
 node count: {}
-max node size: {} bytes
-max node values: {}
+max node results: {}
 max node children: {}
-max node value index: {}
-max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_size*2, stats.max_node_values, stats.max_node_children, stats.max_node_value_index, stats.max_node_child_offset)
+max node result index: {}
+max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_results, stats.max_node_children, stats.max_node_result_index, stats.max_node_child_offset)
     return out, stats
 
 def pretty_print_map(serialized: bytes, colors=False):
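
For reference, a minimal standalone sketch of the node layout after this change. It is not part of the patch: the '<BB'/'<H'/'<I' struct formats and the helper names below are assumptions inferred from the header comment in dox2html5.py and from the getUint8/getUint16/getUint32 reads in search.js. The header is now a result-count byte followed by a child-count byte, results are 16-bit little-endian indices, and each child record packs the character into the top byte (equivalent to the patch's child_char_struct.pack_into at offset + 3), a lookahead-barrier flag into bit 23 and a 23-bit child offset below that.

    import struct

    # Assumed layouts, mirroring the header comment and the JavaScript reader.
    header_struct = struct.Struct('<BB')  # result count, child count
    result_struct = struct.Struct('<H')   # one result map index
    child_struct = struct.Struct('<I')    # char (top byte) | barrier (bit 23) | offset (23 bits)

    def pack_node(results, children):
        # children is a list of (char, lookahead_barrier, absolute_offset) tuples
        out = bytearray(header_struct.pack(len(results), len(children)))
        for r in results:
            out += result_struct.pack(r)
        for char, barrier, offset in children:
            assert offset < 2**23
            out += child_struct.pack(offset | ((1 if barrier else 0) << 23) | (char << 24))
        return out

    def unpack_node(data, base=0):
        # The first two bytes are the counts themselves, so nothing has to be
        # derived from a node size anymore.
        result_count, child_count = header_struct.unpack_from(data, base)
        results = [result_struct.unpack_from(data, base + 2 + i*2)[0]
                   for i in range(result_count)]
        children_base = base + 2 + result_count*2
        children = [(packed >> 24, bool(packed & 0x00800000), packed & 0x007fffff)
                    for packed in (child_struct.unpack_from(data, children_base + j*4)[0]
                                   for j in range(child_count))]
        return results, children

    node = pack_node([3, 7], [(ord('a'), False, 100), (ord('b'), True, 200)])
    assert unpack_node(node) == ([3, 7], [(97, False, 100), (98, True, 200)])

Because both counts are stored as full bytes, the number of results and children in a node is no longer coupled to a node size field, which is exactly what lets the old 512-byte limit and the related offset arithmetic go away.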