From: Vladimír Vondruš
Date: Sat, 20 Jan 2018 22:39:00 +0000 (+0100)
Subject: doxygen: serializing search result map and everything together.
X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=8764dbc21499c9a42b1f07f1f1402ebe088034e2;p=blog.git

---

diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py
index ffddf3c6..b7d125ed 100755
--- a/doxygen/dox2html5.py
+++ b/doxygen/dox2html5.py
@@ -121,6 +121,54 @@ class Trie:
         self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output))
         return output
 
+class ResultMap:
+    #  item 1 flags | item 2 flags |     | item N flags | file |    item 1     |
+    #   + offset    |  + offset    | ... |  + offset    | size |  name + url   | ...
+    #    8 + 24b    |   8 + 24b    |     |   8 + 24b    | 32b  | (0-delimited) |
+    offset_struct = struct.Struct('<I')
+    flags_struct = struct.Struct('<B')
+
+    def __init__(self):
+        self.entries = []
+
+    def add(self, name, url, flags = 0) -> int:
+        self.entries += [(name, url, flags)]
+        return len(self.entries) - 1
+
+    def serialize(self) -> bytearray:
+        output = bytearray()
+
+        # Write the offset array. Starting offset for items is after the offset
+        # array and the file size
+        offset = (len(self.entries) + 1)*4
+        for name, url, flags in self.entries:
+            assert offset < 2**24
+            output += self.offset_struct.pack(offset)
+            self.flags_struct.pack_into(output, len(output) - 1, flags)
+
+            # include the 0-delimiter
+            offset += len(name) + len(url) + 1
+
+        # Write file size
+        output += self.offset_struct.pack(offset)
+
+        # Write the entries themselves
+        for name, url, _ in self.entries:
+            output += name.encode('utf-8')
+            output += b'\0'
+            output += url.encode('utf-8')
+
+        assert len(output) == offset
+        return output
+
+search_data_header_struct = struct.Struct('<3sBI')
+
+def serialize_search_data(trie: Trie, map: ResultMap) -> bytearray:
+    serialized_trie = trie.serialize()
+    serialized_map = map.serialize()
+    # magic header, version, offset of result map
+    return search_data_header_struct.pack(b'MCS', 0, len(serialized_trie) + 8) + serialized_trie + serialized_map
+
 xref_id_rx = re.compile(r"""(.*)_1(_[a-z-]+[0-9]+)$""")
 slugify_nonalnum_rx = re.compile(r"""[^\w\s-]""")
 slugify_hyphens_rx = re.compile(r"""[-\s]+""")
diff --git a/doxygen/test/test_trie.py b/doxygen/test/test_search.py
similarity index 58%
rename from doxygen/test/test_trie.py
rename to doxygen/test/test_search.py
index a083e8ea..528e1d65 100755
--- a/doxygen/test/test_trie.py
+++ b/doxygen/test/test_search.py
@@ -29,9 +29,9 @@ import unittest
 import sys
 from types import SimpleNamespace as Empty
 
-from dox2html5 import Trie
+from dox2html5 import Trie, ResultMap, serialize_search_data, search_data_header_struct
 
-def _pretty_print(serialized: bytearray, hashtable, stats, base_offset, indent, draw_pipe, show_merged) -> str:
+def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, draw_pipe, show_merged) -> str:
     # Visualize where the trees were merged
     if show_merged and base_offset in hashtable:
         return ' #'
@@ -65,7 +65,7 @@ def _pretty_print(serialized: bytearray, hashtable, stats, base_offset, indent,
         child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00ffffff
         stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
         offset += Trie.child_struct.size
-        out += _pretty_print(serialized, hashtable, stats, child_offset, indent + ('|' if draw_pipe else ' '), draw_pipe=False, show_merged=show_merged)
+        out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if draw_pipe else ' '), draw_pipe=False, show_merged=show_merged)
         child_count += 1
 
     stats.max_node_children = max(child_count, stats.max_node_children)
@@ -73,7 +73,7 @@ def _pretty_print(serialized: bytearray, hashtable, stats, base_offset, indent,
     hashtable[base_offset] = True
     return out
 
-def pretty_print(serialized: bytes, show_merged=False):
+def pretty_print_trie(serialized: bytes, show_merged=False):
     hashtable = {}
 
     stats = Empty()
@@ -84,7 +84,7 @@ def pretty_print(serialized: bytes, show_merged=False):
     stats.max_node_value_index = 0
     stats.max_node_child_offset = 0
 
-    out = _pretty_print(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', draw_pipe=False, show_merged=show_merged)
+    out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', draw_pipe=False, show_merged=show_merged)
     stats = """
 node count: {}
 max node size: {} bytes
@@ -94,13 +94,38 @@ max node value index: {}
 max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_size, stats.max_node_values, stats.max_node_children, stats.max_node_value_index, stats.max_node_child_offset)
     return out, stats
 
-class Serialization(unittest.TestCase):
+def pretty_print_map(serialized: bytes):
+    # The first item gives out offset of first value, which can be used to
+    # calculate total value count
+    offset = ResultMap.offset_struct.unpack_from(serialized, 0)[0] & 0x00ffffff
+    size = int(offset/4 - 1)
+
+    out = ''
+    for i in range(size):
+        if i: out += '\n'
+        flags = ResultMap.flags_struct.unpack_from(serialized, i*4 + 3)[0]
+        next_offset = ResultMap.offset_struct.unpack_from(serialized, (i + 1)*4)[0] & 0x00ffffff
+        name, _, url = serialized[offset:next_offset].partition(b'\0')
+        out += "{}: {} [{}] -> {}".format(i, name.decode('utf-8'), flags, url.decode('utf-8'))
+        offset = next_offset
+    return out
+
+def pretty_print(serialized: bytes, show_merged=False):
+    magic, version, map_offset = search_data_header_struct.unpack_from(serialized)
+    assert magic == b'MCS'
+    assert version == 0
+
+    pretty_trie, stats = pretty_print_trie(serialized[search_data_header_struct.size:map_offset], show_merged=show_merged)
+    pretty_map = pretty_print_map(serialized[map_offset:])
+    return pretty_trie + '\n' + pretty_map, stats
+
+class TrieSerialization(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.maxDiff = None
 
     def compare(self, serialized: bytes, expected: str):
-        pretty = pretty_print(serialized)[0]
+        pretty = pretty_print_trie(serialized)[0]
         #print(pretty)
         self.assertEqual(pretty, expected.strip())
 
@@ -182,6 +207,87 @@ range [2]
 """)
         self.assertEqual(len(serialized), 340)
 
+class MapSerialization(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.maxDiff = None
+
+    def compare(self, serialized: bytes, expected: str):
+        pretty = pretty_print_map(serialized)
+        #print(pretty)
+        self.assertEqual(pretty, expected.strip())
+
+    def test_empty(self):
+        map = ResultMap()
+
+        serialized = map.serialize()
+        self.compare(serialized, "")
+        self.assertEqual(len(serialized), 4)
+
+    def test_single(self):
+        map = ResultMap()
+        self.assertEqual(map.add("Magnum", "namespaceMagnum.html", 11), 0)
+
+        serialized = map.serialize()
+        self.compare(serialized, """
+0: Magnum [11] -> namespaceMagnum.html
+""")
+        self.assertEqual(len(serialized), 35)
+
+    def test_multiple(self):
+        map = ResultMap()
+
+        self.assertEqual(map.add("Math", "namespaceMath.html"), 0)
+        self.assertEqual(map.add("Math::Vector", "classMath_1_1Vector.html", 42), 1)
self.assertEqual(map.add("Math::Range", "classMath_1_1Range.html", 255), 2) + self.assertEqual(map.add("Math::min()", "namespaceMath.html#abcdef2875"), 3) + self.assertEqual(map.add("Math::max()", "namespaceMath.html#abcdef2875"), 4) + + serialized = map.serialize() + self.compare(serialized, """ +0: Math [0] -> namespaceMath.html +1: Math::Vector [42] -> classMath_1_1Vector.html +2: Math::Range [255] -> classMath_1_1Range.html +3: Math::min() [0] -> namespaceMath.html#abcdef2875 +4: Math::max() [0] -> namespaceMath.html#abcdef2875 +""") + self.assertEqual(len(serialized), 201) + +class Serialization(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.maxDiff = None + + def compare(self, serialized: bytes, expected: str): + pretty = pretty_print(serialized)[0] + #print(pretty) + self.assertEqual(pretty, expected.strip()) + + def test(self): + trie = Trie() + map = ResultMap() + + trie.insert("math", map.add("Math", "namespaceMath.html")) + index = map.add("Math::Vector", "classMath_1_1Vector.html", 42) + trie.insert("math::vector", index) + trie.insert("vector", index) + index = map.add("Math::Range", "classMath_1_1Range.html", 255) + trie.insert("math::range", index) + trie.insert("range", index) + + serialized = serialize_search_data(trie, map) + self.compare(serialized, """ +math [0] +| ::vector [1] +| range [2] +vector [1] +range [2] +0: Math [0] -> namespaceMath.html +1: Math::Vector [42] -> classMath_1_1Vector.html +2: Math::Range [255] -> classMath_1_1Range.html +""") + self.assertEqual(len(serialized), 241) + if __name__ == '__main__': # pragma: no cover parser = argparse.ArgumentParser() parser.add_argument('file', help="file to pretty-print")