From: Vladimír Vondruš Date: Fri, 19 Jan 2018 21:54:42 +0000 (+0100) Subject: doxygen: initial Trie building & serialization implementation. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=fc3adabbf33dc4e8c167e36e2919500dbecdb5b7;p=blog.git doxygen: initial Trie building & serialization implementation. --- diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py index b6309b5a..4df2b50f 100755 --- a/doxygen/dox2html5.py +++ b/doxygen/dox2html5.py @@ -33,6 +33,7 @@ import os import glob import mimetypes import shutil +import struct import subprocess import urllib.parse import logging @@ -50,6 +51,67 @@ import latex2svg import m.math import ansilexer +class Trie: + # root | | header | values | child | + # offset | ... | size | value # | ... | offsets ... | + # 32b | | 8b | 8b | n*16b | 8b + 24b | + root_offset_struct = struct.Struct(' int: + # Serialize all children first + child_offsets = [] + for char, child in self.children.items(): + offset = child._serialize(output) + child_offsets += [(char, offset)] + + # Serialize this node + size = 2 + 2*len(self.values) + 4*len(child_offsets) + + serialized = bytearray() + serialized += self.header_struct.pack(size, len(self.values)) + for v in self.values: + serialized += self.value_struct.pack(v) + + # Serialize child offsets + for char, abs_offset in child_offsets: + assert abs_offset < 2**24 + + # write them over each other because that's the only way to pack + # a 24 bit field + offset = len(serialized) + serialized += self.child_struct.pack(abs_offset) + self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8')) + + assert size == len(serialized) + + offset = len(output) + output += serialized + return offset + + def serialize(self) -> bytearray: + output = bytearray(b'\x00\x00\x00\x00') + self.root_offset_struct.pack_into(output, 0, self._serialize(output)) + return output + xref_id_rx = re.compile(r"""(.*)_1(_[a-z-]+[0-9]+)$""") slugify_nonalnum_rx = 
re.compile(r"""[^\w\s-]""") slugify_hyphens_rx = re.compile(r"""[-\s]+""") diff --git a/doxygen/test/test_trie.py b/doxygen/test/test_trie.py new file mode 100644 index 00000000..01542de6 --- /dev/null +++ b/doxygen/test/test_trie.py @@ -0,0 +1,147 @@ +# +# This file is part of m.css. +# +# Copyright © 2017, 2018 Vladimír Vondruš +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# + +import unittest + +from dox2html5 import Trie + +def _pretty_print(serialized: bytearray, base_offset, indent, draw_pipe) -> str: + out = '' + size, value_count = Trie.header_struct.unpack_from(serialized, base_offset) + offset = base_offset + Trie.header_struct.size + + # print values, if any + if value_count: + out += ' [' + for i in range(value_count): + if i: out += ', ' + out += str(Trie.value_struct.unpack_from(serialized, offset)[0]) + offset += Trie.value_struct.size + out += ']' + + # print children + if base_offset + size - offset > 4: draw_pipe = True + newline = False + while offset < base_offset + size: + if newline or value_count: + out += '\n' + out += indent + out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8') + child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00ffffff + offset += Trie.child_struct.size + out += _pretty_print(serialized, child_offset, indent + ('|' if draw_pipe else ' '), False) + newline = True + + return out + +def pretty_print(serialized: bytes) -> str: + return _pretty_print(serialized, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', False) + +class Serialization(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.maxDiff = None + + def compare(self, serialized: bytes, expected: str): + pretty = pretty_print(serialized) + #print(pretty) + self.assertEqual(pretty, expected.strip()) + + def test_empty(self): + trie = Trie() + + serialized = trie.serialize() + self.compare(serialized, "") + self.assertEqual(len(serialized), 6) + + def test_single(self): + trie = Trie() + trie.insert("magnum", 1337) + trie.insert("magnum", 21) + + serialized = trie.serialize() + self.compare(serialized, """ +magnum [1337, 21] +""") + self.assertEqual(len(serialized), 46) + + def test_multiple(self): + trie = Trie() + + trie.insert("math", 0) + trie.insert("math::vector", 1) + trie.insert("vector", 1) + 
trie.insert("math::range", 2) + trie.insert("range", 2) + + trie.insert("math::min", 3) + trie.insert("min", 3) + trie.insert("math::max", 4) + trie.insert("max", 4) + trie.insert("math::minmax", 5) + trie.insert("minmax", 5) + + trie.insert("math::vector::minmax", 6) + trie.insert("vector::minmax", 6) + trie.insert("minmax", 6) + trie.insert("math::vector::min", 7) + trie.insert("vector::min", 7) + trie.insert("min", 7) + trie.insert("math::vector::max", 8) + trie.insert("vector::max", 8) + trie.insert("max", 8) + + trie.insert("math::range::min", 9) + trie.insert("range::min", 9) + trie.insert("min", 9) + + trie.insert("math::range::max", 10) + trie.insert("range::max", 10) + trie.insert("max", 10) + + serialized = trie.serialize() + self.compare(serialized, """ +math [0] +||| ::vector [1] +||| | ::min [7] +||| | | max [6] +||| | ax [8] +||| range [2] +||| | ::min [9] +||| | ax [10] +||| min [3] +||| || max [5] +||| |ax [4] +||x [4, 8, 10] +|in [3, 7, 9] +|| max [5, 6] +vector [1] +| ::min [7] +| | max [6] +| ax [8] +range [2] +| ::min [9] +| ax [10] +""") + self.assertEqual(len(serialized), 514)