From: Vladimír Vondruš
Date: Sat, 8 Jan 2022 19:49:26 +0000 (+0100)
Subject: documentation: parametrize the search binary data type sizes.
X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=b0cf44e4ddbf42ce79a8612563e84e00e8a75808;p=blog.git

documentation: parametrize the search binary data type sizes.

Needed in order to support more than 65k symbols or files larger than 16 MB.
What I thought was "more than enough" during the initial design was quickly
stepped over by various projects, including my own Magnum Python bindings.

To avoid having to either maintain two separate formats and two separate
en/decoders or needlessly inflate the format for everyone, certain data types
are parametrized based on how large the data is:

* RESULT_ID_BYTES describes how many bytes are needed to store result IDs.
  By default it's 2 (allowing 65536 results), but it can also be 3 (16M
  results) or 4.
* FILE_OFFSET_BYTES describes how many bytes are needed to store file
  offsets. By default it's 3 (allowing 16 MB files), but it can also be 4.
* NAME_SIZE_BYTES describes how many bytes are needed to store various name
  lengths (prefix length, suffix length etc.). By default it's 1 (names up to
  256 bytes), but it can also be 2.

At first I tried to preserve 32-bit alignment as much as possible, but
eventually realized this is completely unimportant in the browser
environment -- there are far worse performance pitfalls than reading an
unaligned value. This is also why there are 24-bit integer types, even though
they're quite annoying to pack from Python.

Furthermore, the original hack that reserved 11 bits for the result count at
the cost of having only 4 bits for the child count was replaced: the result
count now expands to a 15-bit value if there are more than 127 results. Some
endianness tricks are involved, but it's much cleaner than before. I briefly
considered having a global RESULT_COUNT_BYTES parameter as well, but
considering that >90% of result counts fit into 8 bits and this is only for
weird outliers like Python __init__(), it would be a giant waste of precious
bytes.

The minor differences in the test file sizes are due to:

* The header expanding the symbol count from 16 to 32 bits (+2B)
* The header containing the type description and associated padding (+4B)
* The result map no longer packing flags and offsets together, thus saving
  one byte from flags (-1B)

To ensure no hardcoded type size assumptions remain, the tests now go through
all type size combinations.
---

diff --git a/documentation/_search.py b/documentation/_search.py
index 1a1d893e..a2d44556 100644
--- a/documentation/_search.py
+++ b/documentation/_search.py
@@ -29,14 +29,263 @@ import base64
 import enum
 import struct
 from types import SimpleNamespace as Empty
-from typing import List, Tuple
+from typing import List, Tuple, Union

 # Version 0 was without the type map
-searchdata_format_version = 1
+searchdata_format_version = 2
 search_filename = f'search-v{searchdata_format_version}.js'
 searchdata_filename = f'{{search_filename_prefix}}-v{searchdata_format_version}.bin'
 searchdata_filename_b85 = f'{{search_filename_prefix}}-v{searchdata_format_version}.js'

+# In order to be both space-efficient and flexible enough to accomodate for
+# larger projects, the bit counts for particular data types can vary in each
+# file. There's the following categories:
+#
+# - NAME_SIZE_BITS, how many bits is needed to store name lengths (such as
+# prefix length). Can be either 8 or 16.
+# - RESULT_ID_BITS, how many bits is needed for IDs pointing into the result
+# map. 
Can be either 16, 24 or 32. +# - FILE_OFFSET_BITS, how many bits is needed to store general offsets into +# the file. Can be either 24 or 32. +# +# Whole file encoding +# =================== +# +# magic | version | type | not | symbol | result | type | trie | result | type +# 'MCS' | (0x02) | data | used | count | map | map | data | map | map +# | | | | | offset | offset | | data | data +# 24b | 8b | 8b | 24b | 32b | 32b | 32b | … | … | … +# +# The type data encode the NAME_SIZE_BITS, RESULT_ID_BITS and +# FILE_OFFSET_BITS: +# +# not | NAME_SIZE_BITS | RESULT_ID_BITS | FILE_OFFSET_BITS +# used | 0b0 = 8b | 0b00 = 16b | 0b0 = 24b +# | 0b1 = 16b | 0b01 = 24b | 0b1 = 32b +# | | 0b10 = 32b | +# 4b | 1b | 2b | 1b +# +# Trie encoding +# ============= +# +# Because child tries are serialized first, the trie containing the initial +# characters is never the first, and instead the root offset points to it. If +# result count < 128: +# +# root | | header | results | children +# offset | … | | result # | child # | … | data +# 32b | |0| 7b | 8b | n*RESULT_ID_BITS | … +# +# If result count > 127, it's instead this -- since entries with very large +# number of results (such as python __init__()) are rather rare, it doesn't +# make sense to have it globally configurable and then waste 8 bits in the +# majority of cases. Note that the 15-bit value is stored as Big-Endian, +# otherwise the leftmost bit couldn't be used to denote the size. +# +# root | | header | results | children +# offset | … | | result # | child # | … | data +# 32b | |1| 15b (BE) | 8b | n*RESULT_ID_BITS | … +# +# Trie children data encoding, the barrier is stored in the topmost offset bit: +# +# child 1 | child 2 | | child 1 | child 2 | +# char | char | … | barrier + offset | barrier + offset | … +# 8b | 8b | | FILE_OFFSET_BITS | FILE_OFFSET_BITS | +# +# Result map encoding +# =================== +# +# First all flags, then all offsets, so we don't need to have weird paddings or +# alignments. The "file size" is there so size of item N can be always +# retrieved as `offsets[N + 1] - offsets[N]` +# +# item | file | item | item 1 | item 2 | +# offsets | size | flags | data | data | … +# n*FILE_OFFSET_BITS | FILE_OFFSET_BITS | n*8b | | | +# +# Basic item data (flags & 0b11 == 0b00): +# +# name | \0 | URL +# | | +# | 8b | +# +# Suffixed item data (flags & 0b11 == 0b01): +# +# suffix | name | \0 | URL +# length | | | +# NAME_SIZE_BITS | | 8b | +# +# Prefixed item data (flags & 0xb11 == 0b10): +# +# prefix | prefix | name | \0 | URL +# id | length | suffix | | suffix +# RESULT_ID_BITS | NAME_SIZE_BITS | | 8b | +# +# Prefixed & suffixed item (flags & 0xb11 == 0b11): +# +# prefix | prefix | suffix | name | \0 | URL +# id | length | length | suffix | | +# RESULT_ID_BITS | NAME_SIZE_BITS | NAME_SIZE_BITS | | 8b | +# +# Alias item (flags & 0xf0 == 0x00), flags & 0xb11 then denote what's in the +# `…` portion, alias have no URL so the alias name is in place of it: +# +# alias | | alias +# id | … | name +# RESULT_ID_BITS | | +# +# Type map encoding +# ================= +# +# Again the "end offset" is here so size of type N can be always retrieved as +# `offsets[N + 1] - offsets[N]`. Type names are not expected to have more than +# 255 chars, so NAME_SIZE_BITS is not used here. 
+# +# type 1 | type 2 | | | | type 1 | +# class | name | class | name | … | padding | end | name | … +# ID | offset | ID | offset | | | offset | data | +# 8b | 8b | 8b | 8b | | 8b | 8b | | + +class Serializer: + # This is currently hardcoded + result_map_flag_bytes = 1 + + header_struct = struct.Struct('<3sBBxxxIII') + result_map_flags_struct = struct.Struct('= child_barrier_mask: raise OverflowError + out += (offset | (barrier*child_barrier_mask)).to_bytes(self.file_offset_bytes, byteorder='little') + return out + + def pack_type_map_entry(self, class_: int, offset: int): + return self.type_map_entry_struct.pack(class_, offset) + +class Deserializer: + def __init__(self, *, file_offset_bytes, result_id_bytes, name_size_bytes): + assert file_offset_bytes in [3, 4] + self.file_offset_bytes = file_offset_bytes + + assert result_id_bytes in [2, 3, 4] + self.result_id_bytes = result_id_bytes + + assert name_size_bytes in [1, 2] + self.name_size_bytes = name_size_bytes + + @classmethod + def from_serialized(self, serialized: bytes): + magic, version, type_data, symbol_count, map_offset, type_map_offset = Serializer.header_struct.unpack_from(serialized) + assert magic == b'MCS' + assert version == searchdata_format_version + out = Deserializer( + file_offset_bytes=[3, 4][(type_data & 0b0001) >> 0], + result_id_bytes=[2, 3, 4][(type_data & 0b0110) >> 1], + name_size_bytes=[1, 2][(type_data & 0b1000) >> 3]) + out.symbol_count = symbol_count + out.map_offset = map_offset + out.type_map_offset = type_map_offset + return out + + # The last tuple item is number of bytes extracted + def unpack_result_map_flags(self, serialized: bytes, offset: int) -> Tuple[int, int]: + return Serializer.result_map_flags_struct.unpack_from(serialized, offset) + (Serializer.result_map_flags_struct.size, ) + def unpack_result_map_offset(self, serialized: bytes, offset: int) -> Tuple[int, int]: + return int.from_bytes(serialized[offset:offset + self.file_offset_bytes], byteorder='little'), self.file_offset_bytes + def unpack_result_map_prefix(self, serialized: bytes, offset: int) -> Tuple[int, int, int]: + return int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little'), int.from_bytes(serialized[offset + self.result_id_bytes:offset + self.result_id_bytes + self.name_size_bytes], byteorder='little'), self.result_id_bytes + self.name_size_bytes + def unpack_result_map_suffix_length(self, serialized: bytes, offset: int) -> Tuple[int, int]: + return int.from_bytes(serialized[offset:offset + self.name_size_bytes], byteorder='little'), self.name_size_bytes + def unpack_result_map_alias(self, serialized: bytes, offset: int) -> Tuple[int, int]: + return int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little'), self.result_id_bytes + + def unpack_trie_root_offset(self, serialized: bytes, offset: int) -> Tuple[int, int]: + return Serializer.trie_root_offset_struct.unpack_from(serialized, offset) + (Serializer.trie_root_offset_struct.size, ) + def unpack_trie_node(self, serialized: bytes, offset: int) -> Tuple[List[int], List[int], List[Tuple[int, int, bool]], int]: + prev_offset = offset + # Result count, first try 8-bit, if it has the highest bit set, extract + # two bytes (as a BE) and then remove the highest bit + result_count = int.from_bytes(serialized[offset:offset + 1], byteorder='little') + if result_count & 0x80: + result_count = int.from_bytes(serialized[offset:offset + 2], byteorder='big') & ~0x8000 + offset += 1 + offset += 1 + child_count = 
int.from_bytes(serialized[offset:offset + 1], byteorder='little') + offset += 1 + + # Unpack all result IDs + result_ids = [] + for i in range(result_count): + result_ids += [int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little')] + offset += self.result_id_bytes + + # Unpack all child chars + child_chars = list(serialized[offset:offset + child_count]) + offset += child_count + + # Unpack all children offsets and lookahead barriers + child_chars_offsets_barriers = [] + child_barrier_mask = 1 << (self.file_offset_bytes*8 - 1) + for i in range(child_count): + child_offset_barrier = int.from_bytes(serialized[offset:offset + self.file_offset_bytes], byteorder='little') + child_chars_offsets_barriers += [(child_chars[i], child_offset_barrier & ~child_barrier_mask, bool(child_offset_barrier & child_barrier_mask))] + offset += self.file_offset_bytes + + return result_ids, child_chars_offsets_barriers, offset - prev_offset + + def unpack_type_map_entry(self, serialized: bytes, offset: int) -> Tuple[int, int, int]: + return Serializer.type_map_entry_struct.unpack_from(serialized, offset) + (Serializer.type_map_entry_struct.size, ) + class CssClass(enum.Enum): DEFAULT = 0 PRIMARY = 1 @@ -87,50 +336,7 @@ class ResultFlag(enum.Flag): _TYPE14 = 14 << 4 _TYPE15 = 15 << 4 -# Result map encoding -- the "file size" is there so size of item N can be -# always retrieved as `offsets[N + 1] - offsets[N]` -# -# item 1 flags | item 2 flags | | item N flags | file | item 1 | -# + offset | + offset | … | + offset | size | data | … -# 8 + 24b | 8 + 24b | | 8 + 24b | 32b | | -# -# basic item (flags & 0b11 == 0b00): -# -# name | \0 | URL -# | | -# | 8b | -# -# suffixed item (flags & 0b11 == 0b01): -# -# suffix | name | \0 | URL -# length | | | -# 8b | | 8b | -# -# prefixed item (flags & 0xb11 == 0b10): -# -# prefix | name | \0 | URL -# id + len | suffix | | suffix -# 16b + 8b | | 8b | -# -# prefixed & suffixed item (flags & 0xb11 == 0b11): -# -# prefix | suffix | name | \0 | URL -# id + len | length | suffix | | -# 16b + 8b | 8b | | 8b | -# -# alias item (flags & 0xf0 == 0x00), flags & 0xb11 then denote what's in the -# `…` portion, alias have no URL so the alias name is in place of it: -# -# alias | | alias -# id | … | name -# 16b | | class ResultMap: - offset_struct = struct.Struct(' bytearray: - output = bytearray() - + def serialize(self, serializer: Serializer, merge_prefixes=True) -> bytearray: if merge_prefixes: # Put all entry names into a trie to discover common prefixes trie = Trie() @@ -225,25 +429,24 @@ class ResultMap: # Everything merged, replace the original list self.entries = merged - # Write the offset array. Starting offset for items is after the offset - # array and the file size - offset = (len(self.entries) + 1)*4 + # Write the offset array. Starting offset for items is after the + # (aligned) flag array and (aligned) offset + file size array. 
+ output = bytearray() + offset = len(self.entries)*serializer.result_map_flag_bytes + (len(self.entries) + 1)*serializer.file_offset_bytes for e in self.entries: - assert offset < 2**24 - output += self.offset_struct.pack(offset) - self.flags_struct.pack_into(output, len(output) - 1, e.flags.value) + output += serializer.pack_result_map_offset(offset) # The entry is an alias, extra field for alias index if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS: - offset += self.alias_struct.size + offset += serializer.result_id_bytes # Extra field for prefix index and length if e.flags & ResultFlag.HAS_PREFIX: - offset += self.prefix_struct.size + offset += serializer.result_id_bytes + serializer.name_size_bytes # Extra field for suffix length if e.flags & ResultFlag.HAS_SUFFIX: - offset += self.suffix_length_struct.size + offset += serializer.name_size_bytes # Length of the name offset += len(e.name.encode('utf-8')) @@ -254,18 +457,22 @@ class ResultMap: offset += len(e.url.encode('utf-8')) + 1 # Write file size - output += self.offset_struct.pack(offset) + output += serializer.pack_result_map_offset(offset) + + # Write the flag array + for e in self.entries: + output += serializer.pack_result_map_flags(e.flags.value) # Write the entries themselves for e in self.entries: if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS: assert not e.alias is None assert not e.url - output += self.alias_struct.pack(e.alias) + output += serializer.pack_result_map_alias(e.alias) if e.flags & ResultFlag.HAS_PREFIX: - output += self.prefix_struct.pack(e.prefix, e.prefix_length) + output += serializer.pack_result_map_prefix(e.prefix, e.prefix_length) if e.flags & ResultFlag.HAS_SUFFIX: - output += self.suffix_length_struct.pack(e.suffix_length) + output += serializer.pack_result_map_suffix_length(e.suffix_length) output += e.name.encode('utf-8') if e.url: output += b'\0' @@ -274,31 +481,21 @@ class ResultMap: assert len(output) == offset return output -# Trie encoding: -# -# root | | header | results | child 1 | child 1 | child 1 | -# offset | … | | result # | child # | … | char | barrier | offset | … -# 32b | |0| 7b | 8b | n*16b | 8b | 1b | 23b | -# -# if result count > 127, it's instead: -# -# root | | header | results | child 1 | child 1 | child 1 | -# offset | … | | result # | child # | … | char | barrier | offset | … -# 32b | |1| 11b | 4b | n*16b | 8b | 1b | 23b | class Trie: - root_offset_struct = struct.Struct(' int: + def _serialize(self, serializer: Serializer, hashtable, output: bytearray, merge_subtrees) -> int: # Serialize all children first - child_offsets = [] + child_chars_offsets_barriers = [] for char, child in self.children.items(): - offset = child[1]._serialize(hashtable, output, merge_subtrees=merge_subtrees) - child_offsets += [(char, child[0], offset)] - - # Serialize this node. Sometimes we'd have an insane amount of results - # (such as Python's __init__), but very little children to go with - # that. Then we can make the result count storage larger (11 bits, - # 2048 results) and the child count storage smaller (4 bits, 16 - # children). Hopefully that's enough. The remaining leftmost bit is - # used as an indicator of this shifted state. 
- serialized = bytearray() - if len(self.results) > 127: - assert len(self.children) < 16 and len(self.results) < 2048 - result_count = (len(self.results) & 0x7f) | 0x80 - children_count = ((len(self.results) & 0xf80) >> 3) | len(self.children) - else: - result_count = len(self.results) - children_count = len(self.children) - serialized += self.header_struct.pack(result_count, children_count) - for v in self.results: - serialized += self.result_struct.pack(v) - - # Serialize child offsets - for char, lookahead_barrier, abs_offset in child_offsets: - assert abs_offset < 2**23 - - # write them over each other because that's the only way to pack - # a 24 bit field - offset = len(serialized) - serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23)) - self.child_char_struct.pack_into(serialized, offset + 3, char) + offset = child[1]._serialize(serializer, hashtable, output, merge_subtrees=merge_subtrees) + child_chars_offsets_barriers += [(char, offset, child[0])] + + # Serialize this node + serialized = serializer.pack_trie_node(self.results, child_chars_offsets_barriers) # Subtree merging: if this exact tree is already in the table, return # its offset. Otherwise add it and return the new offset. @@ -389,21 +561,13 @@ class Trie: if merge_subtrees: hashtable[hashable] = offset return offset - def serialize(self, merge_subtrees=True) -> bytearray: + def serialize(self, serializer: Serializer, merge_subtrees=True) -> bytearray: output = bytearray(b'\x00\x00\x00\x00') hashtable = {} - self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output, merge_subtrees=merge_subtrees)) + output[0:4] = serializer.pack_trie_root_offset(self._serialize(serializer, hashtable, output, merge_subtrees=merge_subtrees)) return output -# Type map encoding: -# -# type 1 | type 2 | | | | type 1 | -# class | name | class | name | … | padding | end | name | … -# ID | offset | ID | offset | | | offset | data | -# 8b | 8b | 8b | 8b | | 8b | 8b | | -type_map_entry_struct = struct.Struct(' bytearray: +def serialize_type_map(serializer: Serializer, map: List[Tuple[CssClass, str]]) -> bytearray: serialized = bytearray() names = bytearray() @@ -412,42 +576,31 @@ def serialize_type_map(map: List[Tuple[CssClass, str]]) -> bytearray: assert len(map) <= 15 # Initial name offset is after all the offset entries plus the final one - initial_name_offset = (len(map) + 1)*type_map_entry_struct.size + initial_name_offset = (len(map) + 1)*serializer.type_map_entry_struct.size # Add all entries (and the final offset), encode the names separately, # concatenate at the end for css_class, name in map: - serialized += type_map_entry_struct.pack(css_class.value, initial_name_offset + len(names)) + serialized += serializer.pack_type_map_entry(css_class.value, initial_name_offset + len(names)) names += name.encode('utf-8') - serialized += type_map_entry_struct.pack(0, initial_name_offset + len(names)) + serialized += serializer.pack_type_map_entry(0, initial_name_offset + len(names)) assert len(serialized) == initial_name_offset return serialized + names -# Whole file encoding: -# -# magic | version | symbol | result | type | trie | result | type -# header | | count | map | map | data | map | map -# | | | offset | offset | | data | data -# 24b | 8b | 16b | 32b | 32b | … | … | … -search_data_header_struct = struct.Struct('<3sBHII') - -def serialize_search_data(trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> 
bytearray: - serialized_trie = trie.serialize(merge_subtrees=merge_subtrees) - serialized_map = map.serialize(merge_prefixes=merge_prefixes) - serialized_type_map = serialize_type_map(type_map) +def serialize_search_data(serializer: Serializer, trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> bytearray: + serialized_trie = trie.serialize(serializer, merge_subtrees=merge_subtrees) + serialized_map = map.serialize(serializer, merge_prefixes=merge_prefixes) + serialized_type_map = serialize_type_map(serializer, type_map) - preamble = search_data_header_struct.pack(b'MCS', - searchdata_format_version, symbol_count, - search_data_header_struct.size + len(serialized_trie), - search_data_header_struct.size + len(serialized_trie) + len(serialized_map)) + preamble = serializer.pack_header(symbol_count, len(serialized_trie), len(serialized_map)) return preamble + serialized_trie + serialized_map + serialized_type_map def base85encode_search_data(data: bytearray) -> bytearray: return (b"/* Generated by https://mcss.mosra.cz/documentation/doxygen/. Do not edit. */\n" + b"Search.load('" + base64.b85encode(data, True) + b"');\n") -def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, *, show_merged, show_lookahead_barriers, color_map) -> str: +def _pretty_print_trie(deserializer: Deserializer, serialized: bytearray, hashtable, stats, base_offset, indent, *, show_merged, show_lookahead_barriers, color_map) -> str: # Visualize where the trees were merged if show_merged and base_offset in hashtable: return color_map['red'] + '#' + color_map['reset'] @@ -455,46 +608,35 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind stats.node_count += 1 out = '' - result_count, child_count = Trie.header_struct.unpack_from(serialized, base_offset) - # If result count has the high bit set, it's stored in 11 bits and child - # count in 4 bits instead of 7 + 8 - if result_count & 0x80: - result_count = (result_count & 0x7f) | ((child_count & 0xf0) << 3) - child_count = child_count & 0x0f - stats.max_node_results = max(result_count, stats.max_node_results) - stats.max_node_children = max(child_count, stats.max_node_children) - offset = base_offset + Trie.header_struct.size + result_ids, child_chars_offsets_barriers, offset = deserializer.unpack_trie_node(serialized, base_offset) + + stats.max_node_results = max(len(result_ids), stats.max_node_results) + stats.max_node_children = max(len(child_chars_offsets_barriers), stats.max_node_children) # print results, if any - if result_count: + if result_ids: out += color_map['blue'] + ' [' - for i in range(result_count): + for i, result in enumerate(result_ids): if i: out += color_map['blue']+', ' - result = Trie.result_struct.unpack_from(serialized, offset)[0] stats.max_node_result_index = max(result, stats.max_node_result_index) out += color_map['cyan'] + str(result) - offset += Trie.result_struct.size out += color_map['blue'] + ']' # print children, if any - for i in range(child_count): - if result_count or i: + for i, (char, offset, barrier) in enumerate(child_chars_offsets_barriers): + if len(result_ids) or i: out += color_map['reset'] + '\n' out += color_map['blue'] + indent + color_map['white'] - char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0] if char <= 127: out += chr(char) else: out += color_map['reset'] + hex(char) - if (show_lookahead_barriers and Trie.child_struct.unpack_from(serialized, offset)[0] & 
0x00800000): + if (show_lookahead_barriers and barrier): out += color_map['green'] + '$' - if char > 127 or (show_lookahead_barriers and Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000): + if char > 127 or (show_lookahead_barriers and barrier): out += color_map['reset'] + '\n' + color_map['blue'] + indent + ' ' + color_map['white'] - child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff - stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset) - offset += Trie.child_struct.size - out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if child_count > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map) - child_count += 1 + stats.max_node_child_offset = max(offset, stats.max_node_child_offset) + out += _pretty_print_trie(deserializer, serialized, hashtable, stats, offset, indent + ('|' if len(child_chars_offsets_barriers) > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map) hashtable[base_offset] = True return out @@ -515,7 +657,7 @@ color_map_dummy = {'blue': '', 'yellow': '', 'reset': ''} -def pretty_print_trie(serialized: bytes, *, show_merged=False, show_lookahead_barriers=True, colors=False): +def pretty_print_trie(deserializer: Deserializer, serialized: bytes, *, show_merged=False, show_lookahead_barriers=True, colors=False): color_map = color_map_colors if colors else color_map_dummy hashtable = {} @@ -527,7 +669,7 @@ def pretty_print_trie(serialized: bytes, *, show_merged=False, show_lookahead_ba stats.max_node_result_index = 0 stats.max_node_child_offset = 0 - out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map) + out = _pretty_print_trie(deserializer, serialized, hashtable, stats, deserializer.unpack_trie_root_offset(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map) if out: out = color_map['white'] + out stats = """ node count: {} @@ -537,59 +679,61 @@ max node result index: {} max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_results, stats.max_node_children, stats.max_node_result_index, stats.max_node_child_offset) return out, stats -def pretty_print_map(serialized: bytes, *, entryTypeClass, colors=False): +def pretty_print_map(deserializer: Deserializer, serialized: bytes, *, entryTypeClass, colors=False): color_map = color_map_colors if colors else color_map_dummy # The first item gives out offset of first value, which can be used to # calculate total value count - offset = ResultMap.offset_struct.unpack_from(serialized, 0)[0] & 0x00ffffff - size = int(offset/4 - 1) + offset, offset_size = deserializer.unpack_result_map_offset(serialized, 0) + size = int((offset - offset_size)/(offset_size + Serializer.result_map_flag_bytes)) + flags_offset = (size + 1)*offset_size out = '' for i in range(size): if i: out += '\n' - flags = ResultFlag(ResultMap.flags_struct.unpack_from(serialized, i*4 + 3)[0]) + flags = ResultFlag(deserializer.unpack_result_map_flags(serialized, flags_offset + i*Serializer.result_map_flag_bytes)[0]) extra = [] if flags & ResultFlag._TYPE == ResultFlag.ALIAS: - extra += ['alias={}'.format(ResultMap.alias_struct.unpack_from(serialized, offset)[0])] - offset += ResultMap.alias_struct.size + alias, alias_bytes = 
deserializer.unpack_result_map_alias(serialized, offset) + extra += ['alias={}'.format(alias)] + offset += alias_bytes if flags & ResultFlag.HAS_PREFIX: - extra += ['prefix={}[:{}]'.format(*ResultMap.prefix_struct.unpack_from(serialized, offset))] - offset += ResultMap.prefix_struct.size + prefix_id, prefix_length, prefix_bytes = deserializer.unpack_result_map_prefix(serialized, offset) + extra += ['prefix={}[:{}]'.format(prefix_id, prefix_length)] + offset += prefix_bytes if flags & ResultFlag.HAS_SUFFIX: - extra += ['suffix_length={}'.format(ResultMap.suffix_length_struct.unpack_from(serialized, offset)[0])] - offset += ResultMap.suffix_length_struct.size + suffix_length, suffix_bytes = deserializer.unpack_result_map_suffix_length(serialized, offset) + extra += ['suffix_length={}'.format(suffix_length)] + offset += suffix_bytes if flags & ResultFlag.DEPRECATED: extra += ['deprecated'] if flags & ResultFlag.DELETED: extra += ['deleted'] if flags & ResultFlag._TYPE: extra += ['type={}'.format(entryTypeClass(flags.type).name)] - next_offset = ResultMap.offset_struct.unpack_from(serialized, (i + 1)*4)[0] & 0x00ffffff + next_offset = deserializer.unpack_result_map_offset(serialized, (i + 1)*offset_size)[0] name, _, url = serialized[offset:next_offset].partition(b'\0') out += color_map['cyan'] + str(i) + color_map['blue'] + ': ' + color_map['white'] + name.decode('utf-8') + color_map['blue'] + ' [' + color_map['yellow'] + (color_map['blue'] + ', ' + color_map['yellow']).join(extra) + color_map['blue'] + '] ->' + (' ' + color_map['reset'] + url.decode('utf-8') if url else '') offset = next_offset return out -def pretty_print_type_map(serialized: bytes, *, entryTypeClass): +def pretty_print_type_map(deserializer: Deserializer, serialized: bytes, *, entryTypeClass): # Unpack until we aren't at EOF i = 0 out = '' - class_id, offset = type_map_entry_struct.unpack_from(serialized, 0) - while offset < len(serialized): + class_id, name_offset, type_map_bytes = deserializer.unpack_type_map_entry(serialized, 0) + while name_offset < len(serialized): if i: out += ',\n' - next_class_id, next_offset = type_map_entry_struct.unpack_from(serialized, (i + 1)*type_map_entry_struct.size) - out += "({}, {}, '{}')".format(entryTypeClass(i + 1), CssClass(class_id), serialized[offset:next_offset].decode('utf-8')) + next_class_id, next_name_offset = deserializer.unpack_type_map_entry(serialized, (i + 1)*type_map_bytes)[:2] + out += "({}, {}, '{}')".format(entryTypeClass(i + 1), CssClass(class_id), serialized[name_offset:next_name_offset].decode('utf-8')) i += 1 - class_id, offset = next_class_id, next_offset + class_id, name_offset = next_class_id, next_name_offset return out def pretty_print(serialized: bytes, *, entryTypeClass, show_merged=False, show_lookahead_barriers=True, colors=False): - magic, version, symbol_count, map_offset, type_map_offset = search_data_header_struct.unpack_from(serialized) - assert magic == b'MCS' - assert version == searchdata_format_version - - pretty_trie, stats = pretty_print_trie(serialized[search_data_header_struct.size:map_offset], show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, colors=colors) - pretty_map = pretty_print_map(serialized[map_offset:type_map_offset], entryTypeClass=entryTypeClass, colors=colors) - pretty_type_map = pretty_print_type_map(serialized[type_map_offset:], entryTypeClass=entryTypeClass) - return '{} symbols\n'.format(symbol_count) + pretty_trie + '\n' + pretty_map + '\n' + pretty_type_map, stats + deserializer = 
Deserializer.from_serialized(serialized) + + pretty_trie, stats = pretty_print_trie(deserializer, serialized[Serializer.header_struct.size:deserializer.map_offset], show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, colors=colors) + pretty_map = pretty_print_map(deserializer, serialized[deserializer.map_offset:deserializer.type_map_offset], entryTypeClass=entryTypeClass, colors=colors) + pretty_type_map = pretty_print_type_map(deserializer, serialized[deserializer.type_map_offset:], entryTypeClass=entryTypeClass) + return '{} symbols\n'.format(deserializer.symbol_count) + pretty_trie + '\n' + pretty_map + '\n' + pretty_type_map, stats diff --git a/documentation/doxygen.py b/documentation/doxygen.py index 822dca78..973c3973 100755 --- a/documentation/doxygen.py +++ b/documentation/doxygen.py @@ -49,7 +49,7 @@ from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import TextLexer, BashSessionLexer, get_lexer_by_name, find_lexer_class_for_filename -from _search import CssClass, ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, search_filename, searchdata_filename, searchdata_filename_b85, searchdata_format_version +from _search import CssClass, ResultFlag, ResultMap, Trie, Serializer, serialize_search_data, base85encode_search_data, search_filename, searchdata_filename, searchdata_filename_b85, searchdata_format_version sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins')) import dot2svg @@ -2440,7 +2440,7 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers= # order by default trie.sort(map) - return serialize_search_data(trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) + return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) def parse_xml(state: State, xml: str): # Reset counter for unique math formulas diff --git a/documentation/python.py b/documentation/python.py index 1d2dfe12..2d048858 100755 --- a/documentation/python.py +++ b/documentation/python.py @@ -54,7 +54,7 @@ from docutils.transforms import Transform import jinja2 -from _search import CssClass, ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, searchdata_format_version, search_filename, searchdata_filename, searchdata_filename_b85 +from _search import CssClass, ResultFlag, ResultMap, Trie, Serializer, serialize_search_data, base85encode_search_data, searchdata_format_version, search_filename, searchdata_filename, searchdata_filename_b85 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins')) import m.htmlsanity @@ -2454,7 +2454,7 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers= # order by default trie.sort(map) - return serialize_search_data(trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) + return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) def run(basedir, config, *, templates=default_templates, search_add_lookahead_barriers=True, search_merge_subtrees=True, search_merge_prefixes=True): # Populate the INPUT, if not specified, make it absolute diff --git 
a/documentation/search.js b/documentation/search.js index 87a3ed7d..8cc2a3d4 100644 --- a/documentation/search.js +++ b/documentation/search.js @@ -25,15 +25,27 @@ "use strict"; /* it summons the Cthulhu in a proper way, they say */ var Search = { - formatVersion: 1, /* the data filename contains this number too */ + formatVersion: 2, /* the data filename contains this number too */ dataSize: 0, /* used mainly by tests, not here */ symbolCount: '…', trie: null, map: null, + mapFlagsOffset: null, typeMap: null, maxResults: 0, + /* Type sizes and masks. The data is always fetched as 16/32bit number and + then masked to 1, 2, 3 or 4 bytes. Fortunately on LE a mask is enough, + on BE we'd have to read N bytes before and then mask. */ + nameSizeBytes: null, + nameSizeMask: null, + resultIdBytes: null, + resultIdMask: null, + fileOffsetBytes: null, + fileOffsetMask: null, + lookaheadBarrierMask: null, + /* Always contains at least the root node offset and then one node offset per entered character */ searchString: '', @@ -57,7 +69,7 @@ var Search = { /* The file is too short to contain at least the headers and empty sections */ - if(view.byteLength < 26) { + if(view.byteLength < 31) { console.error("Search data too short"); return false; } @@ -74,16 +86,61 @@ var Search = { return false; } - /* Separate the data into the trie and the result / type map */ - let mapOffset = view.getUint32(6, true); - let typeMapOffset = view.getUint32(10, true); - this.trie = new DataView(buffer, 14, mapOffset - 14); - this.map = new DataView(buffer, mapOffset, typeMapOffset - mapOffset); + /* Fetch type sizes. The only value that can fail is result ID byte + count, where value of 3 has no assigned meaning. */ + let typeSizes = view.getUint8(4, true); + if((typeSizes & 0x01) >> 0 == 0) { + this.fileOffsetBytes = 3; + this.fileOffsetMask = 0x00ffffff; + this.lookaheadBarrierMask = 0x00800000; + } else /* (typeSizes & 0x01) >> 0 == 1 */ { + this.fileOffsetBytes = 4; + this.fileOffsetMask = 0xffffffff; + this.lookaheadBarrierMask = 0x80000000; + } + if((typeSizes & 0x06) >> 1 == 0) { + this.resultIdBytes = 2; + this.resultIdMask = 0x0000ffff; + } else if((typeSizes & 0x06) >> 1 == 1) { + this.resultIdBytes = 3; + this.resultIdMask = 0x00ffffff; + } else if((typeSizes & 0x06) >> 1 == 2) { + this.resultIdBytes = 4; + this.resultIdMask = 0xffffffff; + } else /* (typeSizes & 0x06) >> 1 == 3 */ { + console.error("Invalid search data result ID byte value"); + return false; + } + if((typeSizes & 0x08) >> 3 == 0) { + this.nameSizeBytes = 1; + this.nameSizeMask = 0x00ff; + } else /* (typeSizes & 0x08) >> 3 == 1 */ { + this.nameSizeBytes = 2; + this.nameSizeMask = 0xffff; + } + + /* Separate the data into the trie and the result / type map. Because + we're reading larger values than there might be and then masking out + the high bytes, keep extra 1/2 byte padding at the end to avoid + OOB errors. 
*/ + let mapOffset = view.getUint32(12, true); + let typeMapOffset = view.getUint32(16, true); + /* There may be a 3-byte file offset at the end of the trie which we'll + read as 32-bit, add one safety byte in that case */ + this.trie = new DataView(buffer, 20, mapOffset - 20 + (4 - this.fileOffsetBytes)); + /* There may be a 3-byte file size (for zero results) which we'll read + as 32-bit, add one safety byte in that case */ + this.map = new DataView(buffer, mapOffset, typeMapOffset - mapOffset + (4 - this.fileOffsetBytes)); + /* No variable-size types in the type map at the moment */ this.typeMap = new DataView(buffer, typeMapOffset); + /* Offset of the first result map item is after N + 1 offsets and N + flags, calculate flag offset from that */ + this.mapFlagsOffset = this.fileOffsetBytes*(((this.map.getUint32(0, true) & this.fileOffsetMask) - this.fileOffsetBytes)/(this.fileOffsetBytes + 1) + 1); + /* Set initial properties */ this.dataSize = buffer.byteLength; - this.symbolCount = view.getUint16(4, true) + " symbols (" + Math.round(this.dataSize/102.4)/10 + " kB)"; + this.symbolCount = view.getUint32(8, true) + " symbols (" + Math.round(this.dataSize/102.4)/10 + " kB)"; this.maxResults = maxResults ? maxResults : 100; this.searchString = ''; this.searchStack = [this.trie.getUint32(0, true)]; @@ -257,23 +314,25 @@ var Search = { /* Calculate offset and count of children */ let offset = this.searchStack[this.searchStack.length - 1]; - /* Calculate child count. If there's a lot of results, the count - "leaks over" to the child count storage. */ + /* If there's a lot of results, the result count is a 16bit BE value + instead */ let resultCount = this.trie.getUint8(offset); - let childCount = this.trie.getUint8(offset + 1); + let resultCountSize = 1; if(resultCount & 0x80) { - resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3); - childCount = childCount & 0x0f; + resultCount = this.trie.getUint16(offset, false) & ~0x8000; + ++resultCountSize; } + let childCount = this.trie.getUint8(offset + resultCountSize); + /* Go through all children and find the next offset */ - let childOffset = offset + 2 + resultCount*2; + let childOffset = offset + resultCountSize + 1 + resultCount*this.resultIdBytes; let found = false; for(let j = 0; j != childCount; ++j) { - if(String.fromCharCode(this.trie.getUint8(childOffset + j*4 + 3)) != searchString[foundPrefix]) + if(String.fromCharCode(this.trie.getUint8(childOffset + j)) != searchString[foundPrefix]) continue; - this.searchStack.push(this.trie.getUint32(childOffset + j*4, true) & 0x007fffff); + this.searchStack.push(this.trie.getUint32(childOffset + childCount + j*this.fileOffsetBytes, true) & this.fileOffsetMask & ~this.lookaheadBarrierMask); found = true; break; } @@ -321,15 +380,17 @@ var Search = { "leaks over" to the child count storage. */ /* TODO: hmmm. this is helluvalot duplicated code. hmm. 
*/ let resultCount = this.trie.getUint8(offset); - let childCount = this.trie.getUint8(offset + 1); + let resultCountSize = 1; if(resultCount & 0x80) { - resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3); - childCount = childCount & 0x0f; + resultCount = this.trie.getUint16(offset, false) & ~0x8000; + ++resultCountSize; } + let childCount = this.trie.getUint8(offset + resultCountSize); + /* Populate the results with all values associated with this node */ for(let i = 0; i != resultCount; ++i) { - let index = this.trie.getUint16(offset + 2 + i*2, true); + let index = this.trie.getUint32(offset + resultCountSize + 1 + i*this.resultIdBytes, true) & this.resultIdMask; results.push(this.gatherResult(index, suffixLength, 0xffffff)); /* should be enough haha */ /* 'nuff said. */ @@ -338,15 +399,15 @@ var Search = { } /* Dig deeper */ - let childOffset = offset + 2 + resultCount*2; + let childOffset = offset + resultCountSize + 1 + resultCount*this.resultIdBytes; for(let j = 0; j != childCount; ++j) { - let offsetBarrier = this.trie.getUint32(childOffset + j*4, true); + let offsetBarrier = this.trie.getUint32(childOffset + childCount + j*this.fileOffsetBytes, true) & this.fileOffsetMask; /* Lookahead barrier, don't dig deeper */ - if(offsetBarrier & 0x00800000) continue; + if(offsetBarrier & this.lookaheadBarrierMask) continue; /* Append to the queue */ - leaves.push([offsetBarrier & 0x007fffff, suffixLength + 1]); + leaves.push([offsetBarrier & ~this.lookaheadBarrierMask, suffixLength + 1]); /* We don't have anything yet and this is the only path forward, add the char to suggested Tab autocompletion. Can't @@ -357,7 +418,7 @@ var Search = { absolutely unwanted when all I want is check for truncated UTF-8. */ if(!results.length && leaves.length == 1 && childCount == 1) - suggestedTabAutocompletionChars.push(this.trie.getUint8(childOffset + j*4 + 3)); + suggestedTabAutocompletionChars.push(this.trie.getUint8(childOffset + j)); } } @@ -365,38 +426,38 @@ var Search = { }, gatherResult: function(index, suffixLength, maxUrlPrefix) { - let flags = this.map.getUint8(index*4 + 3); - let resultOffset = this.map.getUint32(index*4, true) & 0x00ffffff; + let flags = this.map.getUint8(this.mapFlagsOffset + index); + let resultOffset = this.map.getUint32(index*this.fileOffsetBytes, true) & this.fileOffsetMask; /* The result is an alias, parse the aliased prefix */ let aliasedIndex = null; if((flags & 0xf0) == 0x00) { - aliasedIndex = this.map.getUint16(resultOffset, true); - resultOffset += 2; + aliasedIndex = this.map.getUint32(resultOffset, true) & this.resultIdMask; + resultOffset += this.resultIdBytes; } /* The result has a prefix, parse that first, recursively */ let name = ''; let url = ''; if(flags & (1 << 3)) { - let prefixIndex = this.map.getUint16(resultOffset, true); - let prefixUrlPrefixLength = Math.min(this.map.getUint8(resultOffset + 2), maxUrlPrefix); + let prefixIndex = this.map.getUint32(resultOffset, true) & this.resultIdMask; + let prefixUrlPrefixLength = Math.min(this.map.getUint16(resultOffset + this.resultIdBytes, true) & this.nameSizeMask, maxUrlPrefix); let prefix = this.gatherResult(prefixIndex, 0 /*ignored*/, prefixUrlPrefixLength); name = prefix.name; url = prefix.url; - resultOffset += 3; + resultOffset += this.resultIdBytes + this.nameSizeBytes; } /* The result has a suffix, extract its length */ let resultSuffixLength = 0; if(flags & (1 << 0)) { - resultSuffixLength = this.map.getUint8(resultOffset); - ++resultOffset; + resultSuffixLength = 
this.map.getUint16(resultOffset, true) & this.nameSizeMask; + resultOffset += this.nameSizeBytes; } - let nextResultOffset = this.map.getUint32((index + 1)*4, true) & 0x00ffffff; + let nextResultOffset = this.map.getUint32((index + 1)*this.fileOffsetBytes, true) & this.fileOffsetMask; /* Extract name */ let j = resultOffset; diff --git a/documentation/test/_search_test_metadata.py b/documentation/test/_search_test_metadata.py index 33711f50..89ee55b0 100644 --- a/documentation/test/_search_test_metadata.py +++ b/documentation/test/_search_test_metadata.py @@ -44,3 +44,48 @@ search_type_map = [ (CssClass.PRIMARY, "class"), (CssClass.INFO, "func") ] + +# Tries don't store any strings, so name_size_bytes can be whatever +trie_type_sizes = [ + {'file_offset_bytes': 3, + 'result_id_bytes': 2, + 'name_size_bytes': 1}, + {'file_offset_bytes': 3, + 'result_id_bytes': 3, + 'name_size_bytes': 1}, + {'file_offset_bytes': 3, + 'result_id_bytes': 4, + 'name_size_bytes': 1}, + + {'file_offset_bytes': 4, + 'result_id_bytes': 2, + 'name_size_bytes': 1}, + {'file_offset_bytes': 4, + 'result_id_bytes': 3, + 'name_size_bytes': 1}, + {'file_offset_bytes': 4, + 'result_id_bytes': 4, + 'name_size_bytes': 1}, +] + +type_sizes = trie_type_sizes + [ + {'file_offset_bytes': 3, + 'result_id_bytes': 2, + 'name_size_bytes': 2}, + {'file_offset_bytes': 3, + 'result_id_bytes': 3, + 'name_size_bytes': 2}, + {'file_offset_bytes': 3, + 'result_id_bytes': 4, + 'name_size_bytes': 2}, + + {'file_offset_bytes': 4, + 'result_id_bytes': 2, + 'name_size_bytes': 2}, + {'file_offset_bytes': 4, + 'result_id_bytes': 3, + 'name_size_bytes': 2}, + {'file_offset_bytes': 4, + 'result_id_bytes': 4, + 'name_size_bytes': 2}, +] diff --git a/documentation/test/js-test-data/empty-ns1-ri2-fo3.bin b/documentation/test/js-test-data/empty-ns1-ri2-fo3.bin new file mode 100644 index 00000000..d13e176c Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri2-fo3.bin differ diff --git a/documentation/test/js-test-data/empty-ns1-ri2-fo4.bin b/documentation/test/js-test-data/empty-ns1-ri2-fo4.bin new file mode 100644 index 00000000..3803349e Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri2-fo4.bin differ diff --git a/documentation/test/js-test-data/empty-ns1-ri3-fo3.bin b/documentation/test/js-test-data/empty-ns1-ri3-fo3.bin new file mode 100644 index 00000000..4d98643a Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri3-fo3.bin differ diff --git a/documentation/test/js-test-data/empty-ns1-ri3-fo4.bin b/documentation/test/js-test-data/empty-ns1-ri3-fo4.bin new file mode 100644 index 00000000..8b484e18 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri3-fo4.bin differ diff --git a/documentation/test/js-test-data/empty-ns1-ri4-fo3.bin b/documentation/test/js-test-data/empty-ns1-ri4-fo3.bin new file mode 100644 index 00000000..abbc4d41 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri4-fo3.bin differ diff --git a/documentation/test/js-test-data/empty-ns1-ri4-fo4.bin b/documentation/test/js-test-data/empty-ns1-ri4-fo4.bin new file mode 100644 index 00000000..4fbe1f52 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns1-ri4-fo4.bin differ diff --git a/documentation/test/js-test-data/empty-ns2-ri2-fo3.bin b/documentation/test/js-test-data/empty-ns2-ri2-fo3.bin new file mode 100644 index 00000000..f856f5b4 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri2-fo3.bin differ diff --git 
a/documentation/test/js-test-data/empty-ns2-ri2-fo4.bin b/documentation/test/js-test-data/empty-ns2-ri2-fo4.bin new file mode 100644 index 00000000..cd7ce16e Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri2-fo4.bin differ diff --git a/documentation/test/js-test-data/empty-ns2-ri3-fo3.bin b/documentation/test/js-test-data/empty-ns2-ri3-fo3.bin new file mode 100644 index 00000000..81f31146 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri3-fo3.bin differ diff --git a/documentation/test/js-test-data/empty-ns2-ri3-fo4.bin b/documentation/test/js-test-data/empty-ns2-ri3-fo4.bin new file mode 100644 index 00000000..ef2ee650 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri3-fo4.bin differ diff --git a/documentation/test/js-test-data/empty-ns2-ri4-fo3.bin b/documentation/test/js-test-data/empty-ns2-ri4-fo3.bin new file mode 100644 index 00000000..9f94c887 Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri4-fo3.bin differ diff --git a/documentation/test/js-test-data/empty-ns2-ri4-fo4.bin b/documentation/test/js-test-data/empty-ns2-ri4-fo4.bin new file mode 100644 index 00000000..90c4aa8d Binary files /dev/null and b/documentation/test/js-test-data/empty-ns2-ri4-fo4.bin differ diff --git a/documentation/test/js-test-data/empty.bin b/documentation/test/js-test-data/empty.bin deleted file mode 100644 index 36e30edc..00000000 Binary files a/documentation/test/js-test-data/empty.bin and /dev/null differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri2-fo3.bin b/documentation/test/js-test-data/manyresults-ns1-ri2-fo3.bin new file mode 100644 index 00000000..23e1735b Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri2-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri2-fo4.bin b/documentation/test/js-test-data/manyresults-ns1-ri2-fo4.bin new file mode 100644 index 00000000..ea42fbcf Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri2-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri3-fo3.bin b/documentation/test/js-test-data/manyresults-ns1-ri3-fo3.bin new file mode 100644 index 00000000..52c1637f Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri3-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri3-fo4.bin b/documentation/test/js-test-data/manyresults-ns1-ri3-fo4.bin new file mode 100644 index 00000000..56dfd9e6 Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri3-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri4-fo3.bin b/documentation/test/js-test-data/manyresults-ns1-ri4-fo3.bin new file mode 100644 index 00000000..c8d6c8bd Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri4-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns1-ri4-fo4.bin b/documentation/test/js-test-data/manyresults-ns1-ri4-fo4.bin new file mode 100644 index 00000000..6b1c4941 Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns1-ri4-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri2-fo3.bin b/documentation/test/js-test-data/manyresults-ns2-ri2-fo3.bin new file mode 100644 index 00000000..62ecdcde Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri2-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri2-fo4.bin 
b/documentation/test/js-test-data/manyresults-ns2-ri2-fo4.bin new file mode 100644 index 00000000..858d4566 Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri2-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri3-fo3.bin b/documentation/test/js-test-data/manyresults-ns2-ri3-fo3.bin new file mode 100644 index 00000000..4e3f2840 Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri3-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri3-fo4.bin b/documentation/test/js-test-data/manyresults-ns2-ri3-fo4.bin new file mode 100644 index 00000000..c280a6da Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri3-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri4-fo3.bin b/documentation/test/js-test-data/manyresults-ns2-ri4-fo3.bin new file mode 100644 index 00000000..9b295b7a Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri4-fo3.bin differ diff --git a/documentation/test/js-test-data/manyresults-ns2-ri4-fo4.bin b/documentation/test/js-test-data/manyresults-ns2-ri4-fo4.bin new file mode 100644 index 00000000..8097294a Binary files /dev/null and b/documentation/test/js-test-data/manyresults-ns2-ri4-fo4.bin differ diff --git a/documentation/test/js-test-data/manyresults.bin b/documentation/test/js-test-data/manyresults.bin deleted file mode 100644 index 4d3eb35f..00000000 Binary files a/documentation/test/js-test-data/manyresults.bin and /dev/null differ diff --git a/documentation/test/js-test-data/nested.bin b/documentation/test/js-test-data/nested.bin index 6a92cb9c..7079b87b 100644 Binary files a/documentation/test/js-test-data/nested.bin and b/documentation/test/js-test-data/nested.bin differ diff --git a/documentation/test/js-test-data/searchdata-ns1-ri2-fo3.b85 b/documentation/test/js-test-data/searchdata-ns1-ri2-fo3.b85 new file mode 100644 index 00000000..c49bb8cc --- /dev/null +++ b/documentation/test/js-test-data/searchdata-ns1-ri2-fo3.b85 @@ -0,0 +1 @@ +O+!-x000002LJ#7zySaN&H?}cmH_|&0RRC200Aik00001CCy00C)!00001ZG->-00BCR000321OPga0Du4iWtIQ{00C#700001ZlnMJ00CjD00003ZFX`R003wJ0I&c600BC-000320026~0Du4iXvzQp00DH;000930RRI41poj6Dc%4800AiG00001ZtMU600LoY*Z=_X000312mk;9WdZ>J00C?U0RR92XAA)V00C|i0RRC32>@Xj0RR92bRGc!00Cnr0RR93VP&cS04xCj0RRU800Ct@0RR92XFve}00Ch}0RR92a7+OJ00ClB0RR92byxua00DDe0RR92AY=gm00Fyd0RR92!f*iq0RaX8Aa((O00Cuu0RR92XM_O&00Ci&0RR96ZFX{SbNB!NXaE2*0RWHz0A2wAD*ymO003G50Db@flmGy>007JY0NMZm^#A|>0RRpG03b;^NjOaq7yt=PVRUE!ZeeX@b8ul}WldppXf9}UZEOGl5(qjvZE0>OX>N2ZAZc!NDF7pFX>I@j06IEWWn*-2asXp&VRLg$VRUF;F<&uOWn*-2axQ3eZEOMn7zR2zZE0>ODIjBSZgX@1BW-DJ0000wI#OY7XJr6mY+-YAO<{CsUol@XQekdqWiDuRZEOSp7X~^yZE0>ODIjBSZgX@1BW-DJP+@0f0B~VvWiDuRZEOYr03gDaA9X<0CRO>aA9X00EW&0A&CHo&W%600E=`0B!&QssI3C00SBT0BvXh0Cund0CE5Uwg3P+0RaF2!~lRg00GJX0B8UK(f|N-0{{U40{{g800G_r04V?g<^TXF00Ha(0B!&R*Z=@w@&Ev70RRX9009C40A&CH1_1zU009gE0A~OJ5&-~i0RagB7y$rb00ABW0CWHWCIJ9r00OE20AVZv0A&FH2LJ#8JOKb@00BS&0A~OJMgag}00B$^0B`^SQUL&B00CG50CfNXUI74e00CqH03ZMXY5@Sd00D3T0Kx$Q1^{*efFJ+?d;tJu00D#n0A~OJiU9y&00sB}0BvXh0Cq9~0CJE40B~Lb0COw=03bsE07+W_06KpF07;bq064b*08PyR01(>%02uZF0003200|EP002#4bZ7u>VQpn|aA9L*O<{CsE@*UZYybcf2s%1#X>KTKZgealX>N2W03&T_ZU6uPIyzQmV{~tF0Ap-nb8}5$bZB2OUolo?V{~tFE@*UZYyton20A)zX>KSfAY*TCb94YBZE0=*0025VQekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYyKSfAY*TCb94YBZE0>$VP|CkaA9X= 31 && Search.dataSize <= 32); + } + assert.equal(Search.symbolCount, "0 symbols (0 kB)"); assert.deepEqual(Search.search(''), [[], '']); } -/* Search */ -{ - let buffer = 
fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin")); +/* Search, in all type size variants */ +for(let i = 0; i != type_size_suffixes.length; ++i) { + let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[i] + ".bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))); - assert.equal(Search.dataSize, 745); - assert.equal(Search.symbolCount, "7 symbols (0.7 kB)"); + + /* Test just the smallest and largest size, everything else should be in + between */ + if(i == 0) { + assert.equal(Search.dataSize, 750); + assert.equal(Search.symbolCount, "7 symbols (0.7 kB)"); + } else if(i == type_size_suffixes.length - 1) { + assert.equal(Search.dataSize, 883); + assert.equal(Search.symbolCount, "7 symbols (0.9 kB)"); + } else { + assert.ok(Search.dataSize > 750 && Search.dataSize < 883); + } + assert.equal(Search.maxResults, 100); /* Blow up */ @@ -217,11 +261,12 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 8 }], '']); } -/* Search with spaces */ +/* Search with spaces. Nothing type-size-dependent here, so test just on the + first variant. */ { - let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin")); + let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))); - assert.equal(Search.dataSize, 745); + assert.equal(Search.dataSize, 750); assert.equal(Search.symbolCount, "7 symbols (0.7 kB)"); assert.equal(Search.maxResults, 100); @@ -269,11 +314,12 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 10 }], Search.toUtf8('» subpage')]); } -/* Search, limiting the results to 3 */ +/* Search, limiting the results to 3. Nothing type-size-dependent here, so test + just on the first variant. */ { - let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin")); + let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3)); - assert.equal(Search.dataSize, 745); + assert.equal(Search.dataSize, 750); assert.equal(Search.symbolCount, "7 symbols (0.7 kB)"); assert.equal(Search.maxResults, 3); assert.deepEqual(Search.search('m'), [[ @@ -297,11 +343,12 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 10 }], '']); } -/* Search loaded from a base85-encoded file should work properly */ +/* Search loaded from a base85-encoded file should work properly. Nothing + type-size-dependent here, so test just on the first variant. */ { - let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.b85"), {encoding: 'utf-8'}); + let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".b85"), {encoding: 'utf-8'}); assert.ok(Search.load(b85)); - assert.equal(Search.dataSize, 748); /* some padding on the end, that's okay */ + assert.equal(Search.dataSize, 752); /* some padding on the end, that's okay */ assert.equal(Search.symbolCount, "7 symbols (0.7 kB)"); assert.equal(Search.maxResults, 100); assert.deepEqual(Search.search('min'), [[ @@ -325,11 +372,11 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 8 }], '()']); } -/* Search, Unicode */ +/* Search, Unicode. Nothing type-size-dependent here. 
*/ { let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))); - assert.equal(Search.dataSize, 160); + assert.equal(Search.dataSize, 165); assert.equal(Search.symbolCount, "2 symbols (0.2 kB)"); /* Both "Hýždě" and "Hárá" have common autocompletion to "h\xA1", which is not valid UTF-8, so it has to get truncated */ @@ -363,11 +410,11 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 3 }], Search.toUtf8('rá')]); } -/* Properly combine heavily nested URLs */ +/* Properly combine heavily nested URLs. Nothing type-size-dependent here. */ { let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/nested.bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))); - assert.equal(Search.dataSize, 331); + assert.equal(Search.dataSize, 336); assert.equal(Search.symbolCount, "4 symbols (0.3 kB)"); assert.deepEqual(Search.search('geo'), [[ { name: 'Magnum::Math::Geometry', @@ -386,12 +433,24 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 3 }], 'nge']); } -/* Extreme amount of search results */ -{ - let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/manyresults.bin")); +/* Extreme amount of search results, in all type size variants to ensure no + size assumptions were left there */ +for(let i = 0; i != type_size_suffixes.length; ++i) { + let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/manyresults-" + type_size_suffixes[i] + ".bin")); assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 10000)); - assert.equal(Search.dataSize, 6415); - assert.equal(Search.symbolCount, "131 symbols (6.3 kB)"); + + /* Test just the smallest and largest size, everything else should be in + between */ + if(i == 0) { + assert.equal(Search.dataSize, 6421); + assert.equal(Search.symbolCount, "131 symbols (6.3 kB)"); + } else if(i == type_size_suffixes.length - 1) { + assert.equal(Search.dataSize, 6964); + assert.equal(Search.symbolCount, "131 symbols (6.8 kB)"); + } else { + assert.ok(Search.dataSize > 6421 && Search.dataSize < 6964); + } + assert.equal(Search.maxResults, 10000); assert.deepEqual(Search.search('__init__')[0].length, 128 + 3); assert.deepEqual(Search.search('__init__')[1], ''); diff --git a/documentation/test/test_search.py b/documentation/test/test_search.py index c716589a..2e8c8e01 100755 --- a/documentation/test/test_search.py +++ b/documentation/test/test_search.py @@ -27,8 +27,8 @@ import sys import unittest from types import SimpleNamespace as Empty -from ._search_test_metadata import EntryType, search_type_map -from _search import Trie, ResultMap, ResultFlag, serialize_search_data, pretty_print_trie, pretty_print_map, pretty_print +from ._search_test_metadata import EntryType, search_type_map, trie_type_sizes, type_sizes +from _search import Trie, ResultMap, ResultFlag, Serializer, Deserializer, serialize_search_data, pretty_print_trie, pretty_print_map, pretty_print from test_doxygen import IntegrationTestCase @@ -37,28 +37,40 @@ class TrieSerialization(unittest.TestCase): super().__init__(*args, **kwargs) self.maxDiff = None - def compare(self, serialized: bytes, expected: str): - pretty = pretty_print_trie(serialized)[0] + def compare(self, deserializer: Deserializer, serialized: bytes, expected: str): + pretty = pretty_print_trie(deserializer, serialized)[0] #print(pretty) self.assertEqual(pretty, 
expected.strip()) def test_empty(self): trie = Trie() - serialized = trie.serialize() - self.compare(serialized, "") - self.assertEqual(len(serialized), 6) + for i in trie_type_sizes: + with self.subTest(**i): + serialized = trie.serialize(Serializer(**i)) + self.compare(Deserializer(**i), serialized, "") + self.assertEqual(len(serialized), 6) def test_single(self): trie = Trie() trie.insert("magnum", 1337) trie.insert("magnum", 21) - serialized = trie.serialize() - self.compare(serialized, """ + for i in trie_type_sizes: + with self.subTest(**i): + serialized = trie.serialize(Serializer(**i)) + self.compare(Deserializer(**i), serialized, """ magnum [1337, 21] """) - self.assertEqual(len(serialized), 46) + # Verify just the smallest and largest size, everything else + # should fit in between + if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2: + self.assertEqual(len(serialized), 46) + elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4: + self.assertEqual(len(serialized), 56) + else: + self.assertGreater(len(serialized), 46) + self.assertLess(len(serialized), 56) def test_multiple(self): trie = Trie() @@ -94,8 +106,10 @@ magnum [1337, 21] trie.insert("range::max", 10) trie.insert("max", 10) - serialized = trie.serialize() - self.compare(serialized, """ + for i in trie_type_sizes: + with self.subTest(**i): + serialized = trie.serialize(Serializer(**i)) + self.compare(Deserializer(**i), serialized, """ math [0] ||| :$ ||| :vector [1] @@ -123,7 +137,15 @@ range [2] | :min [9] | ax [10] """) - self.assertEqual(len(serialized), 340) + # Verify just the smallest and largest size, everything else + # should fit in between + if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2: + self.assertEqual(len(serialized), 340) + elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4: + self.assertEqual(len(serialized), 428) + else: + self.assertGreater(len(serialized), 340) + self.assertLess(len(serialized), 428) def test_unicode(self): trie = Trie() @@ -131,8 +153,8 @@ range [2] trie.insert("hýždě", 0) trie.insert("hárá", 1) - serialized = trie.serialize() - self.compare(serialized, """ + serialized = trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) + self.compare(Deserializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), serialized, """ h0xc3 0xbd 0xc5 @@ -147,7 +169,7 @@ h0xc3 """) self.assertEqual(len(serialized), 82) - def test_many_results(self): + def test_16bit_result_count(self): trie = Trie() for i in range(128): @@ -158,39 +180,99 @@ h0xc3 for i in [203, 215, 267]: trie.insert("__init__subclass__", i) - serialized = trie.serialize() - self.compare(serialized, """ + for i in trie_type_sizes: + with self.subTest(**i): + serialized = trie.serialize(Serializer(**i)) + self.compare(Deserializer(**i), serialized, """ __init__ [{}] subclass__ [203, 215, 267] """.format(', '.join([str(i) for i in range(128)]))) - self.assertEqual(len(serialized), 376) + # Verify just the smallest and largest size, everything else + # should fit in between + if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2: + self.assertEqual(len(serialized), 377) + elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4: + self.assertEqual(len(serialized), 657) + else: + self.assertGreater(len(serialized), 377) + self.assertLess(len(serialized), 657) + + def test_16bit_result_id_too_small(self): + trie = Trie() + trie.insert("a", 65536) + with self.assertRaises(OverflowError): + trie.serialize(Serializer(file_offset_bytes=3, 
result_id_bytes=2, name_size_bytes=1))
+
+        # This should work
+        trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+    def test_24bit_result_id_too_small(self):
+        trie = Trie()
+        trie.insert("a", 16*1024*1024)
+        with self.assertRaises(OverflowError):
+            trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+        # This should work
+        trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=4, name_size_bytes=1))
+
+    def test_23bit_file_offset_too_small(self):
+        trie = Trie()
+
+        # The high bit of the child offset stores a lookahead barrier, so the
+        # file has to be smaller than 8M, not 16. Python has a recursion limit
+        # of 1000, so we can't really insert an 8M-character-long string.
+        # Instead, insert one 130-character string where each char has 32k
+        # 16-bit result IDs. 129 isn't enough to overflow the offsets.
+        results_32k = [j for j in range(32767)]
+        for i in range(130):
+            trie.insert('a'*i, results_32k)
+
+        with self.assertRaises(OverflowError):
+            trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+        # This should work
+        trie.serialize(Serializer(file_offset_bytes=4, result_id_bytes=2, name_size_bytes=1))

 class MapSerialization(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.maxDiff = None

-    def compare(self, serialized: bytes, expected: str):
-        pretty = pretty_print_map(serialized, entryTypeClass=EntryType)
+    def compare(self, deserializer: Deserializer, serialized: bytes, expected: str):
+        pretty = pretty_print_map(deserializer, serialized, entryTypeClass=EntryType)
         #print(pretty)
         self.assertEqual(pretty, expected.strip())

     def test_empty(self):
         map = ResultMap()

-        serialized = map.serialize()
-        self.compare(serialized, "")
-        self.assertEqual(len(serialized), 4)
+        for i in type_sizes:
+            with self.subTest(**i):
+                serialized = map.serialize(Serializer(**i))
+                self.compare(Deserializer(**i), serialized, "")
+                self.assertEqual(len(serialized), i['file_offset_bytes'])

     def test_single(self):
         map = ResultMap()
+
         self.assertEqual(map.add("Magnum", "namespaceMagnum.html", suffix_length=11, flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.NAMESPACE)), 0)

-        serialized = map.serialize()
-        self.compare(serialized, """
+        for i in type_sizes:
+            with self.subTest(**i):
+                serialized = map.serialize(Serializer(**i))
+                self.compare(Deserializer(**i), serialized, """
 0: Magnum [suffix_length=11, type=NAMESPACE] -> namespaceMagnum.html
 """)
-        self.assertEqual(len(serialized), 36)
+                # Verify just the smallest and largest size, everything else
+                # should fit in between. The `result_id_bytes` don't affect
+                # this case.
+ if i['file_offset_bytes'] == 3 and i['name_size_bytes'] == 1: + self.assertEqual(len(serialized), 35) + elif i['file_offset_bytes'] == 4 and i['name_size_bytes'] == 2: + self.assertEqual(len(serialized), 38) + else: + self.assertGreater(len(serialized), 35) + self.assertLess(len(serialized), 38) def test_multiple(self): map = ResultMap() @@ -203,8 +285,10 @@ class MapSerialization(unittest.TestCase): self.assertEqual(map.add("Rectangle", "", alias=2), 5) self.assertEqual(map.add("Rectangle::Rect()", "", suffix_length=2, alias=2), 6) - serialized = map.serialize() - self.compare(serialized, """ + for i in type_sizes: + with self.subTest(**i): + serialized = map.serialize(Serializer(**i)) + self.compare(Deserializer(**i), serialized, """ 0: Math [type=NAMESPACE] -> namespaceMath.html 1: ::Vector [prefix=0[:0], type=CLASS] -> classMath_1_1Vector.html 2: ::Range [prefix=0[:0], type=CLASS] -> classMath_1_1Range.html @@ -213,7 +297,97 @@ class MapSerialization(unittest.TestCase): 5: Rectangle [alias=2] -> 6: ::Rect() [alias=2, prefix=5[:0], suffix_length=2] -> """) - self.assertEqual(len(serialized), 203) + # Verify just the smallest and largest size, everything else + # should fit in between + if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2 and i['name_size_bytes'] == 1: + self.assertEqual(len(serialized), 202) + elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4 and i['name_size_bytes'] == 2: + self.assertEqual(len(serialized), 231) + else: + self.assertGreater(len(serialized), 202) + self.assertLess(len(serialized), 231) + + def test_24bit_file_offset_too_small(self): + map = ResultMap() + # 3 bytes for the initial offset, 3 bytes for file size, 1 byte for the + # flags, 1 byte for the null terminator, 6 bytes for the URL + map.add('F'*(16*1024*1024 - 14), 'f.html', flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)) + + with self.assertRaises(OverflowError): + # Disabling prefix merging otherwise memory usage goes to hell + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), merge_prefixes=False) + + # This should work. Disabling prefix merging otherwise memory usage + # goes to hell. + map.serialize(Serializer(file_offset_bytes=4, result_id_bytes=2, name_size_bytes=1), merge_prefixes=False) + + def test_8bit_suffix_length_too_small(self): + map = ResultMap() + map.add("F()" + ';'*256, "f.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC), suffix_length=256) + + with self.assertRaises(OverflowError): + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) + + # This should work + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=2)) + + def test_8bit_prefix_length_too_small(self): + map = ResultMap() + map.add("A", 'a'*251 + ".html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)) + map.add("A::foo()", 'a'*251 + ".html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)) + + with self.assertRaises(OverflowError): + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) + + # This should work + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=2)) + + def test_16bit_prefix_id_too_small(self): + map = ResultMap() + + # Adding A0 to A65535 would be too slow due to the recursive Trie + # population during prefix merging (SIGH) so trying this instead. It's + # still hella slow, but at least not TWO MINUTES. 
+ for i in range(128): + for j in range(128): + for k in range(4): + map.add(bytes([i, j, k]).decode('utf-8'), "a.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)) + + self.assertEqual(map.add("B", "b.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)), 65536) + map.add("B::foo()", "b.html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)) + + with self.assertRaises(OverflowError): + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) + + # This should work + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1)) + + # Testing this error for a 24bit prefix seems infeasibly slow, not + # doing that + + def test_16bit_alias_id_too_small(self): + map = ResultMap() + + # The alias doesn't exist of course, hopefully that's fine in this case + map.add("B", "", alias=65536) + + with self.assertRaises(OverflowError): + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) + + # This should work + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1)) + + def test_24bit_alias_id_too_small(self): + map = ResultMap() + + # The alias doesn't exist of course, hopefully that's fine in this case + map.add("B", "", alias=16*1024*1024) + + with self.assertRaises(OverflowError): + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1)) + + # This should work + map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=4, name_size_bytes=1)) class Serialization(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -237,8 +411,10 @@ class Serialization(unittest.TestCase): trie.insert("math::range", index) trie.insert("range", index) - serialized = serialize_search_data(trie, map, search_type_map, 3) - self.compare(serialized, """ + for i in type_sizes: + with self.subTest(**i): + serialized = serialize_search_data(Serializer(**i), trie, map, search_type_map, 3) + self.compare(serialized, """ 3 symbols math [0] | ::vector [1] @@ -253,4 +429,12 @@ range [2] (EntryType.CLASS, CssClass.PRIMARY, 'class'), (EntryType.FUNC, CssClass.INFO, 'func') """) - self.assertEqual(len(serialized), 277) + # Verify just the smallest and largest size, everything else + # should fit in between + if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2 and i['name_size_bytes'] == 1: + self.assertEqual(len(serialized), 282) + elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4 and i['name_size_bytes'] == 2: + self.assertEqual(len(serialized), 317) + else: + self.assertGreater(len(serialized), 282) + self.assertLess(len(serialized), 317) diff --git a/documentation/test_doxygen/layout/pages.html b/documentation/test_doxygen/layout/pages.html index 943167cf..13d27fb1 100644 --- a/documentation/test_doxygen/layout/pages.html +++ b/documentation/test_doxygen/layout/pages.html @@ -111,8 +111,8 @@ - - + +
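The new js-test-data files are suffixed with the type sizes they were generated with, and both the JavaScript and the Python tests loop over all combinations. Neither the type_size_suffixes list nor the _search_test_metadata module is included in the hunks shown here, so the following Python sketch is only a plausible reconstruction, inferred from how the file_offset_bytes / result_id_bytes / name_size_bytes keys and file names such as searchdata-ns1-ri2-fo3.bin are used in the tests:

    # Hypothetical reconstruction of the test metadata, not the actual
    # _search_test_metadata module shipped with the patch.
    from itertools import product

    # All serializer parameter combinations, smallest first, largest last
    type_sizes = [
        {'file_offset_bytes': fo, 'result_id_bytes': ri, 'name_size_bytes': ns}
        for ns, ri, fo in product([1, 2], [2, 3, 4], [3, 4])]

    # Tries don't store name lengths, so the trie tests presumably only vary
    # the offset and result ID widths, keeping name_size_bytes at 1
    trie_type_sizes = [i for i in type_sizes if i['name_size_bytes'] == 1]

    # The test data file names appear to encode the same combinations,
    # e.g. searchdata-ns1-ri2-fo3.bin or manyresults-ns2-ri4-fo4.bin
    type_size_suffixes = [
        'ns{name_size_bytes}-ri{result_id_bytes}-fo{file_offset_bytes}'.format(**i)
        for i in type_sizes]

Keeping the smallest combination first and the largest last is what lets the tests assert exact sizes only for the two extremes and settle for a simple in-between check everywhere else.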
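The various *_too_small tests expect an OverflowError as soon as a value no longer fits the configured byte width. The actual Serializer packing code lives in _search.py and isn't shown in this part of the diff; the snippet below is only a minimal illustration (the helper names and the endianness are assumptions) of where such an error naturally comes from, and of why the 24-bit child offset only reaches 8 MB once its top bit is taken by the lookahead barrier:

    # Minimal sketch only -- not the actual Serializer implementation
    def pack_uint(value: int, size_bytes: int) -> bytes:
        # int.to_bytes() raises OverflowError if the value needs more than
        # size_bytes bytes, which is what the *_too_small tests check for
        return value.to_bytes(size_bytes, byteorder='little')

    pack_uint(65535, 2)     # fine with result_id_bytes=2
    # pack_uint(65536, 2)   # OverflowError, needs result_id_bytes=3

    def pack_child_offset(offset: int, barrier: bool, file_offset_bytes: int = 3) -> bytes:
        # The topmost bit carries the lookahead barrier, so a 3-byte offset
        # only addresses 2**23 = 8M -- hence test_23bit_file_offset_too_small
        if offset >= 1 << (file_offset_bytes*8 - 1):
            raise OverflowError('offset too large for {} bytes with a barrier bit'.format(file_offset_bytes))
        return pack_uint(int(barrier) << (file_offset_bytes*8 - 1) | offset, file_offset_bytes)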