import enum
import struct
from types import SimpleNamespace as Empty
-from typing import List, Tuple
+from typing import List, Tuple, Union
# Version 0 was without the type map
-searchdata_format_version = 1
+searchdata_format_version = 2
search_filename = f'search-v{searchdata_format_version}.js'
searchdata_filename = f'{{search_filename_prefix}}-v{searchdata_format_version}.bin'
searchdata_filename_b85 = f'{{search_filename_prefix}}-v{searchdata_format_version}.js'
+# In order to be both space-efficient and flexible enough to accommodate
+# larger projects, the bit counts for particular data types can vary in each
+# file. There are the following categories:
+#
+# - NAME_SIZE_BITS, how many bits are needed to store name lengths (such as
+# prefix length). Can be either 8 or 16.
+# - RESULT_ID_BITS, how many bits are needed for IDs pointing into the result
+# map. Can be either 16, 24 or 32.
+# - FILE_OFFSET_BITS, how many bits are needed to store general offsets into
+# the file. Can be either 24 or 32.
+#
+# Whole file encoding
+# ===================
+#
+# magic | version | type | not | symbol | result | type | trie | result | type
+# 'MCS' | (0x02) | data | used | count | map | map | data | map | map
+# | | | | | offset | offset | | data | data
+# 24b | 8b | 8b | 24b | 32b | 32b | 32b | … | … | …
+#
+# The type data byte encodes the NAME_SIZE_BITS, RESULT_ID_BITS and
+# FILE_OFFSET_BITS:
+#
+# not | NAME_SIZE_BITS | RESULT_ID_BITS | FILE_OFFSET_BITS
+# used | 0b0 = 8b | 0b00 = 16b | 0b0 = 24b
+# | 0b1 = 16b | 0b01 = 24b | 0b1 = 32b
+# | | 0b10 = 32b |
+# 4b | 1b | 2b | 1b
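+#
+# For illustration: with NAME_SIZE_BITS = 8, RESULT_ID_BITS = 24 and
+# FILE_OFFSET_BITS = 32 the type data byte is 0b00000011, i.e. 0x03. The
+# smallest possible file (no symbols, no types, 8/16/24-bit sizes) is 20
+# header bytes + 6 trie bytes + 3 result map bytes + 2 type map bytes, i.e.
+# 31 bytes.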
+#
+# Trie encoding
+# =============
+#
+# Because child tries are serialized first, the root trie (the one containing
+# the initial characters) is never at the start of the data; the root offset
+# points to it instead. If the result count is < 128:
+#
+# root | | header | results | children
+# offset | … | | result # | child # | … | data
+# 32b | |0| 7b | 8b | n*RESULT_ID_BITS | …
+#
+# If the result count is > 127, it's encoded like this instead -- since
+# entries with a very large number of results (such as Python's __init__())
+# are rather rare, it doesn't make sense to make this globally configurable
+# and then waste 8 bits in the majority of cases. Note that the 15-bit value
+# is stored as Big-Endian, otherwise the leftmost bit couldn't be used to
+# denote the size.
+#
+# root | | header | results | children
+# offset | … | | result # | child # | … | data
+# 32b | |1| 15b (BE) | 8b | n*RESULT_ID_BITS | …
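+#
+# For illustration, a node with 300 results stores the count as the two bytes
+# 0x81 0x2C (300 | 0x8000, encoded as Big-Endian).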
+#
+# Trie children data encoding; the barrier is stored in the topmost offset bit:
+#
+# child 1 | child 2 | | child 1 | child 2 |
+# char | char | … | barrier + offset | barrier + offset | …
+# 8b | 8b | | FILE_OFFSET_BITS | FILE_OFFSET_BITS |
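+#
+# For illustration, with RESULT_ID_BITS = 16 and FILE_OFFSET_BITS = 24 a node
+# with one result and two children takes 1 header byte + 1 child count byte +
+# 2 result ID bytes + 2 child char bytes + 2*3 child offset bytes = 12 bytes.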
+#
+# Result map encoding
+# ===================
+#
+# First all the offsets (plus the file size), then all the flags, so we don't
+# need to have weird paddings or alignments. The "file size" is there so the
+# size of item N can always be retrieved as `offsets[N + 1] - offsets[N]`.
+#
+# item | file | item | item 1 | item 2 |
+# offsets | size | flags | data | data | …
+# n*FILE_OFFSET_BITS | FILE_OFFSET_BITS | n*8b | | |
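+#
+# For illustration, with FILE_OFFSET_BITS = 24 and two items of 10 and 7 data
+# bytes the offset array is [11, 21, 28] -- the first item starts right after
+# the three 3-byte offsets and the two flag bytes, and item sizes come out as
+# 21 - 11 = 10 and 28 - 21 = 7.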
+#
+# Basic item data (flags & 0b11 == 0b00):
+#
+# name | \0 | URL
+# | |
+# | 8b |
+#
+# Suffixed item data (flags & 0b11 == 0b01):
+#
+# suffix | name | \0 | URL
+# length | | |
+# NAME_SIZE_BITS | | 8b |
+#
+# Prefixed item data (flags & 0b11 == 0b10):
+#
+# prefix | prefix | name | \0 | URL
+# id | length | suffix | | suffix
+# RESULT_ID_BITS | NAME_SIZE_BITS | | 8b |
+#
+# Prefixed & suffixed item (flags & 0b11 == 0b11):
+#
+# prefix | prefix | suffix | name | \0 | URL
+# id | length | length | suffix | |
+# RESULT_ID_BITS | NAME_SIZE_BITS | NAME_SIZE_BITS | | 8b |
+#
+# Alias item (flags & 0xf0 == 0x00), flags & 0b11 then denotes what's in the
+# `…` portion. Aliases have no URL, so the alias name is in its place:
+#
+# alias | | alias
+# id | … | name
+# RESULT_ID_BITS | |
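+#
+# For illustration, with prefix merging enabled an entry like
+# Rectangle::Rect() can be stored as a prefixed item -- only the ::Rect()
+# suffix is kept, together with a prefix ID pointing at the Rectangle entry
+# and the length of the URL prefix to reuse from it.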
+#
+# Type map encoding
+# =================
+#
+# Again the "end offset" is here so the size of type N can always be retrieved
+# as `offsets[N + 1] - offsets[N]`. Type names are not expected to have more
+# than 255 chars, so NAME_SIZE_BITS is not used here.
+#
+# type 1 | type 2 | | | | type 1 |
+# class | name | class | name | … | padding | end | name | …
+# ID | offset | ID | offset | | | offset | data |
+# 8b | 8b | 8b | 8b | | 8b | 8b | |
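+#
+# For illustration, a type map with two types named 'class' and 'func'
+# serializes into three 2-byte entries (the last one just carrying the end
+# offset) followed by the name data -- name offsets 6 and 11, final end
+# offset 15.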
+
+class Serializer:
+ # This is currently hardcoded
+ result_map_flag_bytes = 1
+
+ header_struct = struct.Struct('<3sBBxxxIII')
+ result_map_flags_struct = struct.Struct('<B')
+ trie_root_offset_struct = struct.Struct('<I')
+ type_map_entry_struct = struct.Struct('<BB')
+
+ def __init__(self, *, file_offset_bytes, result_id_bytes, name_size_bytes):
+ assert file_offset_bytes in [3, 4]
+ self.file_offset_bytes = file_offset_bytes
+
+ assert result_id_bytes in [2, 3, 4]
+ self.result_id_bytes = result_id_bytes
+
+ assert name_size_bytes in [1, 2]
+ self.name_size_bytes = name_size_bytes
+
+ def pack_header(self, symbol_count, trie_size, result_map_size):
+ return self.header_struct.pack(b'MCS', searchdata_format_version,
+ (self.file_offset_bytes - 3) << 0 |
+ (self.result_id_bytes - 2) << 1 |
+ (self.name_size_bytes - 1) << 3,
+ symbol_count,
+ self.header_struct.size + trie_size,
+ self.header_struct.size + trie_size + result_map_size)
+
+ def pack_result_map_flags(self, flags: int):
+ return self.result_map_flags_struct.pack(flags)
+ def pack_result_map_offset(self, offset: int):
+ return offset.to_bytes(self.file_offset_bytes, byteorder='little')
+ def pack_result_map_prefix(self, id: int, length: int):
+ return id.to_bytes(self.result_id_bytes, byteorder='little') + \
+ length.to_bytes(self.name_size_bytes, byteorder='little')
+ def pack_result_map_suffix_length(self, length: int):
+ return length.to_bytes(self.name_size_bytes, byteorder='little')
+ def pack_result_map_alias(self, id: int):
+ return id.to_bytes(self.result_id_bytes, byteorder='little')
+
+ def pack_trie_root_offset(self, offset: int):
+ return self.trie_root_offset_struct.pack(offset)
+ def pack_trie_node(self, result_ids: List[int], child_chars_offsets_barriers: List[Tuple[int, int, bool]]):
+ out = bytearray()
+ # If the result count fits into 7 bits, pack it into a single byte
+ if len(result_ids) < 128:
+ out += len(result_ids).to_bytes(1, byteorder='little')
+ # Otherwise use the leftmost bit to denote it's two-byte, keep the high
+ # 7 bits in the first byte and store the low 8 bits in a second byte --
+ # which is the same as storing the value as Big-Endian.
+ else:
+ assert len(result_ids) < 32768
+ out += (len(result_ids) | 0x8000).to_bytes(2, byteorder='big')
+ out += len(child_chars_offsets_barriers).to_bytes(1, byteorder='little')
+ for id in result_ids:
+ out += id.to_bytes(self.result_id_bytes, byteorder='little')
+ out += bytes([char for char, offset, barrier in child_chars_offsets_barriers])
+ child_barrier_mask = 1 << (self.file_offset_bytes*8 - 1)
+ for char, offset, barrier in child_chars_offsets_barriers:
+ if offset >= child_barrier_mask: raise OverflowError
+ out += (offset | (barrier*child_barrier_mask)).to_bytes(self.file_offset_bytes, byteorder='little')
+ return out
+
+ def pack_type_map_entry(self, class_: int, offset: int):
+ return self.type_map_entry_struct.pack(class_, offset)
+
+class Deserializer:
+ def __init__(self, *, file_offset_bytes, result_id_bytes, name_size_bytes):
+ assert file_offset_bytes in [3, 4]
+ self.file_offset_bytes = file_offset_bytes
+
+ assert result_id_bytes in [2, 3, 4]
+ self.result_id_bytes = result_id_bytes
+
+ assert name_size_bytes in [1, 2]
+ self.name_size_bytes = name_size_bytes
+
+ @classmethod
+ def from_serialized(cls, serialized: bytes):
+ magic, version, type_data, symbol_count, map_offset, type_map_offset = Serializer.header_struct.unpack_from(serialized)
+ assert magic == b'MCS'
+ assert version == searchdata_format_version
+ out = Deserializer(
+ file_offset_bytes=[3, 4][(type_data & 0b0001) >> 0],
+ result_id_bytes=[2, 3, 4][(type_data & 0b0110) >> 1],
+ name_size_bytes=[1, 2][(type_data & 0b1000) >> 3])
+ out.symbol_count = symbol_count
+ out.map_offset = map_offset
+ out.type_map_offset = type_map_offset
+ return out
+
+ # The last tuple item is the number of bytes extracted
+ def unpack_result_map_flags(self, serialized: bytes, offset: int) -> Tuple[int, int]:
+ return Serializer.result_map_flags_struct.unpack_from(serialized, offset) + (Serializer.result_map_flags_struct.size, )
+ def unpack_result_map_offset(self, serialized: bytes, offset: int) -> Tuple[int, int]:
+ return int.from_bytes(serialized[offset:offset + self.file_offset_bytes], byteorder='little'), self.file_offset_bytes
+ def unpack_result_map_prefix(self, serialized: bytes, offset: int) -> Tuple[int, int, int]:
+ return int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little'), int.from_bytes(serialized[offset + self.result_id_bytes:offset + self.result_id_bytes + self.name_size_bytes], byteorder='little'), self.result_id_bytes + self.name_size_bytes
+ def unpack_result_map_suffix_length(self, serialized: bytes, offset: int) -> Tuple[int, int]:
+ return int.from_bytes(serialized[offset:offset + self.name_size_bytes], byteorder='little'), self.name_size_bytes
+ def unpack_result_map_alias(self, serialized: bytes, offset: int) -> Tuple[int, int]:
+ return int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little'), self.result_id_bytes
+
+ def unpack_trie_root_offset(self, serialized: bytes, offset: int) -> Tuple[int, int]:
+ return Serializer.trie_root_offset_struct.unpack_from(serialized, offset) + (Serializer.trie_root_offset_struct.size, )
+ def unpack_trie_node(self, serialized: bytes, offset: int) -> Tuple[List[int], List[Tuple[int, int, bool]], int]:
+ prev_offset = offset
+ # Result count, first try 8-bit, if it has the highest bit set, extract
+ # two bytes (as a BE) and then remove the highest bit
+ result_count = int.from_bytes(serialized[offset:offset + 1], byteorder='little')
+ if result_count & 0x80:
+ result_count = int.from_bytes(serialized[offset:offset + 2], byteorder='big') & ~0x8000
+ offset += 1
+ offset += 1
+ child_count = int.from_bytes(serialized[offset:offset + 1], byteorder='little')
+ offset += 1
+
+ # Unpack all result IDs
+ result_ids = []
+ for i in range(result_count):
+ result_ids += [int.from_bytes(serialized[offset:offset + self.result_id_bytes], byteorder='little')]
+ offset += self.result_id_bytes
+
+ # Unpack all child chars
+ child_chars = list(serialized[offset:offset + child_count])
+ offset += child_count
+
+ # Unpack all children offsets and lookahead barriers
+ child_chars_offsets_barriers = []
+ child_barrier_mask = 1 << (self.file_offset_bytes*8 - 1)
+ for i in range(child_count):
+ child_offset_barrier = int.from_bytes(serialized[offset:offset + self.file_offset_bytes], byteorder='little')
+ child_chars_offsets_barriers += [(child_chars[i], child_offset_barrier & ~child_barrier_mask, bool(child_offset_barrier & child_barrier_mask))]
+ offset += self.file_offset_bytes
+
+ return result_ids, child_chars_offsets_barriers, offset - prev_offset
+
+ def unpack_type_map_entry(self, serialized: bytes, offset: int) -> Tuple[int, int, int]:
+ return Serializer.type_map_entry_struct.unpack_from(serialized, offset) + (Serializer.type_map_entry_struct.size, )
+
class CssClass(enum.Enum):
DEFAULT = 0
PRIMARY = 1
_TYPE14 = 14 << 4
_TYPE15 = 15 << 4
-# Result map encoding -- the "file size" is there so size of item N can be
-# always retrieved as `offsets[N + 1] - offsets[N]`
-#
-# item 1 flags | item 2 flags | | item N flags | file | item 1 |
-# + offset | + offset | … | + offset | size | data | …
-# 8 + 24b | 8 + 24b | | 8 + 24b | 32b | |
-#
-# basic item (flags & 0b11 == 0b00):
-#
-# name | \0 | URL
-# | |
-# | 8b |
-#
-# suffixed item (flags & 0b11 == 0b01):
-#
-# suffix | name | \0 | URL
-# length | | |
-# 8b | | 8b |
-#
-# prefixed item (flags & 0xb11 == 0b10):
-#
-# prefix | name | \0 | URL
-# id + len | suffix | | suffix
-# 16b + 8b | | 8b |
-#
-# prefixed & suffixed item (flags & 0xb11 == 0b11):
-#
-# prefix | suffix | name | \0 | URL
-# id + len | length | suffix | |
-# 16b + 8b | 8b | | 8b |
-#
-# alias item (flags & 0xf0 == 0x00), flags & 0xb11 then denote what's in the
-# `…` portion, alias have no URL so the alias name is in place of it:
-#
-# alias | | alias
-# id | … | name
-# 16b | |
class ResultMap:
- offset_struct = struct.Struct('<I')
- flags_struct = struct.Struct('<B')
- prefix_struct = struct.Struct('<HB')
- suffix_length_struct = struct.Struct('<B')
- alias_struct = struct.Struct('<H')
-
def __init__(self):
self.entries = []
self.entries += [entry]
return len(self.entries) - 1
- def serialize(self, merge_prefixes=True) -> bytearray:
- output = bytearray()
-
+ def serialize(self, serializer: Serializer, merge_prefixes=True) -> bytearray:
if merge_prefixes:
# Put all entry names into a trie to discover common prefixes
trie = Trie()
# Everything merged, replace the original list
self.entries = merged
- # Write the offset array. Starting offset for items is after the offset
- # array and the file size
- offset = (len(self.entries) + 1)*4
+ # Write the offset array. Starting offset for items is after the
+ # (aligned) flag array and (aligned) offset + file size array.
+ output = bytearray()
+ offset = len(self.entries)*serializer.result_map_flag_bytes + (len(self.entries) + 1)*serializer.file_offset_bytes
for e in self.entries:
- assert offset < 2**24
- output += self.offset_struct.pack(offset)
- self.flags_struct.pack_into(output, len(output) - 1, e.flags.value)
+ output += serializer.pack_result_map_offset(offset)
# The entry is an alias, extra field for alias index
if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
- offset += self.alias_struct.size
+ offset += serializer.result_id_bytes
# Extra field for prefix index and length
if e.flags & ResultFlag.HAS_PREFIX:
- offset += self.prefix_struct.size
+ offset += serializer.result_id_bytes + serializer.name_size_bytes
# Extra field for suffix length
if e.flags & ResultFlag.HAS_SUFFIX:
- offset += self.suffix_length_struct.size
+ offset += serializer.name_size_bytes
# Length of the name
offset += len(e.name.encode('utf-8'))
offset += len(e.url.encode('utf-8')) + 1
# Write file size
- output += self.offset_struct.pack(offset)
+ output += serializer.pack_result_map_offset(offset)
+
+ # Write the flag array
+ for e in self.entries:
+ output += serializer.pack_result_map_flags(e.flags.value)
# Write the entries themselves
for e in self.entries:
if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
assert not e.alias is None
assert not e.url
- output += self.alias_struct.pack(e.alias)
+ output += serializer.pack_result_map_alias(e.alias)
if e.flags & ResultFlag.HAS_PREFIX:
- output += self.prefix_struct.pack(e.prefix, e.prefix_length)
+ output += serializer.pack_result_map_prefix(e.prefix, e.prefix_length)
if e.flags & ResultFlag.HAS_SUFFIX:
- output += self.suffix_length_struct.pack(e.suffix_length)
+ output += serializer.pack_result_map_suffix_length(e.suffix_length)
output += e.name.encode('utf-8')
if e.url:
output += b'\0'
assert len(output) == offset
return output
-# Trie encoding:
-#
-# root | | header | results | child 1 | child 1 | child 1 |
-# offset | … | | result # | child # | … | char | barrier | offset | …
-# 32b | |0| 7b | 8b | n*16b | 8b | 1b | 23b |
-#
-# if result count > 127, it's instead:
-#
-# root | | header | results | child 1 | child 1 | child 1 |
-# offset | … | | result # | child # | … | char | barrier | offset | …
-# 32b | |1| 11b | 4b | n*16b | 8b | 1b | 23b |
class Trie:
- root_offset_struct = struct.Struct('<I')
- header_struct = struct.Struct('<BB')
- result_struct = struct.Struct('<H')
- child_struct = struct.Struct('<I')
- child_char_struct = struct.Struct('<B')
-
def __init__(self):
self.results = []
self.children = {}
- def _insert(self, path: bytes, result, lookahead_barriers):
+ def _insert(self, path: bytes, result: Union[int, List[int]], lookahead_barriers):
if not path:
- self.results += [result]
+ # Inserting a list is mainly used by the
+ # TrieSerialization.test_23bit_file_offset_too_small() test, as
+ # otherwise it'd be WAY too slow.
+ # TODO this whole thing needs optimizing with less recursion
+ if type(result) is list:
+ self.results += result
+ else:
+ self.results += [result]
return
char = path[0]
self.children[char] = (True, self.children[char][1])
self.children[char][1]._insert(path[1:], result, [b - 1 for b in lookahead_barriers])
- def insert(self, path: str, result, lookahead_barriers=[]):
+ def insert(self, path: str, result: Union[int, List[int]], lookahead_barriers=[]):
self._insert(path.encode('utf-8'), result, lookahead_barriers)
def _sort(self, key):
self._sort(key)
# Returns offset of the serialized thing in `output`
- def _serialize(self, hashtable, output: bytearray, merge_subtrees) -> int:
+ def _serialize(self, serializer: Serializer, hashtable, output: bytearray, merge_subtrees) -> int:
# Serialize all children first
- child_offsets = []
+ child_chars_offsets_barriers = []
for char, child in self.children.items():
- offset = child[1]._serialize(hashtable, output, merge_subtrees=merge_subtrees)
- child_offsets += [(char, child[0], offset)]
-
- # Serialize this node. Sometimes we'd have an insane amount of results
- # (such as Python's __init__), but very little children to go with
- # that. Then we can make the result count storage larger (11 bits,
- # 2048 results) and the child count storage smaller (4 bits, 16
- # children). Hopefully that's enough. The remaining leftmost bit is
- # used as an indicator of this shifted state.
- serialized = bytearray()
- if len(self.results) > 127:
- assert len(self.children) < 16 and len(self.results) < 2048
- result_count = (len(self.results) & 0x7f) | 0x80
- children_count = ((len(self.results) & 0xf80) >> 3) | len(self.children)
- else:
- result_count = len(self.results)
- children_count = len(self.children)
- serialized += self.header_struct.pack(result_count, children_count)
- for v in self.results:
- serialized += self.result_struct.pack(v)
-
- # Serialize child offsets
- for char, lookahead_barrier, abs_offset in child_offsets:
- assert abs_offset < 2**23
-
- # write them over each other because that's the only way to pack
- # a 24 bit field
- offset = len(serialized)
- serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
- self.child_char_struct.pack_into(serialized, offset + 3, char)
+ offset = child[1]._serialize(serializer, hashtable, output, merge_subtrees=merge_subtrees)
+ child_chars_offsets_barriers += [(char, offset, child[0])]
+
+ # Serialize this node
+ serialized = serializer.pack_trie_node(self.results, child_chars_offsets_barriers)
# Subtree merging: if this exact tree is already in the table, return
# its offset. Otherwise add it and return the new offset.
if merge_subtrees: hashtable[hashable] = offset
return offset
- def serialize(self, merge_subtrees=True) -> bytearray:
+ def serialize(self, serializer: Serializer, merge_subtrees=True) -> bytearray:
output = bytearray(b'\x00\x00\x00\x00')
hashtable = {}
- self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output, merge_subtrees=merge_subtrees))
+ output[0:4] = serializer.pack_trie_root_offset(self._serialize(serializer, hashtable, output, merge_subtrees=merge_subtrees))
return output
-# Type map encoding:
-#
-# type 1 | type 2 | | | | type 1 |
-# class | name | class | name | … | padding | end | name | …
-# ID | offset | ID | offset | | | offset | data |
-# 8b | 8b | 8b | 8b | | 8b | 8b | |
-type_map_entry_struct = struct.Struct('<BB')
-
-def serialize_type_map(map: List[Tuple[CssClass, str]]) -> bytearray:
+def serialize_type_map(serializer: Serializer, map: List[Tuple[CssClass, str]]) -> bytearray:
serialized = bytearray()
names = bytearray()
assert len(map) <= 15
# Initial name offset is after all the offset entries plus the final one
- initial_name_offset = (len(map) + 1)*type_map_entry_struct.size
+ initial_name_offset = (len(map) + 1)*serializer.type_map_entry_struct.size
# Add all entries (and the final offset), encode the names separately,
# concatenate at the end
for css_class, name in map:
- serialized += type_map_entry_struct.pack(css_class.value, initial_name_offset + len(names))
+ serialized += serializer.pack_type_map_entry(css_class.value, initial_name_offset + len(names))
names += name.encode('utf-8')
- serialized += type_map_entry_struct.pack(0, initial_name_offset + len(names))
+ serialized += serializer.pack_type_map_entry(0, initial_name_offset + len(names))
assert len(serialized) == initial_name_offset
return serialized + names
-# Whole file encoding:
-#
-# magic | version | symbol | result | type | trie | result | type
-# header | | count | map | map | data | map | map
-# | | | offset | offset | | data | data
-# 24b | 8b | 16b | 32b | 32b | … | … | …
-search_data_header_struct = struct.Struct('<3sBHII')
-
-def serialize_search_data(trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> bytearray:
- serialized_trie = trie.serialize(merge_subtrees=merge_subtrees)
- serialized_map = map.serialize(merge_prefixes=merge_prefixes)
- serialized_type_map = serialize_type_map(type_map)
+def serialize_search_data(serializer: Serializer, trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> bytearray:
+ serialized_trie = trie.serialize(serializer, merge_subtrees=merge_subtrees)
+ serialized_map = map.serialize(serializer, merge_prefixes=merge_prefixes)
+ serialized_type_map = serialize_type_map(serializer, type_map)
- preamble = search_data_header_struct.pack(b'MCS',
- searchdata_format_version, symbol_count,
- search_data_header_struct.size + len(serialized_trie),
- search_data_header_struct.size + len(serialized_trie) + len(serialized_map))
+ preamble = serializer.pack_header(symbol_count, len(serialized_trie), len(serialized_map))
return preamble + serialized_trie + serialized_map + serialized_type_map
def base85encode_search_data(data: bytearray) -> bytearray:
return (b"/* Generated by https://mcss.mosra.cz/documentation/doxygen/. Do not edit. */\n" +
b"Search.load('" + base64.b85encode(data, True) + b"');\n")
-def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, indent, *, show_merged, show_lookahead_barriers, color_map) -> str:
+def _pretty_print_trie(deserializer: Deserializer, serialized: bytearray, hashtable, stats, base_offset, indent, *, show_merged, show_lookahead_barriers, color_map) -> str:
# Visualize where the trees were merged
if show_merged and base_offset in hashtable:
return color_map['red'] + '#' + color_map['reset']
stats.node_count += 1
out = ''
- result_count, child_count = Trie.header_struct.unpack_from(serialized, base_offset)
- # If result count has the high bit set, it's stored in 11 bits and child
- # count in 4 bits instead of 7 + 8
- if result_count & 0x80:
- result_count = (result_count & 0x7f) | ((child_count & 0xf0) << 3)
- child_count = child_count & 0x0f
- stats.max_node_results = max(result_count, stats.max_node_results)
- stats.max_node_children = max(child_count, stats.max_node_children)
- offset = base_offset + Trie.header_struct.size
+ result_ids, child_chars_offsets_barriers, offset = deserializer.unpack_trie_node(serialized, base_offset)
+
+ stats.max_node_results = max(len(result_ids), stats.max_node_results)
+ stats.max_node_children = max(len(child_chars_offsets_barriers), stats.max_node_children)
# print results, if any
- if result_count:
+ if result_ids:
out += color_map['blue'] + ' ['
- for i in range(result_count):
+ for i, result in enumerate(result_ids):
if i: out += color_map['blue']+', '
- result = Trie.result_struct.unpack_from(serialized, offset)[0]
stats.max_node_result_index = max(result, stats.max_node_result_index)
out += color_map['cyan'] + str(result)
- offset += Trie.result_struct.size
out += color_map['blue'] + ']'
# print children, if any
- for i in range(child_count):
- if result_count or i:
+ for i, (char, offset, barrier) in enumerate(child_chars_offsets_barriers):
+ if len(result_ids) or i:
out += color_map['reset'] + '\n'
out += color_map['blue'] + indent + color_map['white']
- char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
if char <= 127:
out += chr(char)
else:
out += color_map['reset'] + hex(char)
- if (show_lookahead_barriers and Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000):
+ if (show_lookahead_barriers and barrier):
out += color_map['green'] + '$'
- if char > 127 or (show_lookahead_barriers and Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000):
+ if char > 127 or (show_lookahead_barriers and barrier):
out += color_map['reset'] + '\n' + color_map['blue'] + indent + ' ' + color_map['white']
- child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
- stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
- offset += Trie.child_struct.size
- out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if child_count > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
- child_count += 1
+ stats.max_node_child_offset = max(offset, stats.max_node_child_offset)
+ out += _pretty_print_trie(deserializer, serialized, hashtable, stats, offset, indent + ('|' if len(child_chars_offsets_barriers) > 1 else ' '), show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
hashtable[base_offset] = True
return out
'yellow': '',
'reset': ''}
-def pretty_print_trie(serialized: bytes, *, show_merged=False, show_lookahead_barriers=True, colors=False):
+def pretty_print_trie(deserializer: Deserializer, serialized: bytes, *, show_merged=False, show_lookahead_barriers=True, colors=False):
color_map = color_map_colors if colors else color_map_dummy
hashtable = {}
stats.max_node_result_index = 0
stats.max_node_child_offset = 0
- out = _pretty_print_trie(serialized, hashtable, stats, Trie.root_offset_struct.unpack_from(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
+ out = _pretty_print_trie(deserializer, serialized, hashtable, stats, deserializer.unpack_trie_root_offset(serialized, 0)[0], '', show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, color_map=color_map)
if out: out = color_map['white'] + out
stats = """
node count: {}
max node child offset: {}""".lstrip().format(stats.node_count, stats.max_node_results, stats.max_node_children, stats.max_node_result_index, stats.max_node_child_offset)
return out, stats
-def pretty_print_map(serialized: bytes, *, entryTypeClass, colors=False):
+def pretty_print_map(deserializer: Deserializer, serialized: bytes, *, entryTypeClass, colors=False):
color_map = color_map_colors if colors else color_map_dummy
# The first item gives out offset of first value, which can be used to
# calculate total value count
- offset = ResultMap.offset_struct.unpack_from(serialized, 0)[0] & 0x00ffffff
- size = int(offset/4 - 1)
+ offset, offset_size = deserializer.unpack_result_map_offset(serialized, 0)
+ size = int((offset - offset_size)/(offset_size + Serializer.result_map_flag_bytes))
+ flags_offset = (size + 1)*offset_size
out = ''
for i in range(size):
if i: out += '\n'
- flags = ResultFlag(ResultMap.flags_struct.unpack_from(serialized, i*4 + 3)[0])
+ flags = ResultFlag(deserializer.unpack_result_map_flags(serialized, flags_offset + i*Serializer.result_map_flag_bytes)[0])
extra = []
if flags & ResultFlag._TYPE == ResultFlag.ALIAS:
- extra += ['alias={}'.format(ResultMap.alias_struct.unpack_from(serialized, offset)[0])]
- offset += ResultMap.alias_struct.size
+ alias, alias_bytes = deserializer.unpack_result_map_alias(serialized, offset)
+ extra += ['alias={}'.format(alias)]
+ offset += alias_bytes
if flags & ResultFlag.HAS_PREFIX:
- extra += ['prefix={}[:{}]'.format(*ResultMap.prefix_struct.unpack_from(serialized, offset))]
- offset += ResultMap.prefix_struct.size
+ prefix_id, prefix_length, prefix_bytes = deserializer.unpack_result_map_prefix(serialized, offset)
+ extra += ['prefix={}[:{}]'.format(prefix_id, prefix_length)]
+ offset += prefix_bytes
if flags & ResultFlag.HAS_SUFFIX:
- extra += ['suffix_length={}'.format(ResultMap.suffix_length_struct.unpack_from(serialized, offset)[0])]
- offset += ResultMap.suffix_length_struct.size
+ suffix_length, suffix_bytes = deserializer.unpack_result_map_suffix_length(serialized, offset)
+ extra += ['suffix_length={}'.format(suffix_length)]
+ offset += suffix_bytes
if flags & ResultFlag.DEPRECATED:
extra += ['deprecated']
if flags & ResultFlag.DELETED:
extra += ['deleted']
if flags & ResultFlag._TYPE:
extra += ['type={}'.format(entryTypeClass(flags.type).name)]
- next_offset = ResultMap.offset_struct.unpack_from(serialized, (i + 1)*4)[0] & 0x00ffffff
+ next_offset = deserializer.unpack_result_map_offset(serialized, (i + 1)*offset_size)[0]
name, _, url = serialized[offset:next_offset].partition(b'\0')
out += color_map['cyan'] + str(i) + color_map['blue'] + ': ' + color_map['white'] + name.decode('utf-8') + color_map['blue'] + ' [' + color_map['yellow'] + (color_map['blue'] + ', ' + color_map['yellow']).join(extra) + color_map['blue'] + '] ->' + (' ' + color_map['reset'] + url.decode('utf-8') if url else '')
offset = next_offset
return out
-def pretty_print_type_map(serialized: bytes, *, entryTypeClass):
+def pretty_print_type_map(deserializer: Deserializer, serialized: bytes, *, entryTypeClass):
# Unpack until we aren't at EOF
i = 0
out = ''
- class_id, offset = type_map_entry_struct.unpack_from(serialized, 0)
- while offset < len(serialized):
+ class_id, name_offset, type_map_bytes = deserializer.unpack_type_map_entry(serialized, 0)
+ while name_offset < len(serialized):
if i: out += ',\n'
- next_class_id, next_offset = type_map_entry_struct.unpack_from(serialized, (i + 1)*type_map_entry_struct.size)
- out += "({}, {}, '{}')".format(entryTypeClass(i + 1), CssClass(class_id), serialized[offset:next_offset].decode('utf-8'))
+ next_class_id, next_name_offset = deserializer.unpack_type_map_entry(serialized, (i + 1)*type_map_bytes)[:2]
+ out += "({}, {}, '{}')".format(entryTypeClass(i + 1), CssClass(class_id), serialized[name_offset:next_name_offset].decode('utf-8'))
i += 1
- class_id, offset = next_class_id, next_offset
+ class_id, name_offset = next_class_id, next_name_offset
return out
def pretty_print(serialized: bytes, *, entryTypeClass, show_merged=False, show_lookahead_barriers=True, colors=False):
- magic, version, symbol_count, map_offset, type_map_offset = search_data_header_struct.unpack_from(serialized)
- assert magic == b'MCS'
- assert version == searchdata_format_version
-
- pretty_trie, stats = pretty_print_trie(serialized[search_data_header_struct.size:map_offset], show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, colors=colors)
- pretty_map = pretty_print_map(serialized[map_offset:type_map_offset], entryTypeClass=entryTypeClass, colors=colors)
- pretty_type_map = pretty_print_type_map(serialized[type_map_offset:], entryTypeClass=entryTypeClass)
- return '{} symbols\n'.format(symbol_count) + pretty_trie + '\n' + pretty_map + '\n' + pretty_type_map, stats
+ deserializer = Deserializer.from_serialized(serialized)
+
+ pretty_trie, stats = pretty_print_trie(deserializer, serialized[Serializer.header_struct.size:deserializer.map_offset], show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, colors=colors)
+ pretty_map = pretty_print_map(deserializer, serialized[deserializer.map_offset:deserializer.type_map_offset], entryTypeClass=entryTypeClass, colors=colors)
+ pretty_type_map = pretty_print_type_map(deserializer, serialized[deserializer.type_map_offset:], entryTypeClass=entryTypeClass)
+ return '{} symbols\n'.format(deserializer.symbol_count) + pretty_trie + '\n' + pretty_map + '\n' + pretty_type_map, stats
from pygments.formatters import HtmlFormatter
from pygments.lexers import TextLexer, BashSessionLexer, get_lexer_by_name, find_lexer_class_for_filename
-from _search import CssClass, ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, search_filename, searchdata_filename, searchdata_filename_b85, searchdata_format_version
+from _search import CssClass, ResultFlag, ResultMap, Trie, Serializer, serialize_search_data, base85encode_search_data, search_filename, searchdata_filename, searchdata_filename_b85, searchdata_format_version
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins'))
import dot2svg
# order by default
trie.sort(map)
- return serialize_search_data(trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes)
+ return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes)
def parse_xml(state: State, xml: str):
# Reset counter for unique math formulas
import jinja2
-from _search import CssClass, ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, searchdata_format_version, search_filename, searchdata_filename, searchdata_filename_b85
+from _search import CssClass, ResultFlag, ResultMap, Trie, Serializer, serialize_search_data, base85encode_search_data, searchdata_format_version, search_filename, searchdata_filename, searchdata_filename_b85
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins'))
import m.htmlsanity
# order by default
trie.sort(map)
- return serialize_search_data(trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes)
+ return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes)
def run(basedir, config, *, templates=default_templates, search_add_lookahead_barriers=True, search_merge_subtrees=True, search_merge_prefixes=True):
# Populate the INPUT, if not specified, make it absolute
"use strict"; /* it summons the Cthulhu in a proper way, they say */
var Search = {
- formatVersion: 1, /* the data filename contains this number too */
+ formatVersion: 2, /* the data filename contains this number too */
dataSize: 0, /* used mainly by tests, not here */
symbolCount: '…',
trie: null,
map: null,
+ mapFlagsOffset: null,
typeMap: null,
maxResults: 0,
+ /* Type sizes and masks. The data is always fetched as a 16/32bit number
+ and then masked to 1, 2, 3 or 4 bytes. Fortunately on LE a mask is
+ enough; on BE we'd have to start reading N bytes earlier and then
+ mask. */
+ nameSizeBytes: null,
+ nameSizeMask: null,
+ resultIdBytes: null,
+ resultIdMask: null,
+ fileOffsetBytes: null,
+ fileOffsetMask: null,
+ lookaheadBarrierMask: null,
+
/* Always contains at least the root node offset and then one node offset
per entered character */
searchString: '',
/* The file is too short to contain at least the headers and empty
sections */
- if(view.byteLength < 26) {
+ if(view.byteLength < 31) {
console.error("Search data too short");
return false;
}
return false;
}
- /* Separate the data into the trie and the result / type map */
- let mapOffset = view.getUint32(6, true);
- let typeMapOffset = view.getUint32(10, true);
- this.trie = new DataView(buffer, 14, mapOffset - 14);
- this.map = new DataView(buffer, mapOffset, typeMapOffset - mapOffset);
+ /* Fetch type sizes. The only value that can fail is result ID byte
+ count, where a value of 3 has no assigned meaning. */
+ let typeSizes = view.getUint8(4);
+ if((typeSizes & 0x01) >> 0 == 0) {
+ this.fileOffsetBytes = 3;
+ this.fileOffsetMask = 0x00ffffff;
+ this.lookaheadBarrierMask = 0x00800000;
+ } else /* (typeSizes & 0x01) >> 0 == 1 */ {
+ this.fileOffsetBytes = 4;
+ this.fileOffsetMask = 0xffffffff;
+ this.lookaheadBarrierMask = 0x80000000;
+ }
+ if((typeSizes & 0x06) >> 1 == 0) {
+ this.resultIdBytes = 2;
+ this.resultIdMask = 0x0000ffff;
+ } else if((typeSizes & 0x06) >> 1 == 1) {
+ this.resultIdBytes = 3;
+ this.resultIdMask = 0x00ffffff;
+ } else if((typeSizes & 0x06) >> 1 == 2) {
+ this.resultIdBytes = 4;
+ this.resultIdMask = 0xffffffff;
+ } else /* (typeSizes & 0x06) >> 1 == 3 */ {
+ console.error("Invalid search data result ID byte value");
+ return false;
+ }
+ if((typeSizes & 0x08) >> 3 == 0) {
+ this.nameSizeBytes = 1;
+ this.nameSizeMask = 0x00ff;
+ } else /* (typeSizes & 0x08) >> 3 == 1 */ {
+ this.nameSizeBytes = 2;
+ this.nameSizeMask = 0xffff;
+ }
+
+ /* Separate the data into the trie and the result / type map. Because
+ we're reading larger values than there might be and then masking out
+ the high bytes, keep an extra byte or two of padding at the end to
+ avoid OOB errors. */
+ let mapOffset = view.getUint32(12, true);
+ let typeMapOffset = view.getUint32(16, true);
+ /* There may be a 3-byte file offset at the end of the trie which we'll
+ read as 32-bit, add one safety byte in that case */
+ this.trie = new DataView(buffer, 20, mapOffset - 20 + (4 - this.fileOffsetBytes));
+ /* There may be a 3-byte file size (for zero results) which we'll read
+ as 32-bit, add one safety byte in that case */
+ this.map = new DataView(buffer, mapOffset, typeMapOffset - mapOffset + (4 - this.fileOffsetBytes));
+ /* No variable-size types in the type map at the moment */
this.typeMap = new DataView(buffer, typeMapOffset);
+ /* Offset of the first result map item is after N + 1 offsets and N
+ flags, calculate flag offset from that */
+ this.mapFlagsOffset = this.fileOffsetBytes*(((this.map.getUint32(0, true) & this.fileOffsetMask) - this.fileOffsetBytes)/(this.fileOffsetBytes + 1) + 1);
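+ /* E.g. with 100 entries and 3-byte file offsets the first item offset
+ is 100 + 101*3 = 403, so the flags start at 3*((403 - 3)/4 + 1) = 303,
+ i.e. right after the 101 offsets. */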
+
/* Set initial properties */
this.dataSize = buffer.byteLength;
- this.symbolCount = view.getUint16(4, true) + " symbols (" + Math.round(this.dataSize/102.4)/10 + " kB)";
+ this.symbolCount = view.getUint32(8, true) + " symbols (" + Math.round(this.dataSize/102.4)/10 + " kB)";
this.maxResults = maxResults ? maxResults : 100;
this.searchString = '';
this.searchStack = [this.trie.getUint32(0, true)];
/* Calculate offset and count of children */
let offset = this.searchStack[this.searchStack.length - 1];
- /* Calculate child count. If there's a lot of results, the count
- "leaks over" to the child count storage. */
+ /* If there's a lot of results, the result count is a 16bit BE value
+ instead */
let resultCount = this.trie.getUint8(offset);
- let childCount = this.trie.getUint8(offset + 1);
+ let resultCountSize = 1;
if(resultCount & 0x80) {
- resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3);
- childCount = childCount & 0x0f;
+ resultCount = this.trie.getUint16(offset, false) & ~0x8000;
+ ++resultCountSize;
}
+ let childCount = this.trie.getUint8(offset + resultCountSize);
+
/* Go through all children and find the next offset */
- let childOffset = offset + 2 + resultCount*2;
+ let childOffset = offset + resultCountSize + 1 + resultCount*this.resultIdBytes;
let found = false;
for(let j = 0; j != childCount; ++j) {
- if(String.fromCharCode(this.trie.getUint8(childOffset + j*4 + 3)) != searchString[foundPrefix])
+ if(String.fromCharCode(this.trie.getUint8(childOffset + j)) != searchString[foundPrefix])
continue;
- this.searchStack.push(this.trie.getUint32(childOffset + j*4, true) & 0x007fffff);
+ this.searchStack.push(this.trie.getUint32(childOffset + childCount + j*this.fileOffsetBytes, true) & this.fileOffsetMask & ~this.lookaheadBarrierMask);
found = true;
break;
}
"leaks over" to the child count storage. */
/* TODO: hmmm. this is helluvalot duplicated code. hmm. */
let resultCount = this.trie.getUint8(offset);
- let childCount = this.trie.getUint8(offset + 1);
+ let resultCountSize = 1;
if(resultCount & 0x80) {
- resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3);
- childCount = childCount & 0x0f;
+ resultCount = this.trie.getUint16(offset, false) & ~0x8000;
+ ++resultCountSize;
}
+ let childCount = this.trie.getUint8(offset + resultCountSize);
+
/* Populate the results with all values associated with this node */
for(let i = 0; i != resultCount; ++i) {
- let index = this.trie.getUint16(offset + 2 + i*2, true);
+ let index = this.trie.getUint32(offset + resultCountSize + 1 + i*this.resultIdBytes, true) & this.resultIdMask;
results.push(this.gatherResult(index, suffixLength, 0xffffff)); /* should be enough haha */
/* 'nuff said. */
}
/* Dig deeper */
- let childOffset = offset + 2 + resultCount*2;
+ let childOffset = offset + resultCountSize + 1 + resultCount*this.resultIdBytes;
for(let j = 0; j != childCount; ++j) {
- let offsetBarrier = this.trie.getUint32(childOffset + j*4, true);
+ let offsetBarrier = this.trie.getUint32(childOffset + childCount + j*this.fileOffsetBytes, true) & this.fileOffsetMask;
/* Lookahead barrier, don't dig deeper */
- if(offsetBarrier & 0x00800000) continue;
+ if(offsetBarrier & this.lookaheadBarrierMask) continue;
/* Append to the queue */
- leaves.push([offsetBarrier & 0x007fffff, suffixLength + 1]);
+ leaves.push([offsetBarrier & ~this.lookaheadBarrierMask, suffixLength + 1]);
/* We don't have anything yet and this is the only path
forward, add the char to suggested Tab autocompletion. Can't
absolutely unwanted when all I want is check for truncated
UTF-8. */
if(!results.length && leaves.length == 1 && childCount == 1)
- suggestedTabAutocompletionChars.push(this.trie.getUint8(childOffset + j*4 + 3));
+ suggestedTabAutocompletionChars.push(this.trie.getUint8(childOffset + j));
}
}
},
gatherResult: function(index, suffixLength, maxUrlPrefix) {
- let flags = this.map.getUint8(index*4 + 3);
- let resultOffset = this.map.getUint32(index*4, true) & 0x00ffffff;
+ let flags = this.map.getUint8(this.mapFlagsOffset + index);
+ let resultOffset = this.map.getUint32(index*this.fileOffsetBytes, true) & this.fileOffsetMask;
/* The result is an alias, parse the aliased prefix */
let aliasedIndex = null;
if((flags & 0xf0) == 0x00) {
- aliasedIndex = this.map.getUint16(resultOffset, true);
- resultOffset += 2;
+ aliasedIndex = this.map.getUint32(resultOffset, true) & this.resultIdMask;
+ resultOffset += this.resultIdBytes;
}
/* The result has a prefix, parse that first, recursively */
let name = '';
let url = '';
if(flags & (1 << 3)) {
- let prefixIndex = this.map.getUint16(resultOffset, true);
- let prefixUrlPrefixLength = Math.min(this.map.getUint8(resultOffset + 2), maxUrlPrefix);
+ let prefixIndex = this.map.getUint32(resultOffset, true) & this.resultIdMask;
+ let prefixUrlPrefixLength = Math.min(this.map.getUint16(resultOffset + this.resultIdBytes, true) & this.nameSizeMask, maxUrlPrefix);
let prefix = this.gatherResult(prefixIndex, 0 /*ignored*/, prefixUrlPrefixLength);
name = prefix.name;
url = prefix.url;
- resultOffset += 3;
+ resultOffset += this.resultIdBytes + this.nameSizeBytes;
}
/* The result has a suffix, extract its length */
let resultSuffixLength = 0;
if(flags & (1 << 0)) {
- resultSuffixLength = this.map.getUint8(resultOffset);
- ++resultOffset;
+ resultSuffixLength = this.map.getUint16(resultOffset, true) & this.nameSizeMask;
+ resultOffset += this.nameSizeBytes;
}
- let nextResultOffset = this.map.getUint32((index + 1)*4, true) & 0x00ffffff;
+ let nextResultOffset = this.map.getUint32((index + 1)*this.fileOffsetBytes, true) & this.fileOffsetMask;
/* Extract name */
let j = resultOffset;
(CssClass.PRIMARY, "class"),
(CssClass.INFO, "func")
]
+
+# Tries don't store any strings, so name_size_bytes can be whatever
+trie_type_sizes = [
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 2,
+ 'name_size_bytes': 1},
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 3,
+ 'name_size_bytes': 1},
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 4,
+ 'name_size_bytes': 1},
+
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 2,
+ 'name_size_bytes': 1},
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 3,
+ 'name_size_bytes': 1},
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 4,
+ 'name_size_bytes': 1},
+]
+
+type_sizes = trie_type_sizes + [
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 2,
+ 'name_size_bytes': 2},
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 3,
+ 'name_size_bytes': 2},
+ {'file_offset_bytes': 3,
+ 'result_id_bytes': 4,
+ 'name_size_bytes': 2},
+
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 2,
+ 'name_size_bytes': 2},
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 3,
+ 'name_size_bytes': 2},
+ {'file_offset_bytes': 4,
+ 'result_id_bytes': 4,
+ 'name_size_bytes': 2},
+]
--- /dev/null
+O+!-x000002LJ#7zySaN&H?}cmH_|&0RRC200Aik00001C<p)m00C|e00001X%qke0RRI400Ai=00001C?o&?00C|)00001X)pi)00C__00001Iy?XX0RaL4Izj+|00DAH00001Z%_aL00DGX00001V_X0L00CuU000311poj6DQ*A&00Ag+00001Zg>Cy00C)!00001ZG->-00BCR000321OPga0Du4iWtIQ{00C#700001ZlnMJ00CjD00003ZFX`R003wJ0I&c600BC-000320026~0Du4iXvzQp00DH;000930RRI41poj6Dc%4800AiG00001ZtMU600LoY*Z=_X000312mk;9WdZ>J00C?U0RR92XAA)V00C|i0RRC32>@Xj0RR92bRGc!00Cnr0RR93VP&cS04xCj0RRU800Ct@0RR92XFve}00Ch}0RR92a7+OJ00ClB0RR92byxua00DDe0RR92AY=gm00Fyd0RR92!f*iq0RaX8Aa((O00Cuu0RR92XM_O&00Ci&0RR96ZFX{SbNB!NXaE2*0RWHz0A2wAD*ymO003G50Db@flmGy>007JY0NMZm^#A|>0RRpG03b;^NjOaq7yt=PVRUE!ZeeX@b8ul}WldppXf9}UZEOGl5(qjvZE0>OX>N2ZAZc!NDF7pFX>I@j06IEWWn*-2asXp&VRLg$VRUF;F<&uOWn*-2axQ3eZEOMn7zR2zZE0>ODIjBSZgX@1BW-DJ0000wI#OY7XJr6mY+-YAO<{CsUol@XQekdqWiDuRZEOSp7X~^yZE0>ODIjBSZgX@1BW-DJP+@0f0B~VvWiDuRZEOYr03gD<AX9Z>aA9X<0CRO>aA9X<E@*UZYy<#OWn*+<Zf9&|1ONyC00KHXQe|UwC@BI80S*Bd1snh%aA9X<ZeeX@b8ul}Wn*k%b8}{OZesud
\ No newline at end of file
+++ /dev/null
-O+!-w2LQSO007AX005Q&000310RR921ONaj009U904M+f4gdgd009&L0BHdL0{{R4AOHX<00ATb04M+fDgXd(00A%n0BHaLHUI!^00BGz06GBy0suk)fI0vHNB{tG00B?{0B-;RRsaBW00CS80Am0FVgLYT0RRO600C|Q04V?gasU7*00DRa0B!&QegFVz00D#m0BryPiU0sQ0RaR6kN|)>00EW&0A&CHo&W%600E=`0B!&QssI3C00SBT0BvXh0Cund0CE5Uwg3P+0RaF2!~lRg00GJX0B8UK(f|N-0{{U40{{g800G_r04V?g<^TXF00Ha(0B!&R*Z=@w@&Ev70RRX9009C40A&CH1_1zU009gE0A~OJ5&-~i0RagB7y$rb00ABW0CWHWCIJ9r00OE20AVZv0A&FH2LJ#8JOKb@00BS&0A~OJMgag}00B$^0B`^SQUL&B00CG50CfNXUI74e00CqH03ZMXY5@Sd00D3T0Kx$Q1^{*efFJ+?d;tJu00D#n0A~OJiU9y&00sB}0BvXh0Cq9~0CJE40B~Lb0COw=03bsE07+W_06KpF07;bq064b*08PyR01(>%02uZF0003200|EP002#4bZ7u>VQpn|aA9L*O<{CsE@*UZYybcf2s%1#X>KTKZgealX>N2W03&T_ZU6uPIyzQmV{~tF0Ap-nb8}5$bZB2OUolo?V{~tFE@*UZYyton20A)zX>KSfAY*TCb94YBZE0=*0025VQekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYy<!o20A)zX>KSfAY*TCb94YBZE0>$VP|CkaA9X<E@*UZYz6=TAi}#KQ*~l+VP|Ckb9G{HVP|D7Xmo9C1OQTHV{~C|XKZBz00;m80y;WUWn*+GDFO-s4gnVh8~`A2VP|D-VQpn|aA9L*V{Bn_b7pmJV*mgE
\ No newline at end of file
-#############
\ No newline at end of file
+##############################
\ No newline at end of file
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__))))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
-from _search_test_metadata import EntryType, search_type_map
-from _search import Trie, ResultMap, ResultFlag, serialize_search_data, search_data_header_struct
+from _search_test_metadata import EntryType, search_type_map, type_sizes
+from _search import Trie, ResultMap, ResultFlag, serialize_search_data, Serializer
basedir = pathlib.Path(os.path.dirname(os.path.realpath(__file__)))/'js-test-data'
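+# Produces e.g. 'ns1-ri2-fo3' for the smallest type sizes; used in the data
+# file names checked by the JS tests.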
+def type_size_suffix(*, name_size_bytes, result_id_bytes, file_offset_bytes):
+ return f'ns{name_size_bytes}-ri{result_id_bytes}-fo{file_offset_bytes}'
+
# Basic error handling
+min_size = len(serialize_search_data(Serializer(name_size_bytes=1, result_id_bytes=2, file_offset_bytes=3), Trie(), ResultMap(), [], 0))
+
with open(basedir/'short.bin', 'wb') as f:
- f.write(b'#'*(search_data_header_struct.size - 1))
+ f.write(b'#'*(min_size - 1))
with open(basedir/'wrong-magic.bin', 'wb') as f:
- f.write(b'MOS\1 ')
+ f.write(b'MOS\2')
+ f.write(b'\0'*(min_size - 4))
with open(basedir/'wrong-version.bin', 'wb') as f:
- f.write(b'MCS\0 ')
-with open(basedir/'empty.bin', 'wb') as f:
- f.write(serialize_search_data(Trie(), ResultMap(), [], 0))
+ f.write(b'MCS\1')
+ f.write(b'\0'*(min_size - 4))
+with open(basedir/'wrong-result-id-bytes.bin', 'wb') as f:
+ f.write(Serializer.header_struct.pack(b'MCS', 2, 3 << 1, 0, 0, 0))
+ f.write(b'\0'*(min_size - Serializer.header_struct.size))
+
+# Empty file, in all possible type size combinations
-# General test
+for i in type_sizes:
+ with open(basedir/'empty-{}.bin'.format(type_size_suffix(**i)), 'wb') as f:
+ f.write(serialize_search_data(Serializer(**i), Trie(), ResultMap(), [], 0))
+
+# General test, in all possible type size combinations
trie = Trie()
map = ResultMap()
trie.insert("rectangle", map.add("Rectangle", "", alias=range_index))
trie.insert("rect", map.add("Rectangle::Rect()", "", suffix_length=2, alias=range_index))
-with open(basedir/'searchdata.bin', 'wb') as f:
- f.write(serialize_search_data(trie, map, search_type_map, 7))
-with open(basedir/'searchdata.b85', 'wb') as f:
- f.write(base64.b85encode(serialize_search_data(trie, map, search_type_map, 7), True))
+for i in type_sizes:
+ with open(basedir/'searchdata-{}.bin'.format(type_size_suffix(**i)), 'wb') as f:
+ f.write(serialize_search_data(Serializer(**i), trie, map, search_type_map, 7))
+
+# The Base-85 file however doesn't need to have all type size variants as it's
+# just used to verify it decodes to the right binary variant
+with open(basedir/'searchdata-{}.b85'.format(type_size_suffix(**type_sizes[0])), 'wb') as f:
+ f.write(base64.b85encode(serialize_search_data(Serializer(**type_sizes[0]), trie, map, search_type_map, 7), True))
-# UTF-8 names
+# UTF-8 names, nothing size-dependent here so just one variant
trie = Trie()
map = ResultMap()
trie.insert("hárá", map.add("Hárá", "#b", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.PAGE)))
with open(basedir/'unicode.bin', 'wb') as f:
- f.write(serialize_search_data(trie, map, search_type_map, 2))
+ f.write(serialize_search_data(Serializer(**type_sizes[0]), trie, map, search_type_map, 2))
-# Heavy prefix nesting
+# Heavy prefix nesting, nothing size-dependent here so just one variant
trie = Trie()
map = ResultMap()
trie.insert("range", map.add("Magnum::Math::Range", "classMagnum_1_1Math_1_1Range.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)))
with open(basedir/'nested.bin', 'wb') as f:
- f.write(serialize_search_data(trie, map, search_type_map, 4))
+ f.write(serialize_search_data(Serializer(**type_sizes[0]), trie, map, search_type_map, 4))
-# Extreme amount of search results (Python's __init__, usually)
+# Extreme amount of search results (Python's __init__, usually), in all
+# possible type size combinations
trie = Trie()
map = ResultMap()
for i in [3, 15, 67]:
trie.insert("__init__subclass__", map.add(f"Foo{i}.__init__subclass__(self)", f"Foo{i}.html#__init__subclass__", suffix_length=6, flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)))
-with open(basedir/'manyresults.bin', 'wb') as f:
- f.write(serialize_search_data(trie, map, search_type_map, 128 + 3))
+for i in type_sizes:
+ with open(basedir/'manyresults-{}.bin'.format(type_size_suffix(**i)), 'wb') as f:
+ f.write(serialize_search_data(Serializer(**i), trie, map, search_type_map, 128 + 3))
assert.deepEqual(Buffer.from(buf), Buffer.from([0, 0, 0, 0]));
}
-/* Verify that base85-decoded file is equivalent to the binary */
+let type_size_suffixes = [
+ 'ns1-ri2-fo3',
+ 'ns1-ri2-fo4',
+ 'ns1-ri3-fo3',
+ 'ns1-ri3-fo4',
+ 'ns1-ri4-fo3',
+ 'ns1-ri4-fo4',
+
+ 'ns2-ri2-fo3',
+ 'ns2-ri2-fo4',
+ 'ns2-ri3-fo3',
+ 'ns2-ri3-fo4',
+ 'ns2-ri4-fo3',
+ 'ns2-ri4-fo4',
+]
+
+/* Verify that the base85-decoded file is equivalent to the binary. Nothing
+ type-size-dependent in the decoder, so test just on the first variant. */
{
- let binary = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin"));
- assert.equal(binary.byteLength, 745);
- let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.b85"), {encoding: 'utf-8'});
+ let binary = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".bin"));
+ assert.equal(binary.byteLength, 750);
+ let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".b85"), {encoding: 'utf-8'});
assert.deepEqual(new DataView(binary.buffer.slice(binary.byteOffset, binary.byteOffset + binary.byteLength)), new DataView(Search.base85decode(b85), 0, binary.byteLength));
}
assert.ok(!Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
}
-/* Search with empty data */
+/* Opening file with wrong result id byte count */
{
- let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/empty.bin"));
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/wrong-result-id-bytes.bin"));
+ assert.ok(!Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
+}
+
+/* Search with empty data, in all type size variants */
+for(let i = 0; i != type_size_suffixes.length; ++i) {
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/empty-" + type_size_suffixes[i] + ".bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
- assert.equal(Search.dataSize, 26);
+
+ /* Test just the smallest and largest size, everything else should be in
+ between */
+ if(i == 0)
+ assert.equal(Search.dataSize, 31);
+ else if(i == type_size_suffixes.length - 1)
+ assert.equal(Search.dataSize, 32);
+ else {
+ assert.ok(Search.dataSize >= 31 && Search.dataSize <= 32);
+ }
+
assert.equal(Search.symbolCount, "0 symbols (0 kB)");
assert.deepEqual(Search.search(''), [[], '']);
}
-/* Search */
-{
- let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin"));
+/* Search, in all type size variants */
+for(let i = 0; i != type_size_suffixes.length; ++i) {
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[i] + ".bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
- assert.equal(Search.dataSize, 745);
- assert.equal(Search.symbolCount, "7 symbols (0.7 kB)");
+
+ /* Test just the smallest and largest size, everything else should be in
+ between */
+ if(i == 0) {
+ assert.equal(Search.dataSize, 750);
+ assert.equal(Search.symbolCount, "7 symbols (0.7 kB)");
+ } else if(i == type_size_suffixes.length - 1) {
+ assert.equal(Search.dataSize, 883);
+ assert.equal(Search.symbolCount, "7 symbols (0.9 kB)");
+ } else {
+ assert.ok(Search.dataSize > 750 && Search.dataSize < 883);
+ }
+
assert.equal(Search.maxResults, 100);
/* Blow up */
suffixLength: 8 }], '']);
}
-/* Search with spaces */
+/* Search with spaces. Nothing type-size-dependent here, so test just on the
+ first variant. */
{
- let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin"));
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
- assert.equal(Search.dataSize, 745);
+ assert.equal(Search.dataSize, 750);
assert.equal(Search.symbolCount, "7 symbols (0.7 kB)");
assert.equal(Search.maxResults, 100);
suffixLength: 10 }], Search.toUtf8('» subpage')]);
}
-/* Search, limiting the results to 3 */
+/* Search, limiting the results to 3. Nothing type-size-dependent here, so test
+ just on the first variant. */
{
- let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.bin"));
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3));
- assert.equal(Search.dataSize, 745);
+ assert.equal(Search.dataSize, 750);
assert.equal(Search.symbolCount, "7 symbols (0.7 kB)");
assert.equal(Search.maxResults, 3);
assert.deepEqual(Search.search('m'), [[
suffixLength: 10 }], '']);
}
-/* Search loaded from a base85-encoded file should work properly */
+/* Search loaded from a base85-encoded file should work properly. Nothing
+ type-size-dependent here, so test just on the first variant. */
{
- let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata.b85"), {encoding: 'utf-8'});
+ let b85 = fs.readFileSync(path.join(__dirname, "js-test-data/searchdata-" + type_size_suffixes[0] + ".b85"), {encoding: 'utf-8'});
assert.ok(Search.load(b85));
- assert.equal(Search.dataSize, 748); /* some padding on the end, that's okay */
+ assert.equal(Search.dataSize, 752); /* some padding on the end, that's okay */
assert.equal(Search.symbolCount, "7 symbols (0.7 kB)");
assert.equal(Search.maxResults, 100);
assert.deepEqual(Search.search('min'), [[
suffixLength: 8 }], '()']);
}
-/* Search, Unicode */
+/* Search, Unicode. Nothing type-size-dependent here. */
{
let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
- assert.equal(Search.dataSize, 160);
+ assert.equal(Search.dataSize, 165);
assert.equal(Search.symbolCount, "2 symbols (0.2 kB)");
/* Both "Hýždě" and "Hárá" have common autocompletion to "h\xA1", which is
not valid UTF-8, so it has to get truncated */
suffixLength: 3 }], Search.toUtf8('rá')]);
}
-/* Properly combine heavily nested URLs */
+/* Properly combine heavily nested URLs. Nothing type-size-dependent here. */
{
let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/nested.bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)));
- assert.equal(Search.dataSize, 331);
+ assert.equal(Search.dataSize, 336);
assert.equal(Search.symbolCount, "4 symbols (0.3 kB)");
assert.deepEqual(Search.search('geo'), [[
{ name: 'Magnum::Math::Geometry',
suffixLength: 3 }], 'nge']);
}
-/* Extreme amount of search results */
-{
- let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/manyresults.bin"));
+/* Extreme amount of search results, in all type size variants to ensure no
+ size assumptions were left there */
+for(let i = 0; i != type_size_suffixes.length; ++i) {
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/manyresults-" + type_size_suffixes[i] + ".bin"));
assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 10000));
- assert.equal(Search.dataSize, 6415);
- assert.equal(Search.symbolCount, "131 symbols (6.3 kB)");
+
+ /* Test just the smallest and largest size, everything else should be in
+ between */
+ if(i == 0) {
+ assert.equal(Search.dataSize, 6421);
+ assert.equal(Search.symbolCount, "131 symbols (6.3 kB)");
+ } else if(i == type_size_suffixes.length - 1) {
+ assert.equal(Search.dataSize, 6964);
+ assert.equal(Search.symbolCount, "131 symbols (6.8 kB)");
+ } else {
+ assert.ok(Search.dataSize > 6421 && Search.dataSize < 6964);
+ }
+
assert.equal(Search.maxResults, 10000);
assert.deepEqual(Search.search('__init__')[0].length, 128 + 3);
assert.deepEqual(Search.search('__init__')[1], '');
import unittest
from types import SimpleNamespace as Empty
-from ._search_test_metadata import EntryType, search_type_map
-from _search import Trie, ResultMap, ResultFlag, serialize_search_data, pretty_print_trie, pretty_print_map, pretty_print
+from ._search_test_metadata import EntryType, search_type_map, trie_type_sizes, type_sizes
+from _search import Trie, ResultMap, ResultFlag, Serializer, Deserializer, serialize_search_data, pretty_print_trie, pretty_print_map, pretty_print
from test_doxygen import IntegrationTestCase
super().__init__(*args, **kwargs)
self.maxDiff = None
- def compare(self, serialized: bytes, expected: str):
- pretty = pretty_print_trie(serialized)[0]
+ def compare(self, deserializer: Deserializer, serialized: bytes, expected: str):
+ pretty = pretty_print_trie(deserializer, serialized)[0]
#print(pretty)
self.assertEqual(pretty, expected.strip())
def test_empty(self):
trie = Trie()
- serialized = trie.serialize()
- self.compare(serialized, "")
- self.assertEqual(len(serialized), 6)
+ for i in trie_type_sizes:
+ with self.subTest(**i):
+ serialized = trie.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, "")
+ self.assertEqual(len(serialized), 6)
def test_single(self):
trie = Trie()
trie.insert("magnum", 1337)
trie.insert("magnum", 21)
- serialized = trie.serialize()
- self.compare(serialized, """
+ for i in trie_type_sizes:
+ with self.subTest(**i):
+ serialized = trie.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, """
magnum [1337, 21]
""")
- self.assertEqual(len(serialized), 46)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between
+ if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2:
+ self.assertEqual(len(serialized), 46)
+ elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4:
+ self.assertEqual(len(serialized), 56)
+ else:
+ self.assertGreater(len(serialized), 46)
+ self.assertLess(len(serialized), 56)
def test_multiple(self):
trie = Trie()
trie.insert("range::max", 10)
trie.insert("max", 10)
- serialized = trie.serialize()
- self.compare(serialized, """
+ for i in trie_type_sizes:
+ with self.subTest(**i):
+ serialized = trie.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, """
math [0]
||| :$
||| :vector [1]
| :min [9]
| ax [10]
""")
- self.assertEqual(len(serialized), 340)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between
+ if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2:
+ self.assertEqual(len(serialized), 340)
+ elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4:
+ self.assertEqual(len(serialized), 428)
+ else:
+ self.assertGreater(len(serialized), 340)
+ self.assertLess(len(serialized), 428)
def test_unicode(self):
trie = Trie()
trie.insert("hýždě", 0)
trie.insert("hárá", 1)
- serialized = trie.serialize()
- self.compare(serialized, """
+ serialized = trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+ self.compare(Deserializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), serialized, """
h0xc3
0xbd
0xc5
""")
self.assertEqual(len(serialized), 82)
- def test_many_results(self):
+ def test_16bit_result_count(self):
trie = Trie()
for i in range(128):
for i in [203, 215, 267]:
trie.insert("__init__subclass__", i)
- serialized = trie.serialize()
- self.compare(serialized, """
+ for i in trie_type_sizes:
+ with self.subTest(**i):
+ serialized = trie.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, """
__init__ [{}]
subclass__ [203, 215, 267]
""".format(', '.join([str(i) for i in range(128)])))
- self.assertEqual(len(serialized), 376)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between
+ if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2:
+ self.assertEqual(len(serialized), 377)
+ elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4:
+ self.assertEqual(len(serialized), 657)
+ else:
+ self.assertGreater(len(serialized), 377)
+ self.assertLess(len(serialized), 657)
+
+ def test_16bit_result_id_too_small(self):
+ trie = Trie()
+ trie.insert("a", 65536)
+ with self.assertRaises(OverflowError):
+ trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+ def test_24bit_result_id_too_small(self):
+ trie = Trie()
+ trie.insert("a", 16*1024*1024)
+ with self.assertRaises(OverflowError):
+ trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+ # This should work
+ trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=4, name_size_bytes=1))
+
+ def test_23bit_file_offset_too_small(self):
+ trie = Trie()
+
+        # The high bit of the child offset stores a lookahead barrier, so
+        # usable offsets are only 23 bits -- the file has to be smaller than
+        # 8M, not 16. Python has a recursion limit of 1000, so we can't
+        # really insert an 8M-character-long string. Instead, insert 130
+        # successively longer prefixes of an 'aaa...' string, each with 32k
+        # 16bit result IDs. 129 isn't enough to overflow the offsets.
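+        # As a rough sanity check (an estimate, not an exact formula): each
+        # node's result array alone is 32767*2 ~= 64 kB, so 130 nodes are
+        # ~8.5 MB of data and the child offsets stored near the end of the
+        # file no longer fit into 23 bits (2^23 = 8388608 bytes).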
+ results_32k = [j for j in range(32767)]
+ for i in range(130):
+ trie.insert('a'*i, results_32k)
+
+ with self.assertRaises(OverflowError):
+ trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ trie.serialize(Serializer(file_offset_bytes=4, result_id_bytes=2, name_size_bytes=1))
class MapSerialization(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.maxDiff = None
- def compare(self, serialized: bytes, expected: str):
- pretty = pretty_print_map(serialized, entryTypeClass=EntryType)
+ def compare(self, deserializer: Deserializer, serialized: bytes, expected: str):
+ pretty = pretty_print_map(deserializer, serialized, entryTypeClass=EntryType)
#print(pretty)
self.assertEqual(pretty, expected.strip())
def test_empty(self):
map = ResultMap()
- serialized = map.serialize()
- self.compare(serialized, "")
- self.assertEqual(len(serialized), 4)
+ for i in type_sizes:
+ with self.subTest(**i):
+ serialized = map.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, "")
+ self.assertEqual(len(serialized), i['file_offset_bytes'])
def test_single(self):
map = ResultMap()
+
self.assertEqual(map.add("Magnum", "namespaceMagnum.html", suffix_length=11, flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.NAMESPACE)), 0)
- serialized = map.serialize()
- self.compare(serialized, """
+ for i in type_sizes:
+ with self.subTest(**i):
+ serialized = map.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, """
0: Magnum [suffix_length=11, type=NAMESPACE] -> namespaceMagnum.html
""")
- self.assertEqual(len(serialized), 36)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between. The `result_id_bytes` don't affect
+ # this case.
+ if i['file_offset_bytes'] == 3 and i['name_size_bytes'] == 1:
+ self.assertEqual(len(serialized), 35)
+ elif i['file_offset_bytes'] == 4 and i['name_size_bytes'] == 2:
+ self.assertEqual(len(serialized), 38)
+ else:
+ self.assertGreater(len(serialized), 35)
+ self.assertLess(len(serialized), 38)
def test_multiple(self):
map = ResultMap()
self.assertEqual(map.add("Rectangle", "", alias=2), 5)
self.assertEqual(map.add("Rectangle::Rect()", "", suffix_length=2, alias=2), 6)
- serialized = map.serialize()
- self.compare(serialized, """
+ for i in type_sizes:
+ with self.subTest(**i):
+ serialized = map.serialize(Serializer(**i))
+ self.compare(Deserializer(**i), serialized, """
0: Math [type=NAMESPACE] -> namespaceMath.html
1: ::Vector [prefix=0[:0], type=CLASS] -> classMath_1_1Vector.html
2: ::Range [prefix=0[:0], type=CLASS] -> classMath_1_1Range.html
5: Rectangle [alias=2] ->
6: ::Rect() [alias=2, prefix=5[:0], suffix_length=2] ->
""")
- self.assertEqual(len(serialized), 203)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between
+ if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2 and i['name_size_bytes'] == 1:
+ self.assertEqual(len(serialized), 202)
+ elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4 and i['name_size_bytes'] == 2:
+ self.assertEqual(len(serialized), 231)
+ else:
+ self.assertGreater(len(serialized), 202)
+ self.assertLess(len(serialized), 231)
+
+ def test_24bit_file_offset_too_small(self):
+ map = ResultMap()
+ # 3 bytes for the initial offset, 3 bytes for file size, 1 byte for the
+ # flags, 1 byte for the null terminator, 6 bytes for the URL
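+        # That's 3 + 3 + 1 + 1 + 6 = 14 bytes of fixed overhead, so a name of
+        # 16M - 14 characters makes the file exactly 2^24 bytes long, which
+        # no longer fits into a 24bit offset / file size field.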
+ map.add('F'*(16*1024*1024 - 14), 'f.html', flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS))
+
+ with self.assertRaises(OverflowError):
+            # Disabling prefix merging, otherwise memory usage goes to hell
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), merge_prefixes=False)
+
+        # This should work. Disabling prefix merging, otherwise memory usage
+        # goes to hell.
+ map.serialize(Serializer(file_offset_bytes=4, result_id_bytes=2, name_size_bytes=1), merge_prefixes=False)
+
+ def test_8bit_suffix_length_too_small(self):
+ map = ResultMap()
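+        # A suffix_length of 256 doesn't fit into an 8bit name size field
+        # (max 255), so serializing with name_size_bytes=1 is expected to
+        # overflow, while name_size_bytes=2 is fine.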
+ map.add("F()" + ';'*256, "f.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC), suffix_length=256)
+
+ with self.assertRaises(OverflowError):
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=2))
+
+ def test_8bit_prefix_length_too_small(self):
+ map = ResultMap()
+ map.add("A", 'a'*251 + ".html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS))
+ map.add("A::foo()", 'a'*251 + ".html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC))
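+        # The two URLs share a common prefix of 'a'*251 + '.html', i.e. 256
+        # characters, which presumably no longer fits into an 8bit prefix
+        # length (max 255), hence the expected overflow with
+        # name_size_bytes=1 below.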
+
+ with self.assertRaises(OverflowError):
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=2))
+
+ def test_16bit_prefix_id_too_small(self):
+ map = ResultMap()
+
+        # Adding A0 to A65535 would be too slow due to the recursive Trie
+        # population during prefix merging (SIGH), so trying this instead.
+        # It's still hella slow, but at least not TWO MINUTES.
+ for i in range(128):
+ for j in range(128):
+ for k in range(4):
+ map.add(bytes([i, j, k]).decode('utf-8'), "a.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS))
+
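+        # The loop above added 128*128*4 = 65536 entries, so "B" below gets
+        # ID 65536, which no longer fits into a 16bit prefix ID.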
+ self.assertEqual(map.add("B", "b.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)), 65536)
+ map.add("B::foo()", "b.html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC))
+
+ with self.assertRaises(OverflowError):
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+        # Testing this error for a 24bit prefix ID seems infeasibly slow, so
+        # not doing that here
+
+ def test_16bit_alias_id_too_small(self):
+ map = ResultMap()
+
+        # The alias doesn't exist, of course; hopefully that's fine in this case
+ map.add("B", "", alias=65536)
+
+ with self.assertRaises(OverflowError):
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))
+
+ # This should work
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+ def test_24bit_alias_id_too_small(self):
+ map = ResultMap()
+
+        # The alias doesn't exist, of course; hopefully that's fine in this case
+ map.add("B", "", alias=16*1024*1024)
+
+ with self.assertRaises(OverflowError):
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))
+
+ # This should work
+ map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=4, name_size_bytes=1))
class Serialization(unittest.TestCase):
def __init__(self, *args, **kwargs):
trie.insert("math::range", index)
trie.insert("range", index)
- serialized = serialize_search_data(trie, map, search_type_map, 3)
- self.compare(serialized, """
+ for i in type_sizes:
+ with self.subTest(**i):
+ serialized = serialize_search_data(Serializer(**i), trie, map, search_type_map, 3)
+ self.compare(serialized, """
3 symbols
math [0]
| ::vector [1]
(EntryType.CLASS, CssClass.PRIMARY, 'class'),
(EntryType.FUNC, CssClass.INFO, 'func')
""")
- self.assertEqual(len(serialized), 277)
+ # Verify just the smallest and largest size, everything else
+ # should fit in between
+ if i['file_offset_bytes'] == 3 and i['result_id_bytes'] == 2 and i['name_size_bytes'] == 1:
+ self.assertEqual(len(serialized), 282)
+ elif i['file_offset_bytes'] == 4 and i['result_id_bytes'] == 4 and i['name_size_bytes'] == 2:
+ self.assertEqual(len(serialized), 317)
+ else:
+ self.assertGreater(len(serialized), 282)
+ self.assertLess(len(serialized), 317)
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
<footer><nav>
<div class="m-container">
<div class="m-row">
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
<footer><nav>
<div class="m-container">
<div class="m-row">
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
<footer><nav>
<div class="m-container">
<div class="m-row">
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
</body>
</html>
serialized = f.read()
search_data_pretty = pretty_print(serialized, entryTypeClass=EntryType)[0]
#print(search_data_pretty)
- self.assertEqual(len(serialized), 4836)
+ self.assertEqual(len(serialized), 4841)
self.assertEqual(search_data_pretty, """
53 symbols
deprecated_macro [0]
serialized = f.read()
search_data_pretty = pretty_print(serialized, entryTypeClass=EntryType)[0]
#print(search_data_pretty)
- self.assertEqual(len(serialized), 473)
+ self.assertEqual(len(serialized), 478)
# The parameters get cut off with an ellipsis
self.assertEqual(search_data_pretty, """
2 symbols
import os
-from _search import search_data_header_struct, searchdata_filename
+from _search import Serializer, searchdata_filename
from . import IntegrationTestCase
# TODO: reuse the search data deserialization API once done
with open(os.path.join(self.path, 'html', searchdata_filename.format(search_filename_prefix='searchdata')), 'rb') as f:
serialized = f.read()
- magic, version, symbol_count, map_offset, type_map_offset = search_data_header_struct.unpack_from(serialized)
+ magic, version, type_data, symbol_count, map_offset, type_map_offset = Serializer.header_struct.unpack_from(serialized)
self.assertEqual(symbol_count, 44)
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
<footer><nav>
<div class="m-container">
<div class="m-row">
</div>
</div>
</div>
-<script src="search-v1.js"></script>
+<script src="search-v2.js"></script>
<script>
- Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v1.bin');
+ Search.download(window.location.pathname.substr(0, window.location.pathname.lastIndexOf('/') + 1) + 'searchdata-v2.bin');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="search-v1.js"></script>
-<script src="searchdata-v1.js" async="async"></script>
+<script src="search-v2.js"></script>
+<script src="searchdata-v2.js" async="async"></script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
</div>
</div>
</div>
-<script src="t.search-v1.js#this-is-an-url"></script>
+<script src="t.search-v2.js#this-is-an-url"></script>
<script>
- Search.download('t.absolutesearchdata-v1.bin#this-is-an-url');
+ Search.download('t.absolutesearchdata-v2.bin#this-is-an-url');
</script>
</body>
</html>
serialized = f.read()
search_data_pretty = pretty_print(serialized, entryTypeClass=EntryType)[0]
#print(search_data_pretty)
- self.assertEqual(len(serialized), 2269)
+ self.assertEqual(len(serialized), 2274)
self.assertEqual(search_data_pretty, """
21 symbols
search [14]
serialized = f.read()
search_data_pretty = pretty_print(serialized, entryTypeClass=EntryType)[0]
#print(search_data_pretty)
- self.assertEqual(len(serialized), 633)
+ self.assertEqual(len(serialized), 638)
# The parameters get cut off with an ellipsis
self.assertEqual(search_data_pretty, """
3 symbols