From: Vladimír Vondruš Date: Thu, 6 Jan 2022 16:06:23 +0000 (+0100) Subject: documentation: refresh search binary data layout docs a bit. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=f1207cec97c78765ac6ff034b55464695f3f6f56;p=blog.git documentation: refresh search binary data layout docs a bit. --- diff --git a/documentation/_search.py b/documentation/_search.py index 9c0297aa..c771b006 100644 --- a/documentation/_search.py +++ b/documentation/_search.py @@ -87,41 +87,44 @@ class ResultFlag(enum.Flag): _TYPE14 = 14 << 4 _TYPE15 = 15 << 4 +# Result map encoding -- the "file size" is there so size of item N can be +# always retrieved as `offsets[N + 1] - offsets[N]` +# +# item 1 flags | item 2 flags | | item N flags | file | item 1 | +# + offset | + offset | … | + offset | size | data | … +# 8 + 24b | 8 + 24b | | 8 + 24b | 32b | | +# +# basic item (flags & 0b11 == 0b00): +# +# name | \0 | URL +# | | +# | 8b | +# +# suffixed item (flags & 0b11 == 0b01): +# +# suffix | name | \0 | URL +# length | | | +# 8b | | 8b | +# +# prefixed item (flags & 0b11 == 0b10): +# +# prefix | name | \0 | URL +# id + len | suffix | | suffix +# 16b + 8b | | 8b | +# +# prefixed & suffixed item (flags & 0b11 == 0b11): +# +# prefix | suffix | name | \0 | URL +# id + len | length | suffix | | +# 16b + 8b | 8b | | 8b | +# +# alias item (flags & 0xf0 == 0x00), flags & 0b11 then denotes what's in the +# `…` portion, aliases have no URL so the alias name is in place of it: +# +# alias | | alias +# id | … | name +# 16b | | class ResultMap: - # item 1 flags | item 2 flags | | item N flags | file | item 1 | - # + offset | + offset | ... | + offset | size | data | ... 
- # 8 + 24b | 8 + 24b | | 8 + 24b | 32b | | - # - # basic item (flags & 0b11 == 0b00): - # - # name | \0 | URL - # | | - # | 8b | - # - # suffixed item (flags & 0b11 == 0b01): - # - # suffix | name | \0 | URL - # length | | | - # 8b | | 8b | - # - # prefixed item (flags & 0xb11 == 0b10): - # - # prefix | name | \0 | URL - # id + len | suffix | | suffix - # 16b + 8b | | 8b | - # - # prefixed & suffixed item (flags & 0xb11 == 0b11): - # - # prefix | suffix | name | \0 | URL - # id + len | length | suffix | | - # 16b + 8b | 8b | | 8b | - # - # alias item (flags & 0xf0 == 0x00): - # - # alias | | alias - # id | ... | name - # 16b | | - # offset_struct = struct.Struct(' 127, it's instead: +# +# root | | header | results | child 1 | child 1 | child 1 | +# offset | … | | result # | child # | … | char | barrier | offset | … +# 32b | |1| 11b | 4b | n*16b | 8b | 1b | 23b | class Trie: - # root | | header | results | child 1 | child 1 | child 1 | - # offset | ... | | result # | child # | ... | char | barrier | offset | ... - # 32b | |0| 7b | 8b | n*16b | 8b | 1b | 23b | - # - # if result count > 127, it's instead: - # - # root | | header | results | child 1 | child 1 | child 1 | - # offset | ... | | result # | child # | ... | char | barrier | offset | ... 
- # 32b | |1| 11b | 4b | n*16b | 8b | 1b | 23b | - root_offset_struct = struct.Struct(' bytearray: @@ -418,10 +424,12 @@ def serialize_type_map(map: List[Tuple[CssClass, str]]) -> bytearray: return serialized + names -# magic | version | symbol | result | type | -# header | | count | map | map | -# | | | offset | offset | -# 24b | 8b | 16b | 32b | 32b | +# Whole file encoding: +# +# magic | version | symbol | result | type | trie | result | type +# header | | count | map | map | data | map | map +# | | | offset | offset | | data | data +# 24b | 8b | 16b | 32b | 32b | … | … | … search_data_header_struct = struct.Struct('<3sBHII') def serialize_search_data(trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> bytearray: