From: Vladimír Vondruš Date: Wed, 17 Jul 2019 16:44:32 +0000 (+0200) Subject: documentation: search data format version 2. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=3cd9235952ee5280dd373e94fda4f113055c3c95;p=blog.git documentation: search data format version 2. Instead of having a hardcoded mapping from flags to type names and associated classes on the JavaScript side, which was very C++-specific, the new format now moves this mapping directly into the search data. It's a new section at the end, for each type index containing a CSS class ID (which is still hardcoded, but that's not language specific anyway) and a UTF-8 name that's displayed in the label. For C++ this makes the search data file size 107 bytes larger, which isn't significant by any means. --- diff --git a/documentation/_search.py b/documentation/_search.py index 1e58b4a0..25c0be1b 100644 --- a/documentation/_search.py +++ b/documentation/_search.py @@ -26,15 +26,26 @@ # doxygen.py. But `from _search import bla` works. Ugh. import base64 +import enum import struct -from enum import Flag from types import SimpleNamespace as Empty +from typing import List, Tuple -searchdata_format_version = 0 +# Version 0 was without the type map +searchdata_format_version = 1 searchdata_filename = f'searchdata-v{searchdata_format_version}.bin' searchdata_filename_b85 = f'searchdata-v{searchdata_format_version}.js' -class ResultFlag(Flag): +class CssClass(enum.Enum): + DEFAULT = 0 + PRIMARY = 1 + SUCCESS = 2 + WARNING = 3 + DANGER = 4 + INFO = 5 + DIM = 6 + +class ResultFlag(enum.Flag): @staticmethod def from_type(flag: 'ResultFlag', type) -> 'ResultFlag': assert not flag & ResultFlag._TYPE @@ -360,13 +371,49 @@ class Trie: self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output, merge_subtrees=merge_subtrees)) return output -search_data_header_struct = struct.Struct('<3sBHI') +# type 1 | type 2 | | | | type 1 | +# class | name | class | name | ... 
| padding | end | name | ... +# ID | offset | ID | offset | | | offset | data | +# 8b | 8b | 8b | 8b | | 8b | 8b | | +type_map_entry_struct = struct.Struct('<BB') + +def serialize_type_map(map: List[Tuple[CssClass, str]]) -> bytearray: + serialized = bytearray() + names = bytearray() + + # There's just 16 bits for the type and we're using one for aliases, so + # that makes at most 15 values left. See ResultFlag for details. + assert len(map) <= 15 + + # Initial name offset is after all the offset entries plus the final one + initial_name_offset = (len(map) + 1)*type_map_entry_struct.size + + # Add all entries (and the final offset), encode the names separately, + # concatenate at the end + for css_class, name in map: + serialized += type_map_entry_struct.pack(css_class.value, initial_name_offset + len(names)) + names += name.encode('utf-8') + serialized += type_map_entry_struct.pack(0, initial_name_offset + len(names)) + assert len(serialized) == initial_name_offset + + return serialized + names -def serialize_search_data(trie: Trie, map: ResultMap, symbol_count, merge_subtrees=True, merge_prefixes=True) -> bytearray: +# magic | version | symbol | result | type | +# header | | count | map | map | +# | | | offset | offset | +# 24b | 8b | 16b | 32b | 32b | +search_data_header_struct = struct.Struct('<3sBHII') + +def serialize_search_data(trie: Trie, map: ResultMap, type_map: List[Tuple[CssClass, str]], symbol_count, *, merge_subtrees=True, merge_prefixes=True) -> bytearray: serialized_trie = trie.serialize(merge_subtrees=merge_subtrees) serialized_map = map.serialize(merge_prefixes=merge_prefixes) - # magic header, version, symbol count, offset of result map - return search_data_header_struct.pack(b'MCS', searchdata_format_version, symbol_count, len(serialized_trie) + 10) + serialized_trie + serialized_map + serialized_type_map = serialize_type_map(type_map) + + preamble = search_data_header_struct.pack(b'MCS', + searchdata_format_version, symbol_count, + search_data_header_struct.size + len(serialized_trie), + 
search_data_header_struct.size + len(serialized_trie) + len(serialized_map)) + return preamble + serialized_trie + serialized_map + serialized_type_map def base85encode_search_data(data: bytearray) -> bytearray: return (b"/* Generated by https://mcss.mosra.cz/documentation/doxygen/. Do not edit. */\n" + @@ -491,11 +538,25 @@ def pretty_print_map(serialized: bytes, *, entryTypeClass, colors=False): offset = next_offset return out +def pretty_print_type_map(serialized: bytes, *, entryTypeClass): + # Unpack until we aren't at EOF + i = 0 + out = '' + class_id, offset = type_map_entry_struct.unpack_from(serialized, 0) + while offset < len(serialized): + if i: out += ',\n' + next_class_id, next_offset = type_map_entry_struct.unpack_from(serialized, (i + 1)*type_map_entry_struct.size) + out += "({}, {}, '{}')".format(entryTypeClass(i + 1), CssClass(class_id), serialized[offset:next_offset].decode('utf-8')) + i += 1 + class_id, offset = next_class_id, next_offset + return out + def pretty_print(serialized: bytes, *, entryTypeClass, show_merged=False, show_lookahead_barriers=True, colors=False): - magic, version, symbol_count, map_offset = search_data_header_struct.unpack_from(serialized) + magic, version, symbol_count, map_offset, type_map_offset = search_data_header_struct.unpack_from(serialized) assert magic == b'MCS' assert version == searchdata_format_version pretty_trie, stats = pretty_print_trie(serialized[search_data_header_struct.size:map_offset], show_merged=show_merged, show_lookahead_barriers=show_lookahead_barriers, colors=colors) - pretty_map = pretty_print_map(serialized[map_offset:], entryTypeClass=entryTypeClass, colors=colors) - return '{} symbols\n'.format(symbol_count) + pretty_trie + '\n' + pretty_map, stats + pretty_map = pretty_print_map(serialized[map_offset:type_map_offset], entryTypeClass=entryTypeClass, colors=colors) + pretty_type_map = pretty_print_type_map(serialized[type_map_offset:], entryTypeClass=entryTypeClass) + return '{} 
symbols\n'.format(symbol_count) + pretty_trie + '\n' + pretty_map + '\n' + pretty_type_map, stats diff --git a/documentation/doxygen.py b/documentation/doxygen.py index e2c73dc3..4e4ac2a0 100755 --- a/documentation/doxygen.py +++ b/documentation/doxygen.py @@ -47,7 +47,7 @@ from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import TextLexer, BashSessionLexer, get_lexer_by_name, find_lexer_class_for_filename -from _search import ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, searchdata_filename, searchdata_filename_b85, searchdata_format_version +from _search import CssClass, ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data, searchdata_filename, searchdata_filename_b85, searchdata_format_version sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins')) import dot2svg @@ -56,6 +56,8 @@ import latex2svgextra import ansilexer class EntryType(enum.Enum): + # Order must match the search_type_map below; first value is reserved for + # ResultFlag.ALIAS PAGE = 1 NAMESPACE = 2 GROUP = 3 @@ -71,6 +73,24 @@ class EntryType(enum.Enum): ENUM_VALUE = 13 VAR = 14 +# Order must match the EntryType above +search_type_map = [ + (CssClass.SUCCESS, "page"), + (CssClass.PRIMARY, "namespace"), + (CssClass.SUCCESS, "group"), + (CssClass.PRIMARY, "class"), + (CssClass.PRIMARY, "struct"), + (CssClass.PRIMARY, "union"), + (CssClass.PRIMARY, "typedef"), + (CssClass.WARNING, "dir"), + (CssClass.WARNING, "file"), + (CssClass.INFO, "func"), + (CssClass.INFO, "define"), + (CssClass.PRIMARY, "enum"), + (CssClass.DEFAULT, "enum val"), + (CssClass.DEFAULT, "var") +] + xref_id_rx = re.compile(r"""(.*)_1(_[a-z-]+[0-9]+|@)$""") slugify_nonalnum_rx = re.compile(r"""[^\w\s-]""") slugify_hyphens_rx = re.compile(r"""[-\s]+""") @@ -2354,7 +2374,7 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers= # order by default trie.sort(map) - return 
serialize_search_data(trie, map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) + return serialize_search_data(trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) def parse_xml(state: State, xml: str): # Reset counter for unique math formulas diff --git a/documentation/search.js b/documentation/search.js index 43b336c3..cbd6ed03 100644 --- a/documentation/search.js +++ b/documentation/search.js @@ -25,10 +25,11 @@ "use strict"; /* it summons the Cthulhu in a proper way, they say */ var Search = { - formatVersion: 0, /* the data filename contains this number too */ + formatVersion: 1, /* the data filename contains this number too */ trie: null, map: null, + typeMap: null, dataSize: 0, symbolCount: 0, maxResults: 0, @@ -54,8 +55,9 @@ var Search = { init: function(buffer, maxResults) { let view = new DataView(buffer); - /* The file is too short to contain at least the headers */ - if(view.byteLength < 20) { + /* The file is too short to contain at least the headers and empty + sections */ + if(view.byteLength < 26) { console.error("Search data too short"); return false; } @@ -74,8 +76,10 @@ var Search = { /* Separate the data into the trie and the result map */ let mapOffset = view.getUint32(6, true); - this.trie = new DataView(buffer, 10, mapOffset - 10); - this.map = new DataView(buffer, mapOffset); + let typeMapOffset = view.getUint32(10, true); + this.trie = new DataView(buffer, 14, mapOffset - 14); + this.map = new DataView(buffer, mapOffset, typeMapOffset - mapOffset); + this.typeMap = new DataView(buffer, typeMapOffset); /* Set initial properties */ this.dataSize = buffer.byteLength; @@ -396,6 +400,8 @@ var Search = { alias: alias.name, url: alias.url, flags: alias.flags, + cssClass: alias.cssClass, + typeName: alias.typeName, suffixLength: suffixLength + resultSuffixLength}; } @@ -405,10 +411,40 @@ var Search = { url += String.fromCharCode(this.map.getUint8(j)); } - /* Keeping in 
UTF-8, as we need that for proper slicing (and concatenating) */ + /* This is an alias, return what we have, without parsed CSS class and + type name as those are retrieved from the final target type */ + if(!(flags >> 4)) + return {name: name, + url: url, + flags: flags & 0x0f, + suffixLength: suffixLength + resultSuffixLength}; + + /* Otherwise, get CSS class and type name for the result label */ + let typeMapIndex = (flags >> 4) - 1; + let cssClass = [ + /* Keep in sync with _search.py */ + 'm-default', + 'm-primary', + 'm-success', + 'm-warning', + 'm-danger', + 'm-info', + 'm-dim' + ][this.typeMap.getUint8(typeMapIndex*2)]; + let typeNameOffset = this.typeMap.getUint8(typeMapIndex*2 + 1); + let nextTypeNameOffset = this.typeMap.getUint8((typeMapIndex + 1)*2 + 1); + let typeName = ''; + for(let j = typeNameOffset; j != nextTypeNameOffset; ++j) + typeName += String.fromCharCode(this.typeMap.getUint8(j)); + + /* Keeping in UTF-8, as we need that for proper slicing (and + concatenating). Strip the type from the flags, as it's now expressed + directly. 
*/ return {name: name, url: url, - flags: flags, + flags: flags & 0x0f, + cssClass: cssClass, + typeName: typeName, suffixLength: suffixLength + resultSuffixLength}; }, @@ -451,70 +487,8 @@ var Search = { let list = ''; for(let i = 0; i != results.length; ++i) { - let type = ''; - let color = ''; - switch(results[i].flags >> 4) { - /* Keep in sync with doxygen.py */ - case 1: - type = 'page'; - color = 'm-success'; - break; - case 2: - type = 'namespace'; - color = 'm-primary'; - break; - case 3: - type = 'group'; - color = 'm-success'; - break; - case 4: - type = 'class'; - color = 'm-primary'; - break; - case 5: - type = 'struct'; - color = 'm-primary'; - break; - case 6: - type = 'union'; - color = 'm-primary'; - break; - case 7: - type = 'typedef'; - color = 'm-primary'; - break; - case 8: - type = 'dir'; - color = 'm-warning'; - break; - case 9: - type = 'file'; - color = 'm-warning'; - break; - case 10: - type = 'func'; - color = 'm-info'; - break; - case 11: - type = 'define'; - color = 'm-info'; - break; - case 12: - type = 'enum'; - color = 'm-primary'; - break; - case 13: - type = 'enum val'; - color = 'm-default'; - break; - case 14: - type = 'var'; - color = 'm-default'; - break; - } - /* Labels + */ - list += '
' + type + '
' + (results[i].flags & 2 ? '
deprecated
' : '') + (results[i].flags & 4 ? '
deleted
' : ''); + list += '
' + results[i].typeName + '
' + (results[i].flags & 2 ? '
deprecated
' : '') + (results[i].flags & 4 ? '
deleted
' : ''); /* Render the alias (cut off from the right) */ if(results[i].alias) { diff --git a/documentation/test_doxygen/js-test-data/empty.bin b/documentation/test_doxygen/js-test-data/empty.bin index 67dc8acd..36e30edc 100644 Binary files a/documentation/test_doxygen/js-test-data/empty.bin and b/documentation/test_doxygen/js-test-data/empty.bin differ diff --git a/documentation/test_doxygen/js-test-data/nested.bin b/documentation/test_doxygen/js-test-data/nested.bin index 8cc87696..39d332c9 100644 Binary files a/documentation/test_doxygen/js-test-data/nested.bin and b/documentation/test_doxygen/js-test-data/nested.bin differ diff --git a/documentation/test_doxygen/js-test-data/searchdata.b85 b/documentation/test_doxygen/js-test-data/searchdata.b85 index a8ca3584..3239298a 100644 --- a/documentation/test_doxygen/js-test-data/searchdata.b85 +++ b/documentation/test_doxygen/js-test-data/searchdata.b85 @@ -1 +1 @@ -O+!-v2LONp003kG000310RR921ONaj009U904M+f4gdgd009&L0BHdL0{{R4AOHX<00ATb04M+fDgXd(00A%n0BHaLHUI!^00BGz06GBy0suk)fI0vHNB{tG00B?{0B-;RRsaBW00CS80Am0FVgLYT0RRO600C|Q04V?gasU7*00DRa0B!&QegFVz00D#m0BryPiU0sQ0RaR6kN|)>00EW&0A&CHo&W%600E=`0B!&QssI3C00SBT0BvXh0Cund0CE5Uwg3P+0RaF2!~lRg00GJX0B8UK(f|N-0{{U40{{g800G_r04V?g<^TXF00Ha(0B!&R*Z=@w@&Ev70RRU8009C40A&CH1_1zU009gE0A~OJ5&-~i0RadA7y$rb00ABW0CWHWCIJ9r00OE20AVZv0A&FH1^@s7JOKb@00BS&0A~OJMgag}00B$^0B`^SQUL&B00CG50CfNa_y7QHXaE3qG64W`UI74eC;$K;KL7x!R{#J?djJ5bkpKWlvj70C$p8QlaA9X<0CRO>aA9X00EW&0A&CHo&W%600E=`0B!&QssI3C00SBT0BvXh0Cund0CE5Uwg3P+0RaF2!~lRg00GJX0B8UK(f|N-0{{U40{{g800G_r04V?g<^TXF00Ha(0B!&R*Z=@w@&Ev70RRU8009C40A&CH1_1zU009gE0A~OJ5&-~i0RadA7y$rb00ABW0CWHWCIJ9r00OE20AVZv0A&FH1^@s7JOKb@00BS&0A~OJMgag}00B$^0B`^SQUL&B00CG50CfNa_y7QHXaE3qG64W`UI74eC;$K;KL7x!R{#J?djJ5bkpKWlvj70C$p8QlaA9X<0CRO>aA9X - +