From 7e7480461f2fcb9b0c9a28dda7c56eeba9e66bea Mon Sep 17 00:00:00 2001
From: =?utf8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Sun, 14 Jul 2019 23:57:19 +0200
Subject: [PATCH] documentation: move search-related stuff into a dedicated
 module.

So it can be used by the Python doc generator as well. Wanted to put it into
__init__.py, but Guido disagreed with that, so I can't.
---
 documentation/_search.py                      | 353 ++++++++++++++++++
 documentation/doxygen.py                      | 327 +----------------
 .../test_doxygen/populate-js-test-data.py     |   2 +-
 documentation/test_doxygen/test_search.py     |   2 +-
 .../test_doxygen/test_undocumented.py         |   2 +-
 5 files changed, 358 insertions(+), 328 deletions(-)
 create mode 100644 documentation/_search.py

diff --git a/documentation/_search.py b/documentation/_search.py
new file mode 100644
index 00000000..598e995b
--- /dev/null
+++ b/documentation/_search.py
@@ -0,0 +1,353 @@
+#
+#   This file is part of m.css.
+#
+#   Copyright © 2017, 2018, 2019 Vladimír Vondruš <mosra@centrum.cz>
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a
+#   copy of this software and associated documentation files (the "Software"),
+#   to deal in the Software without restriction, including without limitation
+#   the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#   and/or sell copies of the Software, and to permit persons to whom the
+#   Software is furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included
+#   in all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#   THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#   DEALINGS IN THE SOFTWARE.
+#
+
+# Can't be in __init__.py because I can't say `from . import Trie` in
+# doxygen.py. But `from _search import bla` works. Ugh.
+
+import base64
+import struct
+from enum import Flag
+from types import SimpleNamespace as Empty
+
+class ResultFlag(Flag):
+    HAS_SUFFIX = 1 << 0
+    HAS_PREFIX = 1 << 3
+    DEPRECATED = 1 << 1
+    DELETED = 1 << 2
+
+    # Result type. Order defines order in which equally-named symbols appear in
+    # search results. Keep in sync with search.js.
+    _TYPE = 0xf << 4
+    ALIAS = 0 << 4 # This one gets the type from the referenced result
+    PAGE = 1 << 4
+    NAMESPACE = 2 << 4
+    GROUP = 3 << 4
+    CLASS = 4 << 4
+    STRUCT = 5 << 4
+    UNION = 6 << 4
+    TYPEDEF = 7 << 4
+    DIR = 8 << 4
+    FILE = 9 << 4
+    FUNC = 10 << 4
+    DEFINE = 11 << 4
+    ENUM = 12 << 4
+    ENUM_VALUE = 13 << 4
+    VAR = 14 << 4
+
+class ResultMap:
+    # item 1 flags | item 2 flags |     | item N flags | file | item 1 |
+    #   + offset   |   + offset   | ... |   + offset   | size |  data  | ...
+    #    8 + 24b   |    8 + 24b   |     |    8 + 24b   | 32b  |        |
+    #
+    # basic item (flags & 0b11 == 0b00):
+    #
+    # name | \0 | URL
+    #      |    |
+    #      | 8b |
+    #
+    # suffixed item (flags & 0b11 == 0b01):
+    #
+    # suffix | name | \0 | URL
+    # length |      |    |
+    #   8b   |      | 8b |
+    #
+    # prefixed item (flags & 0xb11 == 0b10):
+    #
+    #  prefix  |  name  | \0 |  URL
+    # id + len | suffix |    | suffix
+    # 16b + 8b |        | 8b |
+    #
+    # prefixed & suffixed item (flags & 0xb11 == 0b11):
+    #
+    #  prefix  | suffix |  name  | \0 | URL
+    # id + len | length | suffix |    |
+    # 16b + 8b |   8b   |        | 8b |
+    #
+    # alias item (flags & 0xf0 == 0x00):
+    #
+    # alias |     | alias
+    #  id   | ... | name
+    #  16b  |     |
+    #
+    offset_struct = struct.Struct('<I')
+    flags_struct = struct.Struct('<B')
+    prefix_struct = struct.Struct('<HB')
+    suffix_length_struct = struct.Struct('<B')
+    alias_struct = struct.Struct('<H')
+
+    def __init__(self): self.entries = []
+
+    def add(self, name, url, alias=None, suffix_length=0, flags=ResultFlag(0)) -> int:
+        if suffix_length: flags |= ResultFlag.HAS_SUFFIX
+        if alias is not None:
+            assert flags & ResultFlag._TYPE == ResultFlag.ALIAS
+
+        entry = Empty()
+        entry.name = name
+        entry.url = url
+        entry.flags = flags
+        entry.alias = alias
+        entry.prefix = 0
+        entry.prefix_length = 0
+        entry.suffix_length = suffix_length
+
+        self.entries += [entry]
+        return len(self.entries) - 1
+
+    def serialize(self, merge_prefixes=True) -> bytearray:
+        output = bytearray()
+
+        if merge_prefixes:
+            # Put all entry names into a trie to discover common prefixes
+            trie = Trie()
+            for index, e in enumerate(self.entries):
+                trie.insert(e.name, index)
+
+            # Create a new list with merged prefixes
+            merged = []
+            for index, e in enumerate(self.entries):
+                # Search in the trie and get the longest shared name prefix
+                # that is already fully contained in some other entry
+                current = trie
+                longest_prefix = None
+                for c in e.name.encode('utf-8'):
+                    for candidate, child in current.children.items():
+                        if c == candidate:
+                            current = child[1]
+                            break
+                    else: assert False # pragma: no cover
+
+                    # Allow self-reference only when referenced result suffix
+                    # is longer (otherwise cycles happen). This is for
+                    # functions that should appear when searching for foo (so
+                    # they get ordered properly based on the name length) and
+                    # also when searching for foo() (so everything that's not
+                    # a function gets filtered out). Such entries are
+                    # completely the same except for a different suffix length.
+                    if index in current.results:
+                        for i in current.results:
+                            if self.entries[i].suffix_length > self.entries[index].suffix_length:
+                                longest_prefix = current
+                                break
+                    elif current.results:
+                        longest_prefix = current
+
+                # Name prefix found, for all possible URLs find the one that
+                # shares the longest prefix
+                if longest_prefix:
+                    max_prefix = (0, -1)
+                    for longest_index in longest_prefix.results:
+                        # Ignore self (function self-reference, see above)
+                        if longest_index == index: continue
+
+                        prefix_length = 0
+                        for i in range(min(len(e.url), len(self.entries[longest_index].url))):
+                            if e.url[i] != self.entries[longest_index].url[i]: break
+                            prefix_length += 1
+                        if max_prefix[1] < prefix_length:
+                            max_prefix = (longest_index, prefix_length)
+
+                    # Expect we found something
+                    assert max_prefix[1] != -1
+
+                    # Save the entry with reference to the prefix
+                    entry = Empty()
+                    assert e.name.startswith(self.entries[longest_prefix.results[0]].name)
+                    entry.name = e.name[len(self.entries[longest_prefix.results[0]].name):]
+                    entry.url = e.url[max_prefix[1]:]
+                    entry.flags = e.flags|ResultFlag.HAS_PREFIX
+                    entry.alias = e.alias
+                    entry.prefix = max_prefix[0]
+                    entry.prefix_length = max_prefix[1]
+                    entry.suffix_length = e.suffix_length
+                    merged += [entry]
+
+                # No prefix found, copy the entry verbatim
+                else: merged += [e]
+
+            # Everything merged, replace the original list
+            self.entries = merged
+
+        # Write the offset array. Starting offset for items is after the offset
+        # array and the file size
+        offset = (len(self.entries) + 1)*4
+        for e in self.entries:
+            assert offset < 2**24
+            output += self.offset_struct.pack(offset)
+            self.flags_struct.pack_into(output, len(output) - 1, e.flags.value)
+
+            # The entry is an alias, extra field for alias index
+            if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
+                offset += 2
+
+            # Extra field for prefix index and length
+            if e.flags & ResultFlag.HAS_PREFIX:
+                offset += 3
+
+            # Extra field for suffix length
+            if e.flags & ResultFlag.HAS_SUFFIX:
+                offset += 1
+
+            # Length of the name
+            offset += len(e.name.encode('utf-8'))
+
+            # Length of the URL and 0-delimiter. If URL is empty, it's not
+            # added at all, then the 0-delimiter is also not needed.
+            if e.name and e.url:
+                offset += len(e.url.encode('utf-8')) + 1
+
+        # Write file size
+        output += self.offset_struct.pack(offset)
+
+        # Write the entries themselves
+        for e in self.entries:
+            if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
+                assert not e.alias is None
+                assert not e.url
+                output += self.alias_struct.pack(e.alias)
+            if e.flags & ResultFlag.HAS_PREFIX:
+                output += self.prefix_struct.pack(e.prefix, e.prefix_length)
+            if e.flags & ResultFlag.HAS_SUFFIX:
+                output += self.suffix_length_struct.pack(e.suffix_length)
+            output += e.name.encode('utf-8')
+            if e.url:
+                output += b'\0'
+                output += e.url.encode('utf-8')
+
+        assert len(output) == offset
+        return output
+
+class Trie:
+    #  root  |     |       header        | results | child 1 | child 1 | child 1 |
+    # offset | ... | result # | value #  |   ...   |  char   | barrier | offset  | ...
+    #  32b   |     |    8b    |    8b    |  n*16b  |   8b    |    1b   |   23b   |
+    root_offset_struct = struct.Struct('<I')
+    header_struct = struct.Struct('<BB')
+    result_struct = struct.Struct('<H')
+    child_struct = struct.Struct('<I')
+    child_char_struct = struct.Struct('<B')
+
+    def __init__(self):
+        self.results = []
+        self.children = {}
+
+    def _insert(self, path: bytes, result, lookahead_barriers):
+        if not path:
+            self.results += [result]
+            return
+
+        char = path[0]
+        if not char in self.children:
+            self.children[char] = (False, Trie())
+        if lookahead_barriers and lookahead_barriers[0] == 0:
+            lookahead_barriers = lookahead_barriers[1:]
+            self.children[char] = (True, self.children[char][1])
+        self.children[char][1]._insert(path[1:], result, [b - 1 for b in lookahead_barriers])
+
+    def insert(self, path: str, result, lookahead_barriers=[]):
+        self._insert(path.encode('utf-8'), result, lookahead_barriers)
+
+    def _serialize(self, hashtable, output: bytearray, merge_subtrees=True) -> int:
+        # Serialize all children first
+        child_offsets = []
+        for char, child in self.children.items():
+            offset = child[1]._serialize(hashtable, output, merge_subtrees=merge_subtrees)
+            child_offsets += [(char, child[0], offset)]
+
+        # Serialize this node
+        serialized = bytearray()
+        serialized += self.header_struct.pack(len(self.results), len(self.children))
+        for v in self.results:
+            serialized += self.result_struct.pack(v)
+
+        # Serialize child offsets
+        for char, lookahead_barrier, abs_offset in child_offsets:
+            assert abs_offset < 2**23
+
+            # write them over each other because that's the only way to pack
+            # a 24 bit field
+            offset = len(serialized)
+            serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
+            self.child_char_struct.pack_into(serialized, offset + 3, char)
+
+        # Subtree merging: if this exact tree is already in the table, return
+        # its offset. Otherwise add it and return the new offset.
+        # TODO: why hashable = bytes(output[base_offset:] + serialized) didn't work?
+        hashable = bytes(serialized)
+        if merge_subtrees and hashable in hashtable:
+            return hashtable[hashable]
+        else:
+            offset = len(output)
+            output += serialized
+            if merge_subtrees: hashtable[hashable] = offset
+            return offset
+
+    def serialize(self, merge_subtrees=True) -> bytearray:
+        output = bytearray(b'\x00\x00\x00\x00')
+        hashtable = {}
+        self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output, merge_subtrees=merge_subtrees))
+        return output
+
+search_data_header_struct = struct.Struct('<3sBHI')
+
+def serialize_search_data(trie: Trie, map: ResultMap, symbol_count, merge_subtrees=True, merge_prefixes=True) -> bytearray:
+    serialized_trie = trie.serialize(merge_subtrees=merge_subtrees)
+    serialized_map = map.serialize(merge_prefixes=merge_prefixes)
+    # magic header, version, symbol count, offset of result map
+    return search_data_header_struct.pack(b'MCS', 0, symbol_count, len(serialized_trie) + 10) + serialized_trie + serialized_map
+
+def base85encode_search_data(data: bytearray) -> bytearray:
+    return (b"/* Generated by https://mcss.mosra.cz/documentation/doxygen/. Do not edit. */\n" +
+            b"Search.load('" + base64.b85encode(data, True) + b"');\n")
diff --git a/documentation/doxygen.py b/documentation/doxygen.py
index 82b78735..014b6f58 100755
--- a/documentation/doxygen.py
+++ b/documentation/doxygen.py
@@ -26,7 +26,6 @@
 
 import xml.etree.ElementTree as ET
 import argparse
-import base64
 import copy
 import sys
 import re
@@ -35,11 +34,9 @@ import os
 import glob
 import mimetypes
 import shutil
-import struct
 import subprocess
 import urllib.parse
 import logging
-from enum import Flag
 from types import SimpleNamespace as Empty
 from typing import Tuple, Dict, Any, List
 
@@ -49,330 +46,14 @@ from pygments import highlight
 from pygments.formatters import HtmlFormatter
 from pygments.lexers import TextLexer, BashSessionLexer, get_lexer_by_name, find_lexer_class_for_filename
 
+from _search import ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data
+
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../plugins'))
 import dot2svg
 import latex2svg
 import latex2svgextra
 import ansilexer
 
-class ResultFlag(Flag):
-    HAS_SUFFIX = 1 << 0
-    HAS_PREFIX = 1 << 3
-    DEPRECATED = 1 << 1
-    DELETED = 1 << 2
-
-    # Result type. Order defines order in which equally-named symbols appear in
-    # search results. Keep in sync with search.js.
-    _TYPE = 0xf << 4
-    ALIAS = 0 << 4 # This one gets the type from the referenced result
-    PAGE = 1 << 4
-    NAMESPACE = 2 << 4
-    GROUP = 3 << 4
-    CLASS = 4 << 4
-    STRUCT = 5 << 4
-    UNION = 6 << 4
-    TYPEDEF = 7 << 4
-    DIR = 8 << 4
-    FILE = 9 << 4
-    FUNC = 10 << 4
-    DEFINE = 11 << 4
-    ENUM = 12 << 4
-    ENUM_VALUE = 13 << 4
-    VAR = 14 << 4
-
-class ResultMap:
-    # item 1 flags | item 2 flags |     | item N flags | file | item 1 |
-    #   + offset   |   + offset   | ... |   + offset   | size |  data  | ...
-    #    8 + 24b   |    8 + 24b   |     |    8 + 24b   | 32b  |        |
-    #
-    # basic item (flags & 0b11 == 0b00):
-    #
-    # name | \0 | URL
-    #      |    |
-    #      | 8b |
-    #
-    # suffixed item (flags & 0b11 == 0b01):
-    #
-    # suffix | name | \0 | URL
-    # length |      |    |
-    #   8b   |      | 8b |
-    #
-    # prefixed item (flags & 0xb11 == 0b10):
-    #
-    #  prefix  |  name  | \0 |  URL
-    # id + len | suffix |    | suffix
-    # 16b + 8b |        | 8b |
-    #
-    # prefixed & suffixed item (flags & 0xb11 == 0b11):
-    #
-    #  prefix  | suffix |  name  | \0 | URL
-    # id + len | length | suffix |    |
-    # 16b + 8b |   8b   |        | 8b |
-    #
-    # alias item (flags & 0xf0 == 0x00):
-    #
-    # alias |     | alias
-    #  id   | ... | name
-    #  16b  |     |
-    #
-    offset_struct = struct.Struct('<I')
-    flags_struct = struct.Struct('<B')
-    prefix_struct = struct.Struct('<HB')
-    suffix_length_struct = struct.Struct('<B')
-    alias_struct = struct.Struct('<H')
-
-    def __init__(self): self.entries = []
-
-    def add(self, name, url, alias=None, suffix_length=0, flags=ResultFlag(0)) -> int:
-        if suffix_length: flags |= ResultFlag.HAS_SUFFIX
-        if alias is not None:
-            assert flags & ResultFlag._TYPE == ResultFlag.ALIAS
-
-        entry = Empty()
-        entry.name = name
-        entry.url = url
-        entry.flags = flags
-        entry.alias = alias
-        entry.prefix = 0
-        entry.prefix_length = 0
-        entry.suffix_length = suffix_length
-
-        self.entries += [entry]
-        return len(self.entries) - 1
-
-    def serialize(self, merge_prefixes=True) -> bytearray:
-        output = bytearray()
-
-        if merge_prefixes:
-            # Put all entry names into a trie to discover common prefixes
-            trie = Trie()
-            for index, e in enumerate(self.entries):
-                trie.insert(e.name, index)
-
-            # Create a new list with merged prefixes
-            merged = []
-            for index, e in enumerate(self.entries):
-                # Search in the trie and get the longest shared name prefix
-                # that is already fully contained in some other entry
-                current = trie
-                longest_prefix = None
-                for c in e.name.encode('utf-8'):
-                    for candidate, child in current.children.items():
-                        if c == candidate:
-                            current = child[1]
-                            break
-                    else: assert False # pragma: no cover
-
-                    # Allow self-reference only when referenced result suffix
-                    # is longer (otherwise cycles happen). This is for
-                    # functions that should appear when searching for foo (so
-                    # they get ordered properly based on the name length) and
-                    # also when searching for foo() (so everything that's not
-                    # a function gets filtered out). Such entries are
-                    # completely the same except for a different suffix length.
-                    if index in current.results:
-                        for i in current.results:
-                            if self.entries[i].suffix_length > self.entries[index].suffix_length:
-                                longest_prefix = current
-                                break
-                    elif current.results:
-                        longest_prefix = current
-
-                # Name prefix found, for all possible URLs find the one that
-                # shares the longest prefix
-                if longest_prefix:
-                    max_prefix = (0, -1)
-                    for longest_index in longest_prefix.results:
-                        # Ignore self (function self-reference, see above)
-                        if longest_index == index: continue
-
-                        prefix_length = 0
-                        for i in range(min(len(e.url), len(self.entries[longest_index].url))):
-                            if e.url[i] != self.entries[longest_index].url[i]: break
-                            prefix_length += 1
-                        if max_prefix[1] < prefix_length:
-                            max_prefix = (longest_index, prefix_length)
-
-                    # Expect we found something
-                    assert max_prefix[1] != -1
-
-                    # Save the entry with reference to the prefix
-                    entry = Empty()
-                    assert e.name.startswith(self.entries[longest_prefix.results[0]].name)
-                    entry.name = e.name[len(self.entries[longest_prefix.results[0]].name):]
-                    entry.url = e.url[max_prefix[1]:]
-                    entry.flags = e.flags|ResultFlag.HAS_PREFIX
-                    entry.alias = e.alias
-                    entry.prefix = max_prefix[0]
-                    entry.prefix_length = max_prefix[1]
-                    entry.suffix_length = e.suffix_length
-                    merged += [entry]
-
-                # No prefix found, copy the entry verbatim
-                else: merged += [e]
-
-            # Everything merged, replace the original list
-            self.entries = merged
-
-        # Write the offset array. Starting offset for items is after the offset
-        # array and the file size
-        offset = (len(self.entries) + 1)*4
-        for e in self.entries:
-            assert offset < 2**24
-            output += self.offset_struct.pack(offset)
-            self.flags_struct.pack_into(output, len(output) - 1, e.flags.value)
-
-            # The entry is an alias, extra field for alias index
-            if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
-                offset += 2
-
-            # Extra field for prefix index and length
-            if e.flags & ResultFlag.HAS_PREFIX:
-                offset += 3
-
-            # Extra field for suffix length
-            if e.flags & ResultFlag.HAS_SUFFIX:
-                offset += 1
-
-            # Length of the name
-            offset += len(e.name.encode('utf-8'))
-
-            # Length of the URL and 0-delimiter. If URL is empty, it's not
-            # added at all, then the 0-delimiter is also not needed.
-            if e.name and e.url:
-                offset += len(e.url.encode('utf-8')) + 1
-
-        # Write file size
-        output += self.offset_struct.pack(offset)
-
-        # Write the entries themselves
-        for e in self.entries:
-            if e.flags & ResultFlag._TYPE == ResultFlag.ALIAS:
-                assert not e.alias is None
-                assert not e.url
-                output += self.alias_struct.pack(e.alias)
-            if e.flags & ResultFlag.HAS_PREFIX:
-                output += self.prefix_struct.pack(e.prefix, e.prefix_length)
-            if e.flags & ResultFlag.HAS_SUFFIX:
-                output += self.suffix_length_struct.pack(e.suffix_length)
-            output += e.name.encode('utf-8')
-            if e.url:
-                output += b'\0'
-                output += e.url.encode('utf-8')
-
-        assert len(output) == offset
-        return output
-
-class Trie:
-    #  root  |     |       header        | results | child 1 | child 1 | child 1 |
-    # offset | ... | result # | value #  |   ...   |  char   | barrier | offset  | ...
-    #  32b   |     |    8b    |    8b    |  n*16b  |   8b    |    1b   |   23b   |
-    root_offset_struct = struct.Struct('<I')
-    header_struct = struct.Struct('<BB')
-    result_struct = struct.Struct('<H')
-    child_struct = struct.Struct('<I')
-    child_char_struct = struct.Struct('<B')
-
-    def __init__(self):
-        self.results = []
-        self.children = {}
-
-    def _insert(self, path: bytes, result, lookahead_barriers):
-        if not path:
-            self.results += [result]
-            return
-
-        char = path[0]
-        if not char in self.children:
-            self.children[char] = (False, Trie())
-        if lookahead_barriers and lookahead_barriers[0] == 0:
-            lookahead_barriers = lookahead_barriers[1:]
-            self.children[char] = (True, self.children[char][1])
-        self.children[char][1]._insert(path[1:], result, [b - 1 for b in lookahead_barriers])
-
-    def insert(self, path: str, result, lookahead_barriers=[]):
-        self._insert(path.encode('utf-8'), result, lookahead_barriers)
-
-    def _serialize(self, hashtable, output: bytearray, merge_subtrees=True) -> int:
-        # Serialize all children first
-        child_offsets = []
-        for char, child in self.children.items():
-            offset = child[1]._serialize(hashtable, output, merge_subtrees=merge_subtrees)
-            child_offsets += [(char, child[0], offset)]
-
-        # Serialize this node
-        serialized = bytearray()
-        serialized += self.header_struct.pack(len(self.results), len(self.children))
-        for v in self.results:
-            serialized += self.result_struct.pack(v)
-
-        # Serialize child offsets
-        for char, lookahead_barrier, abs_offset in child_offsets:
-            assert abs_offset < 2**23
-
-            # write them over each other because that's the only way to pack
-            # a 24 bit field
-            offset = len(serialized)
-            serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
-            self.child_char_struct.pack_into(serialized, offset + 3, char)
-
-        # Subtree merging: if this exact tree is already in the table, return
-        # its offset. Otherwise add it and return the new offset.
-        # TODO: why hashable = bytes(output[base_offset:] + serialized) didn't work?
-        hashable = bytes(serialized)
-        if merge_subtrees and hashable in hashtable:
-            return hashtable[hashable]
-        else:
-            offset = len(output)
-            output += serialized
-            if merge_subtrees: hashtable[hashable] = offset
-            return offset
-
-    def serialize(self, merge_subtrees=True) -> bytearray:
-        output = bytearray(b'\x00\x00\x00\x00')
-        hashtable = {}
-        self.root_offset_struct.pack_into(output, 0, self._serialize(hashtable, output, merge_subtrees=merge_subtrees))
-        return output
-
-search_data_header_struct = struct.Struct('<3sBHI')
-
-def serialize_search_data(trie: Trie, map: ResultMap, symbol_count, merge_subtrees=True, merge_prefixes=True) -> bytearray:
-    serialized_trie = trie.serialize(merge_subtrees=merge_subtrees)
-    serialized_map = map.serialize(merge_prefixes=merge_prefixes)
-    # magic header, version, symbol count, offset of result map
-    return search_data_header_struct.pack(b'MCS', 0, symbol_count, len(serialized_trie) + 10) + serialized_trie + serialized_map
-
 xref_id_rx = re.compile(r"""(.*)_1(_[a-z-]+[0-9]+|@)$""")
 slugify_nonalnum_rx = re.compile(r"""[^\w\s-]""")
 slugify_hyphens_rx = re.compile(r"""[-\s]+""")
@@ -2658,10 +2339,6 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers=
 
     return serialize_search_data(trie, map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes)
 
-def base85encode_search_data(data: bytearray) -> bytearray:
-    return (b"/* Generated by https://mcss.mosra.cz/documentation/doxygen/. Do not edit. */\n" +
-            b"Search.load('" + base64.b85encode(data, True) + b"');\n")
-
 def parse_xml(state: State, xml: str):
     # Reset counter for unique math formulas
     latex2svgextra.counter = 0
diff --git a/documentation/test_doxygen/populate-js-test-data.py b/documentation/test_doxygen/populate-js-test-data.py
index 84082585..86fa998f 100755
--- a/documentation/test_doxygen/populate-js-test-data.py
+++ b/documentation/test_doxygen/populate-js-test-data.py
@@ -30,7 +30,7 @@ import sys
 import pathlib
 
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
-from doxygen import Trie, ResultMap, ResultFlag, serialize_search_data
+from _search import Trie, ResultMap, ResultFlag, serialize_search_data
 
 basedir = pathlib.Path(os.path.dirname(os.path.realpath(__file__)))/'js-test-data'
 
diff --git a/documentation/test_doxygen/test_search.py b/documentation/test_doxygen/test_search.py
index 4b490d36..d9fa9103 100755
--- a/documentation/test_doxygen/test_search.py
+++ b/documentation/test_doxygen/test_search.py
@@ -30,7 +30,7 @@ import sys
 import unittest
 from types import SimpleNamespace as Empty
 
-from doxygen import Trie, ResultMap, ResultFlag, serialize_search_data, search_data_header_struct
+from _search import Trie, ResultMap, ResultFlag, serialize_search_data, search_data_header_struct
 
 from test_doxygen import IntegrationTestCase
 
diff --git a/documentation/test_doxygen/test_undocumented.py b/documentation/test_doxygen/test_undocumented.py
index aed7f522..07a80fa4 100644
--- a/documentation/test_doxygen/test_undocumented.py
+++ b/documentation/test_doxygen/test_undocumented.py
@@ -26,7 +26,7 @@
 
 import os
 
-from doxygen import search_data_header_struct
+from _search import search_data_header_struct
 
 from . import IntegrationTestCase
 
-- 
2.30.2
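
Usage sketch (not part of the patch): a minimal example of how another generator could drive the now-standalone module. The entry name, URL and the searchdata.js output filename below are made up for illustration; Trie, ResultMap, ResultFlag, serialize_search_data() and base85encode_search_data() are the symbols _search.py exports above.

#!/usr/bin/env python3
# Illustrative only -- the result registered here is invented, the API is
# what documentation/_search.py provides.
from _search import ResultFlag, ResultMap, Trie, serialize_search_data, base85encode_search_data

trie = Trie()
result_map = ResultMap()

# Register one function result; suffix_length=2 accounts for the trailing "()"
index = result_map.add('Foo::bar()', 'classFoo.html#bar', suffix_length=2,
    flags=ResultFlag.FUNC)

# Make it findable under both the fully qualified and the plain name
trie.insert('foo::bar', index)
trie.insert('bar', index)

# Pack the trie and the result map into the binary search data and wrap it
# in the JavaScript loader that search.js consumes
data = serialize_search_data(trie, result_map, symbol_count=1)
with open('searchdata.js', 'wb') as f:
    f.write(base85encode_search_data(data))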