From 0411e1812f1274bf6e49404c0dcdf649f7637076 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Sun, 9 Jan 2022 16:52:26 +0100 Subject: [PATCH] documentation: expose search data packing options. There are new SEARCH_RESULT_ID_BYTES, SEARCH_FILE_OFFSET_BYTES and SEARCH_NAME_SIZE_BYTES options exposed in both Doxygen and Python doc generator. Because they affect some internals that the regular user shouldn't need to care about and don't reflect anything tangible like project symbol count, they're not really documented. On the other hand, they can't really be estimated beforehand. An estimator would either overestimate, leading to files larger than they could be, or underestimate, leading to random corner cases that would be impossible to track down. Thus the project just starts with the small sizes and, if they're not enough, an exception with a (hopefully helpful-enough) message is raised, suggesting that users adjust their config. --- doc/documentation/doxygen.rst | 31 +++++++++++++++++++++ doc/documentation/python.rst | 27 ++++++++++++++++++ documentation/_search.py | 15 +++++++++- documentation/doxygen.py | 8 +++++- documentation/python.py | 5 +++- documentation/test/test_search.py | 18 ++++++------ documentation/test_doxygen/test_doxyfile.py | 3 ++ documentation/test_doxygen/test_search.py | 16 +++++++++++ documentation/test_python/test_search.py | 19 +++++++++++++ 9 files changed, 130 insertions(+), 12 deletions(-) diff --git a/doc/documentation/doxygen.rst b/doc/documentation/doxygen.rst index 6366f8bd..ceead01f 100644 --- a/doc/documentation/doxygen.rst +++ b/doc/documentation/doxygen.rst @@ -390,6 +390,18 @@ Variable Description Python documentation shares the same directory. If not set, ``searchdata`` is used. +:py:`SEARCH_RESULT_ID_BYTES: int` Search data packing option. A value of + :py:`2`, :py:`3` or :py:`4` is allowed. If + not set, :py:`2` is used. See + `Search options`_ for more information. 
+:py:`SEARCH_FILE_OFFSET_BYTES: int` Search data packing option. A value of + :py:`3` or :py:`4` is allowed. If not set, + :py:`3` is used. See `Search options`_ for + more information. +:py:`SEARCH_NAME_SIZE_BYTES: int` Search data packing option. A value of + :py:`1` or :py:`2` is allowed. If not set, + :py:`1` is used. See `Search options`_ for + more information. :py:`SEARCH_HELP: str` HTML code to display as help text on empty search popup. If not set, a default message is used. Has effect only if @@ -479,6 +491,9 @@ these are not expected to be excessively large. :ini:`M_SEARCH_DISABLED` :py:`SEARCH_DISABLED` :ini:`M_SEARCH_DOWNLOAD_BINARY` :py:`SEARCH_DOWNLOAD_BINARY` :ini:`M_SEARCH_FILENAME_PREFIX` :py:`SEARCH_FILENAME_PREFIX` + :ini:`M_SEARCH_RESULT_ID_BYTES` :py:`SEARCH_RESULT_ID_BYTES` + :ini:`M_SEARCH_FILE_OFFSET_BYTES` :py:`SEARCH_FILE_OFFSET_BYTES` + :ini:`M_SEARCH_NAME_SIZE_BYTES` :py:`SEARCH_NAME_SIZE_BYTES` :ini:`M_SEARCH_HELP` :py:`SEARCH_HELP` :ini:`M_SEARCH_BASE_URL` :py:`SEARCH_BASE_URL` :ini:`M_SEARCH_EXTERNAL_URL` :py:`SEARCH_EXTERNAL_URL` @@ -711,6 +726,22 @@ search to a subdomain: SEARCH_EXTERNAL_URL = "https://google.com/search?q=site:doc.magnum.graphics+{query}" +The search binary is implicitly made with the tightest packing possible for +smallest download sizes. On large projects with tens of thousands of symbols it +may however happen that the data won't fit and doc generation fails with an +exception such as the following, suggesting you to increase the packed type +sizes: + + OverflowError: Trie result ID too large to store in 16 bits, set + SEARCH_RESULT_ID_BYTES = 3 in your conf.py. + +The relevant `configuration`_ is :py:`SEARCH_RESULT_ID_BYTES`, +:py:`SEARCH_FILE_OFFSET_BYTES` and :py:`SEARCH_NAME_SIZE_BYTES`. Simply update +your ``conf.py`` with suggested values (or the ``Doxyfile-mcss`` with this +option prefixed with ``M_``) and restart the generator. 
Due to the way the +search data get processed during serialization it's unfortunately not feasible +to estimate the packing sizes beforehand. + `Showing undocumented symbols and files`_ ----------------------------------------- diff --git a/doc/documentation/python.rst b/doc/documentation/python.rst index 3f972ee2..795ff191 100644 --- a/doc/documentation/python.rst +++ b/doc/documentation/python.rst @@ -244,6 +244,18 @@ Variable Description Python documentation shares the same directory. If not set, ``searchdata`` is used. +:py:`SEARCH_RESULT_ID_BYTES: int` Search data packing option. A value of + :py:`2`, :py:`3` or :py:`4` is allowed. If + not set, :py:`2` is used. See + `Search options`_ for more information. +:py:`SEARCH_FILE_OFFSET_BYTES: int` Search data packing option. A value of + :py:`3` or :py:`4` is allowed. If not set, + :py:`3` is used. See `Search options`_ for + more information. +:py:`SEARCH_NAME_SIZE_BYTES: int` Search data packing option. A value of + :py:`1` or :py:`2` is allowed. If not set, + :py:`1` is used. See `Search options`_ for + more information. :py:`SEARCH_HELP: str` :abbr:`reST ` markup to display as help text on empty search popup. If not set, a default message is used. Has @@ -397,6 +409,21 @@ search to a subdomain: SEARCH_EXTERNAL_URL = 'https://google.com/search?q=site:doc.magnum.graphics+{query}' +The search binary is implicitly made with the tightest packing possible for +smallest download sizes. On large projects with tens of thousands of symbols it +may however happen that the data won't fit and doc generation fails with an +exception such as the following, suggesting you to increase the packed type +sizes: + + OverflowError: Trie result ID too large to store in 16 bits, set + SEARCH_RESULT_ID_BYTES = 3 in your conf.py. + +The relevant `configuration`_ is :py:`SEARCH_RESULT_ID_BYTES`, +:py:`SEARCH_FILE_OFFSET_BYTES` and :py:`SEARCH_NAME_SIZE_BYTES`. 
Simply update +your ``conf.py`` with suggested values and restart the generator. Due to the +way the search data get processed during serialization it's unfortunately not +feasible to estimate the packing sizes beforehand. + `Custom URL formatters`_ ------------------------ diff --git a/documentation/_search.py b/documentation/_search.py index a2d44556..c756adff 100644 --- a/documentation/_search.py +++ b/documentation/_search.py @@ -177,13 +177,23 @@ class Serializer: def pack_result_map_flags(self, flags: int): return self.result_map_flags_struct.pack(flags) def pack_result_map_offset(self, offset: int): + if offset >= 256**self.file_offset_bytes: + raise OverflowError("Result map offset too large to store in {} bits, set SEARCH_FILE_OFFSET_BYTES = {} in your conf.py.".format(self.file_offset_bytes*8, self.file_offset_bytes + 1)) return offset.to_bytes(self.file_offset_bytes, byteorder='little') def pack_result_map_prefix(self, id: int, length: int): + if id >= 256**self.result_id_bytes: + raise OverflowError("Result map prefix ID too large to store in {} bits, set SEARCH_RESULT_ID_BYTES = {} in your conf.py.".format(self.result_id_bytes*8, self.result_id_bytes + 1)) + if length >= 256**self.name_size_bytes: + raise OverflowError("Result map prefix length too large to store in {} bits, set SEARCH_NAME_SIZE_BYTES = {} in your conf.py.".format(self.name_size_bytes*8, self.name_size_bytes + 1)) return id.to_bytes(self.result_id_bytes, byteorder='little') + \ length.to_bytes(self.name_size_bytes, byteorder='little') def pack_result_map_suffix_length(self, length: int): + if length >= 256**self.name_size_bytes: + raise OverflowError("Result map suffix length too large to store in {} bits, set SEARCH_NAME_SIZE_BYTES = {} in your conf.py.".format(self.name_size_bytes*8, self.name_size_bytes + 1)) return length.to_bytes(self.name_size_bytes, byteorder='little') def pack_result_map_alias(self, id: int): + if id >= 256**self.result_id_bytes: + raise OverflowError("Result 
map alias ID too large to store in {} bits, set SEARCH_RESULT_ID_BYTES = {} in your conf.py.".format(self.result_id_bytes*8, self.result_id_bytes + 1)) return id.to_bytes(self.result_id_bytes, byteorder='little') def pack_trie_root_offset(self, offset: int): @@ -201,11 +211,14 @@ class Serializer: out += (len(result_ids) | 0x8000).to_bytes(2, byteorder='big') out += len(child_chars_offsets_barriers).to_bytes(1, byteorder='little') for id in result_ids: + if id >= 256**self.result_id_bytes: + raise OverflowError("Trie result ID too large to store in {} bits, set SEARCH_RESULT_ID_BYTES = {} in your conf.py.".format(self.result_id_bytes*8, self.result_id_bytes + 1)) out += id.to_bytes(self.result_id_bytes, byteorder='little') out += bytes([char for char, offset, barrier in child_chars_offsets_barriers]) child_barrier_mask = 1 << (self.file_offset_bytes*8 - 1) for char, offset, barrier in child_chars_offsets_barriers: - if offset >= child_barrier_mask: raise OverflowError + if offset >= child_barrier_mask: + raise OverflowError("Trie child offset too large to store in {} bits, set SEARCH_FILE_OFFSET_BYTES = {} in your conf.py.".format(self.file_offset_bytes*8 - 1, self.file_offset_bytes + 1)) out += (offset | (barrier*child_barrier_mask)).to_bytes(self.file_offset_bytes, byteorder='little') return out diff --git a/documentation/doxygen.py b/documentation/doxygen.py index 973c3973..fa22d140 100755 --- a/documentation/doxygen.py +++ b/documentation/doxygen.py @@ -126,6 +126,9 @@ default_config = { 'SEARCH_DISABLED': False, 'SEARCH_DOWNLOAD_BINARY': False, 'SEARCH_FILENAME_PREFIX': 'searchdata', + 'SEARCH_RESULT_ID_BYTES': 2, + 'SEARCH_FILE_OFFSET_BYTES': 3, + 'SEARCH_NAME_SIZE_BYTES': 1, 'SEARCH_HELP': """

Search for symbols, directories, files, pages or modules. You can omit any prefix from the symbol or file path; adding a @@ -2440,7 +2443,7 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers= # order by default trie.sort(map) - return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) + return serialize_search_data(Serializer(file_offset_bytes=state.config['SEARCH_FILE_OFFSET_BYTES'], result_id_bytes=state.config['SEARCH_RESULT_ID_BYTES'], name_size_bytes=state.config['SEARCH_NAME_SIZE_BYTES']), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) def parse_xml(state: State, xml: str): # Reset counter for unique math formulas @@ -3510,6 +3513,9 @@ def parse_doxyfile(state: State, doxyfile, values = None): ('M_SEARCH_DISABLED', 'SEARCH_DISABLED', bool), ('M_SEARCH_DOWNLOAD_BINARY', 'SEARCH_DOWNLOAD_BINARY', bool), ('M_SEARCH_FILENAME_PREFIX', 'SEARCH_FILENAME_PREFIX', str), + ('M_SEARCH_RESULT_ID_BYTES', 'SEARCH_RESULT_ID_BYTES', int), + ('M_SEARCH_FILE_OFFSET_BYTES', 'SEARCH_FILE_OFFSET_BYTES', int), + ('M_SEARCH_NAME_SIZE_BYTES', 'SEARCH_NAME_SIZE_BYTES', int), ('M_SEARCH_HELP', 'SEARCH_HELP', str), ('M_SEARCH_BASE_URL', 'SEARCH_BASE_URL', str), ('M_SEARCH_EXTERNAL_URL', 'SEARCH_EXTERNAL_URL', str), diff --git a/documentation/python.py b/documentation/python.py index 2d048858..a47bfb21 100755 --- a/documentation/python.py +++ b/documentation/python.py @@ -172,6 +172,9 @@ default_config = { 'SEARCH_DISABLED': False, 'SEARCH_DOWNLOAD_BINARY': False, 'SEARCH_FILENAME_PREFIX': 'searchdata', + 'SEARCH_RESULT_ID_BYTES': 2, + 'SEARCH_FILE_OFFSET_BYTES': 3, + 'SEARCH_NAME_SIZE_BYTES': 1, 'SEARCH_HELP': """.. raw:: html

Search for modules, classes, functions and other @@ -2454,7 +2457,7 @@ def build_search_data(state: State, merge_subtrees=True, add_lookahead_barriers= # order by default trie.sort(map) - return serialize_search_data(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) + return serialize_search_data(Serializer(file_offset_bytes=state.config['SEARCH_FILE_OFFSET_BYTES'], result_id_bytes=state.config['SEARCH_RESULT_ID_BYTES'], name_size_bytes=state.config['SEARCH_NAME_SIZE_BYTES']), trie, map, search_type_map, symbol_count, merge_subtrees=merge_subtrees, merge_prefixes=merge_prefixes) def run(basedir, config, *, templates=default_templates, search_add_lookahead_barriers=True, search_merge_subtrees=True, search_merge_prefixes=True): # Populate the INPUT, if not specified, make it absolute diff --git a/documentation/test/test_search.py b/documentation/test/test_search.py index 2e8c8e01..77e86e49 100755 --- a/documentation/test/test_search.py +++ b/documentation/test/test_search.py @@ -200,7 +200,7 @@ __init__ [{}] def test_16bit_result_id_too_small(self): trie = Trie() trie.insert("a", 65536) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Trie result ID too large to store in 16 bits, set SEARCH_RESULT_ID_BYTES = 3 in your conf.py."): trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -209,7 +209,7 @@ __init__ [{}] def test_24bit_result_id_too_small(self): trie = Trie() trie.insert("a", 16*1024*1024) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Trie result ID too large to store in 24 bits, set SEARCH_RESULT_ID_BYTES = 4 in your conf.py."): trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1)) # This should work @@ -227,7 +227,7 @@ __init__ [{}] for i in range(130): trie.insert('a'*i, 
results_32k) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Trie child offset too large to store in 23 bits, set SEARCH_FILE_OFFSET_BYTES = 4 in your conf.py."): trie.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -313,7 +313,7 @@ class MapSerialization(unittest.TestCase): # flags, 1 byte for the null terminator, 6 bytes for the URL map.add('F'*(16*1024*1024 - 14), 'f.html', flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map offset too large to store in 24 bits, set SEARCH_FILE_OFFSET_BYTES = 4 in your conf.py."): # Disabling prefix merging otherwise memory usage goes to hell map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1), merge_prefixes=False) @@ -325,7 +325,7 @@ class MapSerialization(unittest.TestCase): map = ResultMap() map.add("F()" + ';'*256, "f.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC), suffix_length=256) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map suffix length too large to store in 8 bits, set SEARCH_NAME_SIZE_BYTES = 2 in your conf.py."): map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -336,7 +336,7 @@ class MapSerialization(unittest.TestCase): map.add("A", 'a'*251 + ".html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)) map.add("A::foo()", 'a'*251 + ".html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map prefix length too large to store in 8 bits, set SEARCH_NAME_SIZE_BYTES = 2 in your conf.py."): map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -356,7 +356,7 @@ class 
MapSerialization(unittest.TestCase): self.assertEqual(map.add("B", "b.html", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.CLASS)), 65536) map.add("B::foo()", "b.html#foo", flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map prefix ID too large to store in 16 bits, set SEARCH_RESULT_ID_BYTES = 3 in your conf.py."): map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -371,7 +371,7 @@ class MapSerialization(unittest.TestCase): # The alias doesn't exist of course, hopefully that's fine in this case map.add("B", "", alias=65536) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map alias ID too large to store in 16 bits, set SEARCH_RESULT_ID_BYTES = 3 in your conf.py."): map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1)) # This should work @@ -383,7 +383,7 @@ class MapSerialization(unittest.TestCase): # The alias doesn't exist of course, hopefully that's fine in this case map.add("B", "", alias=16*1024*1024) - with self.assertRaises(OverflowError): + with self.assertRaisesRegex(OverflowError, "Result map alias ID too large to store in 24 bits, set SEARCH_RESULT_ID_BYTES = 4 in your conf.py."): map.serialize(Serializer(file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1)) # This should work diff --git a/documentation/test_doxygen/test_doxyfile.py b/documentation/test_doxygen/test_doxyfile.py index a191420c..6b9dc76d 100644 --- a/documentation/test_doxygen/test_doxyfile.py +++ b/documentation/test_doxygen/test_doxyfile.py @@ -77,6 +77,9 @@ class Doxyfile(unittest.TestCase): 'SEARCH_DISABLED': False, 'SEARCH_DOWNLOAD_BINARY': False, 'SEARCH_FILENAME_PREFIX': 'searchdata', + 'SEARCH_RESULT_ID_BYTES': 2, + 'SEARCH_FILE_OFFSET_BYTES': 3, + 'SEARCH_NAME_SIZE_BYTES': 1, 'SEARCH_BASE_URL': None, 'SEARCH_EXTERNAL_URL': None, 
'SEARCH_HELP': diff --git a/documentation/test_doxygen/test_search.py b/documentation/test_doxygen/test_search.py index 6c5a6d3d..2c458539 100755 --- a/documentation/test_doxygen/test_search.py +++ b/documentation/test_doxygen/test_search.py @@ -225,6 +225,22 @@ union [59] (EntryType.VAR, CssClass.DEFAULT, 'var') """.strip()) + def test_byte_sizes(self): + for config, bytes, size in [ + ('SEARCH_RESULT_ID_BYTES', 3, 4959), + ('SEARCH_RESULT_ID_BYTES', 4, 5077), + ('SEARCH_FILE_OFFSET_BYTES', 4, 5302), + ('SEARCH_NAME_SIZE_BYTES', 2, 4893) + ]: + with self.subTest(config=config, bytes=bytes, size=size): + self.run_doxygen(index_pages=[], wildcard='*.xml', config={ + config: bytes + }) + + with open(os.path.join(self.path, 'html', searchdata_filename.format(search_filename_prefix='secretblob')), 'rb') as f: + serialized = f.read() + self.assertEqual(len(serialized), size) + class LongSuffixLength(IntegrationTestCase): def test(self): self.run_doxygen(index_pages=[], wildcard='*.xml') diff --git a/documentation/test_python/test_search.py b/documentation/test_python/test_search.py index 52adfe2a..e3a45b68 100644 --- a/documentation/test_python/test_search.py +++ b/documentation/test_python/test_search.py @@ -185,6 +185,25 @@ overloaded_method [17, 19, 15] (EntryType.DATA, CssClass.DEFAULT, 'data') """.strip()) + def test_byte_sizes(self): + for config, bytes, size in [ + ('SEARCH_RESULT_ID_BYTES', 3, 2333), + ('SEARCH_RESULT_ID_BYTES', 4, 2392), + ('SEARCH_FILE_OFFSET_BYTES', 4, 2525), + ('SEARCH_NAME_SIZE_BYTES', 2, 2313) + ]: + with self.subTest(config=config, bytes=bytes, size=size): + self.run_python({ + 'SEARCH_DISABLED': False, + 'SEARCH_DOWNLOAD_BINARY': True, + config: bytes, + 'PYBIND11_COMPATIBILITY': True + }) + + with open(os.path.join(self.path, 'output', searchdata_filename.format(search_filename_prefix='searchdata')), 'rb') as f: + serialized = f.read() + self.assertEqual(len(serialized), size) + class LongSuffixLength(BaseInspectTestCase): def 
test(self): self.run_python({ -- 2.30.2