From: Vladimír Vondruš <mosra@centrum.cz>
Date: Thu, 18 Jul 2019 10:55:59 +0000 (+0200)
Subject: documentation: make it possible to have more than 128 results for a node.
X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=63e8d474e84de6be95ed9b2fe8ee7da348eeff49;p=blog.git

documentation: make it possible to have more than 128 results for a node.

Python's __init__ is the main offender: the (currently very bare-bones)
Magnum Python bindings have 340 results for __init__. This change is
based on the assumption that nodes with an extreme number of results, on
the other hand, don't have many children, so we can steal some bits from
the child count instead. Now it's either up to 127 results and up to 255
children, or up to 2047 results and up to 15 children.
---

diff --git a/documentation/_search.py b/documentation/_search.py
index 25c0be1b..7d6e2656 100644
--- a/documentation/_search.py
+++ b/documentation/_search.py
@@ -271,9 +271,16 @@ class ResultMap:
         return output
 
 class Trie:
-    #  root  |     |     header         | results | child 1 | child 1 | child 1 |
-    # offset | ... | result # | value # |   ...   |  char   | barrier | offset  | ...
-    #  32b   |     |    8b    |   8b    |  n*16b  |   8b    |    1b   |   23b   |
+    #  root  |     |       header         | results | child 1 | child 1 | child 1 |
+    # offset | ... | | result # | child # |   ...   |  char   | barrier | offset  | ...
+    #  32b   |     |0|    7b    |   8b    |  n*16b  |   8b    |    1b   |   23b   |
+    #
+    # if result count > 127, it's instead:
+    #
+    #  root  |     |      header          | results | child 1 | child 1 | child 1 |
+    # offset | ... | | result # | child # |   ...   |  char   | barrier | offset  | ...
+    #  32b   |     |1|   11b    |   4b    |  n*16b  |   8b    |    1b   |   23b   |
+
     root_offset_struct = struct.Struct('<I')
     header_struct = struct.Struct('<BB')
     result_struct = struct.Struct('<H')
@@ -337,9 +344,21 @@ class Trie:
             offset = child[1]._serialize(hashtable, output, merge_subtrees=merge_subtrees)
             child_offsets += [(char, child[0], offset)]
 
-        # Serialize this node
+        # Serialize this node. Sometimes we'd have an insane amount of results
+        # (such as Python's __init__), but very few children to go with that.
+        # Then we can make the result count storage larger (11 bits, up to
+        # 2047 results) and the child count storage smaller (4 bits, up to 15
+        # children). Hopefully that's enough. The remaining leftmost bit is
+        # used as an indicator of this shifted state.
         serialized = bytearray()
-        serialized += self.header_struct.pack(len(self.results), len(self.children))
+        if len(self.results) > 127:
+            assert len(self.children) < 16 and len(self.results) < 2048
+            result_count = (len(self.results) & 0x7f) | 0x80
+            children_count = ((len(self.results) & 0xf80) >> 3) | len(self.children)
+        else:
+            result_count = len(self.results)
+            children_count = len(self.children)
+        serialized += self.header_struct.pack(result_count, children_count)
         for v in self.results:
             serialized += self.result_struct.pack(v)
 
diff --git a/documentation/search.js b/documentation/search.js
index cbd6ed03..8b0c9a5b 100644
--- a/documentation/search.js
+++ b/documentation/search.js
@@ -256,7 +256,15 @@ var Search = {
             /* Calculate offset and count of children */
             let offset = this.searchStack[this.searchStack.length - 1];
             let relChildOffset = 2 + this.trie.getUint8(offset)*2;
+
+            /* Calculate result and child count. If there are a lot of results,
+               the result count "leaks over" to the child count storage. */
+            let resultCount = this.trie.getUint8(offset);
             let childCount = this.trie.getUint8(offset + 1);
+            if(resultCount & 0x80) {
+                resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3);
+                childCount = childCount & 0x0f;
+            }
 
             /* Go through all children and find the next offset */
             let childOffset = offset + relChildOffset;
@@ -299,8 +307,17 @@ var Search = {
             let offset = current[0];
             let suffixLength = current[1];
 
-            /* Populate the results with all values associated with this node */
+            /* Calculate result and child count. If there are a lot of results,
+               the result count "leaks over" to the child count storage. */
+            /* TODO: hmmm. this is helluvalot duplicated code. hmm. */
             let resultCount = this.trie.getUint8(offset);
+            let childCount = this.trie.getUint8(offset + 1);
+            if(resultCount & 0x80) {
+                resultCount = (resultCount & 0x7f) | ((childCount & 0xf0) << 3);
+                childCount = childCount & 0x0f;
+            }
+
+            /* Populate the results with all values associated with this node */
             for(let i = 0; i != resultCount; ++i) {
                 let index = this.trie.getUint16(offset + (i + 1)*2, true);
                 results.push(this.gatherResult(index, suffixLength, 0xffffff)); /* should be enough haha */
@@ -313,7 +330,6 @@ var Search = {
             /* Dig deeper */
             /* TODO: hmmm. this is helluvalot duplicated code. hmm. */
             let relChildOffset = 2 + this.trie.getUint8(offset)*2;
-            let childCount = this.trie.getUint8(offset + 1);
             let childOffset = offset + relChildOffset;
             for(let j = 0; j != childCount; ++j) {
                 let offsetBarrier = this.trie.getUint32(childOffset + j*4, true);
diff --git a/documentation/test/js-test-data/manyresults.bin b/documentation/test/js-test-data/manyresults.bin
new file mode 100644
index 00000000..4d3eb35f
Binary files /dev/null and b/documentation/test/js-test-data/manyresults.bin differ
diff --git a/documentation/test/populate-js-test-data.py b/documentation/test/populate-js-test-data.py
index 998beff1..9903ee84 100755
--- a/documentation/test/populate-js-test-data.py
+++ b/documentation/test/populate-js-test-data.py
@@ -103,3 +103,19 @@ trie.insert("range", map.add("Magnum::Math::Range", "classMagnum_1_1Math_1_1Rang
 
 with open(basedir/'nested.bin', 'wb') as f:
     f.write(serialize_search_data(trie, map, search_type_map, 4))
+
+# Extreme amount of search results (Python's __init__, usually)
+
+trie = Trie()
+map = ResultMap()
+
+for i in range(128):
+    trie.insert("__init__", map.add(f"Foo{i}.__init__(self)", f"Foo{i}.html#__init__", suffix_length=6, flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)))
+
+# It's __init_subclass__, but here I want to trigger the case of both a high
+# amount of results and some children as well.
+for i in [3, 15, 67]:
+    trie.insert("__init__subclass__", map.add(f"Foo{i}.__init__subclass__(self)", f"Foo{i}.html#__init__subclass__", suffix_length=6, flags=ResultFlag.from_type(ResultFlag.NONE, EntryType.FUNC)))
+
+with open(basedir/'manyresults.bin', 'wb') as f:
+    f.write(serialize_search_data(trie, map, search_type_map, 128 + 3))
diff --git a/documentation/test/test-search.js b/documentation/test/test-search.js
index 2bf9cc7a..d99550ba 100644
--- a/documentation/test/test-search.js
+++ b/documentation/test/test-search.js
@@ -336,4 +336,66 @@ const { StringDecoder } = require('string_decoder');
           suffixLength: 3 }], 'nge']);
 }
 
+/* Extreme amount of search results */
+{
+    let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/manyresults.bin"));
+    assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 10000));
+    assert.equal(Search.dataSize, 6415);
+    assert.equal(Search.symbolCount, 128 + 3);
+    assert.equal(Search.maxResults, 10000);
+    assert.deepEqual(Search.search('__init__')[0].length, 128 + 3);
+    assert.deepEqual(Search.search('__init__')[1], '');
+    assert.deepEqual(Search.search('__init__')[0][0],
+        { name: 'Foo0.__init__(self)',
+          url: 'Foo0.html#__init__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 6 });
+    /* The 127 other results in between are similar. It should also print
+       results from the children: */
+    assert.deepEqual(Search.search('__init__')[0][128],
+        { name: 'Foo3.__init__subclass__(self)',
+          url: 'Foo3.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 16 });
+    assert.deepEqual(Search.search('__init__')[0][129],
+        { name: 'Foo15.__init__subclass__(self)',
+          url: 'Foo15.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 16 });
+    assert.deepEqual(Search.search('__init__')[0][130],
+        { name: 'Foo67.__init__subclass__(self)',
+          url: 'Foo67.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 16 });
+
+    /* Searching for nested results should work as well */
+    assert.deepEqual(Search.search('__init__s'), [[
+        { name: 'Foo3.__init__subclass__(self)',
+          url: 'Foo3.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 15 },
+        { name: 'Foo15.__init__subclass__(self)',
+          url: 'Foo15.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 15 },
+        { name: 'Foo67.__init__subclass__(self)',
+          url: 'Foo67.html#__init__subclass__',
+          flags: 1, /* has suffix */
+          cssClass: 'm-info',
+          typeName: 'func',
+          suffixLength: 15 }], 'ubclass__']);
+}
+
 /* Not testing Search.download() because the xmlhttprequest npm package is *crap* */