chiark / gitweb /
doxygen: implement lookahead barriers for search.
author Vladimír Vondruš <mosra@centrum.cz>
Mon, 29 Jan 2018 12:35:01 +0000 (13:35 +0100)
committer Vladimír Vondruš <mosra@centrum.cz>
Sat, 3 Feb 2018 09:51:55 +0000 (10:51 +0100)
doxygen/dox2html5.py
doxygen/search.js
doxygen/test/js-test-data/searchdata.b85
doxygen/test/js-test-data/searchdata.bin
doxygen/test/populate-js-test-data.py
doxygen/test/test-search.js
doxygen/test/test_search.py

index 7fde75d70fb2448592982ffef32247173d19da6f..81b891f3e17c0ebdeddaa00b375781aeee990d56 100755 (executable)
@@ -54,9 +54,9 @@ import m.math
 import ansilexer
 
 class Trie:
-    #  root  |     |     header       | values |    child    |
-    # offset | ... | size/2 | value # |  ...   | offsets ... |
-    #  32b   |     |   8b   |    8b   | n*16b  |   8b + 24b  |
+    #  root  |     |     header       | values | child 1 | child 1 | child 1 |
+    # offset | ... | size/2 | value # |  ...   |   char  | barrier | offset  | ...
+    #  32b   |     |   8b   |    8b   | n*16b  |   8b    |    1b   |   23b   |
     root_offset_struct = struct.Struct('<I')
     header_struct = struct.Struct('<BB')
     value_struct = struct.Struct('<H')
@@ -67,7 +67,7 @@ class Trie:
         self.values = []
         self.children = {}
 
-    def insert(self, path: str, value):
+    def insert(self, path: str, value, lookahead_barriers=[]):
         if not path:
             self.values += [value]
             return
@@ -75,16 +75,19 @@ class Trie:
         char = path[0]
         assert not char.isupper() # to avoid unnecessary duplicates
         if not char in self.children:
-            self.children[char] = Trie()
-        self.children[char].insert(path[1:], value)
+            self.children[char] = (False, Trie())
+        if lookahead_barriers and lookahead_barriers[0] == 0:
+            lookahead_barriers = lookahead_barriers[1:]
+            self.children[char] = (True, self.children[char][1])
+        self.children[char][1].insert(path[1:], value, [b - 1 for b in lookahead_barriers])
 
     # Returns offset of the serialized thing in `output`
     def _serialize(self, hashtable, output: bytearray) -> int:
         # Serialize all children first
         child_offsets = []
         for char, child in self.children.items():
-            offset = child._serialize(hashtable, output)
-            child_offsets += [(char, offset)]
+            offset = child[1]._serialize(hashtable, output)
+            child_offsets += [(char, child[0], offset)]
 
         # Serialize this node
         size = int(2 + 2*len(self.values) + 4*len(child_offsets))
@@ -94,13 +97,13 @@ class Trie:
             serialized += self.value_struct.pack(v)
 
         # Serialize child offsets
-        for char, abs_offset in child_offsets:
-            assert abs_offset < 2**24
+        for char, lookahead_barrier, abs_offset in child_offsets:
+            assert abs_offset < 2**23
 
             # write them over each other because that's the only way to pack
             # a 24 bit field
             offset = len(serialized)
-            serialized += self.child_struct.pack(abs_offset)
+            serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
             self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8'))
 
         assert size == len(serialized)
@@ -1550,7 +1553,14 @@ def _build_search_data(state: State, prefix, id: str, trie: Trie, map: ResultMap
         # TODO: escape elsewhere so i don't have to unescape here
         index = map.add(html.unescape(result_joiner.join(prefixed_result_name)), compound.url, suffix_length=suffix_length)
         for i in range(len(prefixed_name)):
-            trie.insert(html.unescape(joiner.join(prefixed_name[i:])).lower(), index)
+            lookahead_barriers = []
+            name = ''
+            for j in prefixed_name[i:]:
+                if name:
+                    lookahead_barriers += [len(name)]
+                    name += joiner
+                name += html.unescape(j)
+            trie.insert(name.lower(), index, lookahead_barriers=lookahead_barriers)
 
     for i in compound.children:
         if i in state.compounds:
@@ -1589,7 +1599,14 @@ def build_search_data(state: State) -> bytearray:
 
         prefixed_name = result.prefix + [name]
         for i in range(len(prefixed_name)):
-            trie.insert(html.unescape('::'.join(prefixed_name[i:])).lower(), index)
+            lookahead_barriers = []
+            name = ''
+            for j in prefixed_name[i:]:
+                if name:
+                    lookahead_barriers += [len(name)]
+                    name += '::'
+                name += html.unescape(j)
+            trie.insert(name.lower(), index, lookahead_barriers=lookahead_barriers)
 
     return serialize_search_data(trie, map)
 
index ab8af9869c383cb123721305df824fb29cd007a3..064028ac606b5e100498c35f95aa838bcd87251b 100644 (file)
@@ -192,7 +192,7 @@ var Search = {
                 if(String.fromCharCode(this.trie.getUint8(childOffset + j*4 + 3)) != searchString[foundPrefix])
                     continue;
 
-                this.searchStack.push(this.trie.getUint32(childOffset + j*4, true) & 0x00ffffff);
+                this.searchStack.push(this.trie.getUint32(childOffset + j*4, true) & 0x007fffff);
                 found = true;
                 break;
             }
@@ -276,9 +276,15 @@ var Search = {
         let relChildOffset = 2 + this.trie.getUint8(offset + 1)*2;
         let childCount = (nodeSize - relChildOffset)/4;
         let childOffset = offset + relChildOffset;
-        for(let j = 0; j != childCount; ++j)
-            if(this.gatherResults(this.trie.getUint32(childOffset + j*4, true) & 0x00ffffff, suffixLength + 1, results))
+        for(let j = 0; j != childCount; ++j) {
+            let offsetBarrier = this.trie.getUint32(childOffset + j*4, true);
+
+            /* Lookahead barrier, don't dig deeper */
+            if(offsetBarrier & 0x00800000) continue;
+
+            if(this.gatherResults(offsetBarrier & 0x007fffff, suffixLength + 1, results))
                 return true;
+        }
 
         /* Still hungry. */
         return false;
index 1fd24166796cdc8e173f73ca39ae4697d8c3b829..ba9751d5f611bb54e14753bb5491735f9e391e1d 100644 (file)
@@ -1 +1 @@
-O+!-vL;(N*Dggih0s#R40{{d704W0i2mk;m0{{*H0B!>S6aWBe0s#X60{|cZ04W0iBme*?0{|)j0B!>SFaQ8)0{}Jv0Br*RJOBVX1OWm7LI40d0{}<>0CEEWPyhgL0{~V40CWQYTmS%L0{~(G0A&IJ1pos8ZU6u&0|0UW04M_hcmM!y0|0&i0BHjNga80-0|1Hu06GK#1OSi#06GHzmH+@{0|1@?0A~XLqyPYJ0|2T30AU9J8UO%oXaE3qumAvZ0|2%F06GK#006`Q06GHz$^Zap0|3$h0CWTc0RRI41pos8-T(k80|4d#04M_h>;M361pwFp0Aca~0BHgN1^@#90s#PJ0{{jA0A~XL3;_UP0{{{M0B{2U7y$rc0{|WY0Cfof_y7QHXaE3qumAvZBmn?(AOHXWHvj+uVgLXDhX4Qpz5oCK;Q#;u76AYNG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
+O+!-vL;(N*Dggih0s#R40{{d704W0i2mk;m0{{*H0B!>S6aWBe0s#X60{|cZ04W0iBme*?0{|)j0B!>SFaQ8)0{}Jv0Br*RJOBVX1OWm7LI8j|0{}<>0CEEWPyhgL0{~V40CWQYTmS%L0{~(G0A&IJ1pos8ZU6u&0|0UW04M_hcmM!y0|0&i0BHjNga80-0|1Hu06GK#1OSi#fI0&JmH+@{0|1@?0A~XLqyPYJ0|2T30AU9J8UO%oXaE3qumAvZ0|2%F06GK#006`QfI0&J$^Zap0|3$h0CWTc0RRI41pos8-T(k80|4d#04M_h>;M361pwFp0Aca~0BHgN1^@#90s#PJ0{{jA0A~XL3;_UP0{{{M0B{2U7y$rc0{|WY0Cfof_y7QHXaE3qumAvZBmn?(AOHXWHvj+uVgLXDhX4Qpz5oCK;Q#;u76AYNG64VpO<{Cs0B&JzWpi+0V`WWYbZ9PUbZu-1O<{CsIy!A>ZYXJPbSxlgZgeRCZeeX@b8ul}WldppXf9}UZEPcLX>LtnbZ9y{R%K&!Z*l-*Y+-YAO<{CsUol@XR%K&!Z*neZbZu+~O<{CsIyzQmV{~tFIy!A>ZYU`rV{dMAbO2*)VRLg$VRUF;F<&uOWn*-2axQ3eZEPcLX>LtnbZ9y{QekdqWdLJrVRLg$VRUF;F<&uKVQyz-E@*UZYz9qXbZ9y{QekdqWjZ=-X>KSfAY*TCb94Y>Y+-YAO<{CsUol@XQekdqWiDuRZEPcLX>L$qXJsJ5yC73_VsK$+WdL(^VsK$+WiDuRZEOGl
\ No newline at end of file
index e7a0c3e91cb828fd2c1a8d07de752f5f40104bbd..3af09be4e79c36db8c1f664602c790b0cd7f065d 100644 (file)
Binary files a/doxygen/test/js-test-data/searchdata.bin and b/doxygen/test/js-test-data/searchdata.bin differ
index 310e87442be4a10806789c5936d8217134ce0e5c..334e8bbf2389d23b0013d063bf7fa5f3708f6591 100755 (executable)
@@ -48,21 +48,21 @@ map = ResultMap()
 
 trie.insert("math", map.add("Math", "namespaceMath.html"))
 index = map.add("Math::min(int, int)", "namespaceMath.html#min", suffix_length=8)
-trie.insert("math::min()", index)
+trie.insert("math::min()", index, lookahead_barriers=[4])
 trie.insert("min()", index)
 index = map.add("Math::Vector", "classMath_1_1Vector.html")
 trie.insert("math::vector", index)
 trie.insert("vector", index)
 index = map.add("Math::Vector::min() const", "classMath_1_1Vector.html#min", suffix_length=6)
-trie.insert("math::vector::min()", index)
-trie.insert("vector::min()", index)
+trie.insert("math::vector::min()", index, lookahead_barriers=[4, 12])
+trie.insert("vector::min()", index, lookahead_barriers=[6])
 trie.insert("min()", index)
 index = map.add("Math::Range", "classMath_1_1Range.html")
 trie.insert("math::range", index)
 trie.insert("range", index)
 index = map.add("Math::Range::min() const", "classMath_1_1Range.html#min", suffix_length=6)
-trie.insert("math::range::min()", index)
-trie.insert("range::min()", index)
+trie.insert("math::range::min()", index, lookahead_barriers=[4, 11])
+trie.insert("range::min()", index, lookahead_barriers=[5])
 trie.insert("min()", index)
 trie.insert("subpage", map.add("Page » Subpage", "subpage.html"))
 
index c48dd8123b6c103b975736ccbe38ccc47391be0f..5a3292d0bcec6eee5bc8d5034282dc94a8760314 100644 (file)
@@ -108,21 +108,6 @@ const { StringDecoder } = require('string_decoder');
         { name: 'Math',
           url: 'namespaceMath.html',
           suffixLength: 3 },
-        { name: 'Math::min(int, int)',
-          url: 'namespaceMath.html#min',
-          suffixLength: 18 },
-        { name: 'Math::Vector',
-          url: 'classMath_1_1Vector.html',
-          suffixLength: 11 },
-        { name: 'Math::Vector::min() const',
-          url: 'classMath_1_1Vector.html#min',
-          suffixLength: 24 },
-        { name: 'Math::Range',
-          url: 'classMath_1_1Range.html',
-          suffixLength: 10 },
-        { name: 'Math::Range::min() const',
-          url: 'classMath_1_1Range.html#min',
-          suffixLength: 23 },
         { name: 'Math::min(int, int)',
           url: 'namespaceMath.html#min',
           suffixLength: 12 },
@@ -153,10 +138,7 @@ const { StringDecoder } = require('string_decoder');
     let resultsForVec = [
         { name: 'Math::Vector',
           url: 'classMath_1_1Vector.html',
-          suffixLength: 3 },
-        { name: 'Math::Vector::min() const',
-          url: 'classMath_1_1Vector.html#min',
-          suffixLength: 16 }];
+          suffixLength: 3 }];
     assert.deepEqual(Search.search('vec'), resultsForVec);
 
     /* Uppercase things and spaces */
@@ -184,10 +166,10 @@ const { StringDecoder } = require('string_decoder');
           suffixLength: 3 },
         { name: 'Math::min(int, int)',
           url: 'namespaceMath.html#min',
-          suffixLength: 18 },
-        { name: 'Math::Vector',
-          url: 'classMath_1_1Vector.html',
-          suffixLength: 11 }]);
+          suffixLength: 12 },
+        { name: 'Math::Vector::min() const',
+          url: 'classMath_1_1Vector.html#min',
+          suffixLength: 10 }]);
 }
 
 /* Search loaded from a base85-encoded file should work properly */
index 82252aaee15036e4bc75f628e8665f828f7d94bf..7f8ba91f3a1d746cfbdb028c4ae430c7587b66f0 100755 (executable)
@@ -65,7 +65,10 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
             out += '\n'
             out += indent
         out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8')
-        child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00ffffff
+        if Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
+            out += '$\n'
+            out += indent + ' '
+        child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
         stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
         offset += Trie.child_struct.size
         out += _pretty_print_trie(serialized, hashtable, stats, child_offset, indent + ('|' if draw_pipe else ' '), draw_pipe=False, show_merged=show_merged)
@@ -159,7 +162,7 @@ magnum [1337, 21]
         trie = Trie()
 
         trie.insert("math", 0)
-        trie.insert("math::vector", 1)
+        trie.insert("math::vector", 1, lookahead_barriers=[4])
         trie.insert("vector", 1)
         trie.insert("math::range", 2)
         trie.insert("range", 2)
@@ -171,8 +174,8 @@ magnum [1337, 21]
         trie.insert("math::minmax", 5)
         trie.insert("minmax", 5)
 
-        trie.insert("math::vector::minmax", 6)
-        trie.insert("vector::minmax", 6)
+        trie.insert("math::vector::minmax", 6, lookahead_barriers=[4, 12])
+        trie.insert("vector::minmax", 6, lookahead_barriers=[6])
         trie.insert("minmax", 6)
         trie.insert("math::vector::min", 7)
         trie.insert("vector::min", 7)
@@ -181,8 +184,8 @@ magnum [1337, 21]
         trie.insert("vector::max", 8)
         trie.insert("max", 8)
 
-        trie.insert("math::range::min", 9)
-        trie.insert("range::min", 9)
+        trie.insert("math::range::min", 9, lookahead_barriers=[4, 11])
+        trie.insert("range::min", 9, lookahead_barriers=[5])
         trie.insert("min", 9)
 
         trie.insert("math::range::max", 10)
@@ -192,12 +195,15 @@ magnum [1337, 21]
         serialized = trie.serialize()
         self.compare(serialized, """
 math [0]
-||| ::vector [1]
-|||   |     ::min [7]
+||| :$
+|||  :vector [1]
+|||   |     :$
+|||   |      :min [7]
 |||   |        | max [6]
 |||   |        ax [8]
 |||   range [2]
-|||   |    ::min [9]
+|||   |    :$
+|||   |     :min [9]
 |||   |       ax [10]
 |||   min [3]
 |||   || max [5]
@@ -206,11 +212,13 @@ math [0]
 |in [3, 7, 9]
 || max [5, 6]
 vector [1]
-|     ::min [7]
+|     :$
+|      :min [7]
 |        | max [6]
 |        ax [8]
 range [2]
-|    ::min [9]
+|    :$
+|     :min [9]
 |       ax [10]
 """)
         self.assertEqual(len(serialized), 340)
@@ -308,22 +316,28 @@ class Search(IntegrationTestCase):
         #print(search_data_pretty)
         self.assertEqual(search_data_pretty, """
 namespace [0]
-|        ::class [1]
-|          |    ::foo() [6, 7, 8, 9]
+|        :$
+|         :class [1]
+|          |    :$
+|          |     :foo() [6, 7, 8, 9]
 |          enum [11]
-|          |   ::value [10]
+|          |   :$
+|          |    :value [10]
 |          typedef [12]
 |          variable [13]
 class [1]
-|    ::foo() [6, 7, 8, 9]
+|    :$
+|     :foo() [6, 7, 8, 9]
 a page [2]
 subpage [3]
 dir [4]
-|  /file.h [5]
+|  /$
+|   file.h [5]
 file.h [5]
 |oo() [6, 7, 8, 9]
 enum [11]
-|   ::value [10]
+|   :$
+|    :value [10]
 value [10]
 | riable [13]
 typedef [12]