chiark / gitweb /
doxygen: proper UTF-8 support in search.
author Vladimír Vondruš <mosra@centrum.cz>
Mon, 29 Jan 2018 23:38:39 +0000 (00:38 +0100)
committer Vladimír Vondruš <mosra@centrum.cz>
Sat, 3 Feb 2018 09:51:55 +0000 (10:51 +0100)
doxygen/dox2html5.py
doxygen/search.js
doxygen/test/js-test-data/unicode.bin [new file with mode: 0644]
doxygen/test/populate-js-test-data.py
doxygen/test/test-search.js
doxygen/test/test_search.py

diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py
index 81b891f3e17c0ebdeddaa00b375781aeee990d56..e01191680c854006c340e6fe4a87177da8eb07d5 100755 (executable)
@@ -61,25 +61,28 @@ class Trie:
     header_struct = struct.Struct('<BB')
     value_struct = struct.Struct('<H')
     child_struct = struct.Struct('<I')
-    child_char_struct = struct.Struct('<c')
+    child_char_struct = struct.Struct('<B')
 
     def __init__(self):
         self.values = []
         self.children = {}
 
-    def insert(self, path: str, value, lookahead_barriers=[]):
+    def _insert(self, path: bytes, value, lookahead_barriers):
         if not path:
             self.values += [value]
             return
 
         char = path[0]
-        assert not char.isupper() # to avoid unnecessary duplicates
         if not char in self.children:
             self.children[char] = (False, Trie())
         if lookahead_barriers and lookahead_barriers[0] == 0:
             lookahead_barriers = lookahead_barriers[1:]
             self.children[char] = (True, self.children[char][1])
-        self.children[char][1].insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+        self.children[char][1]._insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+
+    def insert(self, path: str, value, lookahead_barriers=[]):
+        assert not path.isupper() # to avoid unnecessary duplicates
+        self._insert(path.encode('utf-8'), value, lookahead_barriers)
 
     # Returns offset of the serialized thing in `output`
     def _serialize(self, hashtable, output: bytearray) -> int:
@@ -104,7 +107,7 @@ class Trie:
             # a 24 bit field
             offset = len(serialized)
             serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
-            self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8'))
+            self.child_char_struct.pack_into(serialized, offset + 3, char)
 
         assert size == len(serialized)
 
diff --git a/doxygen/search.js b/doxygen/search.js
index 91861ce35e01387d377dbfe6858b2b140440800a..74d050f719306275e2b70939177299419f52a008 100644 (file)
@@ -159,9 +159,15 @@ var Search = {
         return this.init(this.base85decode(base85string));
     },
 
+    /* http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
+    toUtf8: function(string) { return unescape(encodeURIComponent(string)); },
+    fromUtf8: function(string) { return decodeURIComponent(escape(string)); },
+
+    /* Returns the values in UTF-8, but input is in whatever shitty 16bit
+       encoding JS has */
     search: function(searchString) {
-        /* Normalize the search string first */
-        searchString = searchString.toLowerCase().trim();
+        /* Normalize the search string first, convert to UTF-8 */
+        searchString = this.toUtf8(searchString.toLowerCase().trim());
 
         /* TODO: maybe i could make use of InputEvent.data and others here */
 
@@ -260,9 +266,8 @@ var Search = {
                 url += String.fromCharCode(this.map.getUint8(j));
             }
 
-            /* Properly decode UTF-8 in the name
-               http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
-            results.push({name: decodeURIComponent(escape(name)),
+            /* Keeping in UTF-8, as we need that for proper slicing */
+            results.push({name: name,
                           url: url,
                           suffixLength: suffixLength + resultSuffixLength});
 
@@ -305,8 +310,9 @@ var Search = {
     },
 
     renderResults: /* istanbul ignore next */ function(value, results) {
-        /* Normalize the value length so the slicing works properly */
-        value = value.trim();
+        /* Normalize the value and encode as UTF-8 so the slicing works
+           properly */
+        value = this.toUtf8(value.trim());
 
         if(!value.length) {
             document.getElementById('search-help').style.display = 'block';
@@ -323,7 +329,7 @@ var Search = {
 
             var list = '';
             for(let i = 0; i != results.length; ++i) {
-                list += '<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>';
+                list += this.fromUtf8('<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>');
             }
             document.getElementById('search-results').innerHTML = list;
             document.getElementById('search-current').scrollIntoView(true);
diff --git a/doxygen/test/js-test-data/unicode.bin b/doxygen/test/js-test-data/unicode.bin
new file mode 100644 (file)
index 0000000..ed2bd50
Binary files /dev/null and b/doxygen/test/js-test-data/unicode.bin differ
diff --git a/doxygen/test/populate-js-test-data.py b/doxygen/test/populate-js-test-data.py
index 334e8bbf2389d23b0013d063bf7fa5f3708f6591..13a77f4f983d823f2ddaf78a78aaf4e3b7d884b6 100755 (executable)
@@ -70,3 +70,12 @@ with open(basedir/'searchdata.bin', 'wb') as f:
     f.write(serialize_search_data(trie, map))
 with open(basedir/'searchdata.b85', 'wb') as f:
     f.write(base64.b85encode(serialize_search_data(trie, map), True))
+
+trie = Trie()
+map = ResultMap()
+
+trie.insert("hýždě", map.add("Hýždě", "#a"))
+trie.insert("hárá", map.add("Hárá", "#b"))
+
+with open(basedir/'unicode.bin', 'wb') as f:
+    f.write(serialize_search_data(trie, map))
diff --git a/doxygen/test/test-search.js b/doxygen/test/test-search.js
index 0e80b73150283376287f796c3593d610b16c7a1b..ec8637c791057d00ceaa3f6d5234d7eda6d76154 100644 (file)
@@ -41,6 +41,11 @@ const { StringDecoder } = require('string_decoder');
     assert.equal(Search.escapeForRtl("NS::Class<int>"), "NS&lrm;:&lrm;:Class&lt;int&lrm;&gt;&lrm;");
 }
 
+/* UTF-8 en/decoding */
+{
+    assert.equal(Search.fromUtf8(Search.toUtf8("hýždě bříza")), "hýždě bříza");
+}
+
 /* Simple base85 decoding */
 {
     let b85 = 'Xk~0{Zy-ZbL0VZLcW-iRWFa9T';
@@ -150,7 +155,7 @@ const { StringDecoder } = require('string_decoder');
 
     /* UTF-8 decoding */
     assert.deepEqual(Search.search('su'), [
-        { name: 'Page » Subpage',
+        { name: Search.toUtf8('Page » Subpage'),
           url: 'subpage.html',
           suffixLength: 5 }]);
 }
@@ -193,4 +198,27 @@ const { StringDecoder } = require('string_decoder');
           suffixLength: 8 }]);
 }
 
+/* Search, Unicode */
+{
+    let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin"));
+    assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3));
+    assert.equal(Search.dataSize, 0.1);
+    assert.equal(Search.symbolCount, 2);
+    assert.deepEqual(Search.search('h'), [
+        { name: Search.toUtf8('Hýždě'),
+          url: '#a',
+          suffixLength: 7 },
+        { name: Search.toUtf8('Hárá'),
+          url: '#b',
+          suffixLength: 5 }]);
+    assert.deepEqual(Search.search('hý'), [
+        { name: Search.toUtf8('Hýždě'),
+          url: '#a',
+          suffixLength: 5 }]);
+    assert.deepEqual(Search.search('há'), [
+        { name: Search.toUtf8('Hárá'),
+          url: '#b',
+          suffixLength: 3 }]);
+}
+
 /* Not testing Search.download() because the xmlhttprequest npm package is *crap* */
diff --git a/doxygen/test/test_search.py b/doxygen/test/test_search.py
index 7f8ba91f3a1d746cfbdb028c4ae430c7587b66f0..7f6b3e22924df4388aa38e1a8fc4dace1fd38d72 100755 (executable)
@@ -64,10 +64,15 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
         if child_count or value_count:
             out += '\n'
             out += indent
-        out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8')
+        char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
+        if char <= 127:
+            out += chr(char)
+        else:
+            out += hex(char)
         if Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
-            out += '$\n'
-            out += indent + ' '
+            out += '$'
+        if char > 127 or Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
+            out += '\n' + indent + ' '
         child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
         stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
         offset += Trie.child_struct.size
@@ -223,6 +228,28 @@ range [2]
 """)
         self.assertEqual(len(serialized), 340)
 
+    def test_unicode(self):
+        trie = Trie()
+
+        trie.insert("hýždě", 0)
+        trie.insert("hárá", 1)
+
+        serialized = trie.serialize()
+        self.compare(serialized, """
+h0xc3
+  0xbd
+   0xc5
+  | 0xbe
+  |  d0xc4
+  |    0x9b
+  |      [0]
+  0xa1
+   r0xc3
+  |  0xa1
+  |    [1]
+""")
+        self.assertEqual(len(serialized), 82)
+
 class MapSerialization(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)