doxygen: proper UTF-8 support in search.

author Vladimír Vondruš <mosra@centrum.cz>

Mon, 29 Jan 2018 23:38:39 +0000 (00:38 +0100)

committer Vladimír Vondruš <mosra@centrum.cz>

Sat, 3 Feb 2018 09:51:55 +0000 (10:51 +0100)
author Vladimír Vondruš <mosra@centrum.cz>
Mon, 29 Jan 2018 23:38:39 +0000 (00:38 +0100)
committer Vladimír Vondruš <mosra@centrum.cz>
Sat, 3 Feb 2018 09:51:55 +0000 (10:51 +0100)
diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py

index 81b891f3e17c0ebdeddaa00b375781aeee990d56..e01191680c854006c340e6fe4a87177da8eb07d5 100755 (executable)
--- a/doxygen/dox2html5.py
+++ b/doxygen/dox2html5.py
@@ -61,25 +61,28 @@ class Trie:
      header_struct = struct.Struct('<BB')
      value_struct = struct.Struct('<H')
      child_struct = struct.Struct('<I')
-    child_char_struct = struct.Struct('<c')
+    child_char_struct = struct.Struct('<B')
  
      def __init__(self):
          self.values = []
          self.children = {}
  
-    def insert(self, path: str, value, lookahead_barriers=[]):
+    def _insert(self, path: bytes, value, lookahead_barriers):
          if not path:
              self.values += [value]
              return
  
          char = path[0]
-        assert not char.isupper() # to avoid unnecessary duplicates
          if not char in self.children:
              self.children[char] = (False, Trie())
          if lookahead_barriers and lookahead_barriers[0] == 0:
              lookahead_barriers = lookahead_barriers[1:]
              self.children[char] = (True, self.children[char][1])
-        self.children[char][1].insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+        self.children[char][1]._insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+
+    def insert(self, path: str, value, lookahead_barriers=[]):
+        assert not path.isupper() # to avoid unnecessary duplicates
+        self._insert(path.encode('utf-8'), value, lookahead_barriers)
  
      # Returns offset of the serialized thing in `output`
      def _serialize(self, hashtable, output: bytearray) -> int:
@@ -104,7 +107,7 @@ class Trie:
              # a 24 bit field
              offset = len(serialized)
              serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
-            self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8'))
+            self.child_char_struct.pack_into(serialized, offset + 3, char)
  
          assert size == len(serialized)
  
diff --git a/doxygen/search.js b/doxygen/search.js

index 91861ce35e01387d377dbfe6858b2b140440800a..74d050f719306275e2b70939177299419f52a008 100644 (file)
--- a/doxygen/search.js
+++ b/doxygen/search.js
@@ -159,9 +159,15 @@ var Search = {
          return this.init(this.base85decode(base85string));
      },
  
+    /* http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
+    toUtf8: function(string) { return unescape(encodeURIComponent(string)); },
+    fromUtf8: function(string) { return decodeURIComponent(escape(string)); },
+
+    /* Returns the values in UTF-8, but input is in whatever shitty 16bit
+       encoding JS has */
      search: function(searchString) {
-        /* Normalize the search string first */
-        searchString = searchString.toLowerCase().trim();
+        /* Normalize the search string first, convert to UTF-8 */
+        searchString = this.toUtf8(searchString.toLowerCase().trim());
  
          /* TODO: maybe i could make use of InputEvent.data and others here */
  
@@ -260,9 +266,8 @@ var Search = {
                  url += String.fromCharCode(this.map.getUint8(j));
              }
  
-            /* Properly decode UTF-8 in the name
-               http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
-            results.push({name: decodeURIComponent(escape(name)),
+            /* Keeping in UTF-8, as we need that for proper slicing */
+            results.push({name: name,
                            url: url,
                            suffixLength: suffixLength + resultSuffixLength});
  
@@ -305,8 +310,9 @@ var Search = {
      },
  
      renderResults: /* istanbul ignore next */ function(value, results) {
-        /* Normalize the value length so the slicing works properly */
-        value = value.trim();
+        /* Normalize the value and encode as UTF-8 so the slicing works
+           properly */
+        value = this.toUtf8(value.trim());
  
          if(!value.length) {
              document.getElementById('search-help').style.display = 'block';
@@ -323,7 +329,7 @@ var Search = {
  
              var list = '';
              for(let i = 0; i != results.length; ++i) {
-                list += '<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>';
+                list += this.fromUtf8('<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>');
              }
              document.getElementById('search-results').innerHTML = list;
              document.getElementById('search-current').scrollIntoView(true);
diff --git a/doxygen/test/js-test-data/unicode.bin b/doxygen/test/js-test-data/unicode.bin

new file mode 100644 (file)

index 0000000..ed2bd50

Binary files /dev/null and b/doxygen/test/js-test-data/unicode.bin differ
diff --git a/doxygen/test/populate-js-test-data.py b/doxygen/test/populate-js-test-data.py

index 334e8bbf2389d23b0013d063bf7fa5f3708f6591..13a77f4f983d823f2ddaf78a78aaf4e3b7d884b6 100755 (executable)
--- a/doxygen/test/populate-js-test-data.py
+++ b/doxygen/test/populate-js-test-data.py
@@ -70,3 +70,12 @@ with open(basedir/'searchdata.bin', 'wb') as f:
      f.write(serialize_search_data(trie, map))
  with open(basedir/'searchdata.b85', 'wb') as f:
      f.write(base64.b85encode(serialize_search_data(trie, map), True))
+
+trie = Trie()
+map = ResultMap()
+
+trie.insert("hýždě", map.add("Hýždě", "#a"))
+trie.insert("hárá", map.add("Hárá", "#b"))
+
+with open(basedir/'unicode.bin', 'wb') as f:
+    f.write(serialize_search_data(trie, map))
diff --git a/doxygen/test/test-search.js b/doxygen/test/test-search.js

index 0e80b73150283376287f796c3593d610b16c7a1b..ec8637c791057d00ceaa3f6d5234d7eda6d76154 100644 (file)
--- a/doxygen/test/test-search.js
+++ b/doxygen/test/test-search.js
@@ -41,6 +41,11 @@ const { StringDecoder } = require('string_decoder');
      assert.equal(Search.escapeForRtl("NS::Class<int>"), "NS&lrm;:&lrm;:Class&lt;int&lrm;&gt;&lrm;");
  }
  
+/* UTF-8 en/decoding */
+{
+    assert.equal(Search.fromUtf8(Search.toUtf8("hýždě bříza")), "hýždě bříza");
+}
+
  /* Simple base85 decoding */
  {
      let b85 = 'Xk~0{Zy-ZbL0VZLcW-iRWFa9T';
@@ -150,7 +155,7 @@ const { StringDecoder } = require('string_decoder');
  
      /* UTF-8 decoding */
      assert.deepEqual(Search.search('su'), [
-        { name: 'Page » Subpage',
+        { name: Search.toUtf8('Page » Subpage'),
            url: 'subpage.html',
            suffixLength: 5 }]);
  }
@@ -193,4 +198,27 @@ const { StringDecoder } = require('string_decoder');
            suffixLength: 8 }]);
  }
  
+/* Search, Unicode */
+{
+    let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin"));
+    assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3));
+    assert.equal(Search.dataSize, 0.1);
+    assert.equal(Search.symbolCount, 2);
+    assert.deepEqual(Search.search('h'), [
+        { name: Search.toUtf8('Hýždě'),
+          url: '#a',
+          suffixLength: 7 },
+        { name: Search.toUtf8('Hárá'),
+          url: '#b',
+          suffixLength: 5 }]);
+    assert.deepEqual(Search.search('hý'), [
+        { name: Search.toUtf8('Hýždě'),
+          url: '#a',
+          suffixLength: 5 }]);
+    assert.deepEqual(Search.search('há'), [
+        { name: Search.toUtf8('Hárá'),
+          url: '#b',
+          suffixLength: 3 }]);
+}
+
  /* Not testing Search.download() because the xmlhttprequest npm package is *crap* */
diff --git a/doxygen/test/test_search.py b/doxygen/test/test_search.py

index 7f8ba91f3a1d746cfbdb028c4ae430c7587b66f0..7f6b3e22924df4388aa38e1a8fc4dace1fd38d72 100755 (executable)
--- a/doxygen/test/test_search.py
+++ b/doxygen/test/test_search.py
@@ -64,10 +64,15 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind
          if child_count or value_count:
              out += '\n'
              out += indent
-        out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8')
+        char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
+        if char <= 127:
+            out += chr(char)
+        else:
+            out += hex(char)
          if Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
-            out += '$\n'
-            out += indent + ' '
+            out += '$'
+        if char > 127 or Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
+            out += '\n' + indent + ' '
          child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
          stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
          offset += Trie.child_struct.size
@@ -223,6 +228,28 @@ range [2]
  """)
          self.assertEqual(len(serialized), 340)
  
+    def test_unicode(self):
+        trie = Trie()
+
+        trie.insert("hýždě", 0)
+        trie.insert("hárá", 1)
+
+        serialized = trie.serialize()
+        self.compare(serialized, """
+h0xc3
+  0xbd
+   0xc5
+  | 0xbe
+  |  d0xc4
+  |    0x9b
+  |      [0]
+  0xa1
+   r0xc3
+  |  0xa1
+  |    [1]
+""")
+        self.assertEqual(len(serialized), 82)
+
  class MapSerialization(unittest.TestCase):
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
author	Vladimír Vondruš <mosra@centrum.cz>
	Mon, 29 Jan 2018 23:38:39 +0000 (00:38 +0100)
committer	Vladimír Vondruš <mosra@centrum.cz>
	Sat, 3 Feb 2018 09:51:55 +0000 (10:51 +0100)
doxygen/dox2html5.py		patch | blob | history
doxygen/search.js		patch | blob | history
doxygen/test/js-test-data/unicode.bin	[new file with mode: 0644]	patch | blob
doxygen/test/populate-js-test-data.py		patch | blob | history
doxygen/test/test-search.js		patch | blob | history
doxygen/test/test_search.py		patch | blob | history