From 8c0daaa29d4ee22318aa8a1c74d9a9131aea9f11 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Tue, 30 Jan 2018 00:38:39 +0100 Subject: [PATCH] doxygen: proper UTF-8 support in search. --- doxygen/dox2html5.py | 13 ++++++---- doxygen/search.js | 22 ++++++++++------- doxygen/test/js-test-data/unicode.bin | Bin 0 -> 122 bytes doxygen/test/populate-js-test-data.py | 9 +++++++ doxygen/test/test-search.js | 30 ++++++++++++++++++++++- doxygen/test/test_search.py | 33 +++++++++++++++++++++++--- 6 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 doxygen/test/js-test-data/unicode.bin diff --git a/doxygen/dox2html5.py b/doxygen/dox2html5.py index 81b891f3..e0119168 100755 --- a/doxygen/dox2html5.py +++ b/doxygen/dox2html5.py @@ -61,25 +61,28 @@ class Trie: header_struct = struct.Struct(' int: @@ -104,7 +107,7 @@ class Trie: # a 24 bit field offset = len(serialized) serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23)) - self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8')) + self.child_char_struct.pack_into(serialized, offset + 3, char) assert size == len(serialized) diff --git a/doxygen/search.js b/doxygen/search.js index 91861ce3..74d050f7 100644 --- a/doxygen/search.js +++ b/doxygen/search.js @@ -159,9 +159,15 @@ var Search = { return this.init(this.base85decode(base85string)); }, + /* http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */ + toUtf8: function(string) { return unescape(encodeURIComponent(string)); }, + fromUtf8: function(string) { return decodeURIComponent(escape(string)); }, + + /* Returns the values in UTF-8, but input is in whatever shitty 16bit + encoding JS has */ search: function(searchString) { - /* Normalize the search string first */ - searchString = searchString.toLowerCase().trim(); + /* Normalize the search string first, convert to UTF-8 */ + searchString = this.toUtf8(searchString.toLowerCase().trim()); /* TODO: maybe i could make use of InputEvent.data and others here */ @@ -260,9 +266,8 @@ var Search = { url += String.fromCharCode(this.map.getUint8(j)); } - /* Properly decode UTF-8 in the name - http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */ - results.push({name: decodeURIComponent(escape(name)), + /* Keeping in UTF-8, as we need that for proper slicing */ + results.push({name: name, url: url, suffixLength: suffixLength + resultSuffixLength}); @@ -305,8 +310,9 @@ var Search = { }, renderResults: /* istanbul ignore next */ function(value, results) { - /* Normalize the value length so the slicing works properly */ - value = value.trim(); + /* Normalize the value and encode as UTF-8 so the slicing works + properly */ + value = this.toUtf8(value.trim()); if(!value.length) { document.getElementById('search-help').style.display = 'block'; @@ -323,7 +329,7 @@ var Search = { var list = ''; for(let i = 0; i != results.length; ++i) { - list += '
' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '
'; + list += this.fromUtf8('
' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '
'); } document.getElementById('search-results').innerHTML = list; document.getElementById('search-current').scrollIntoView(true); diff --git a/doxygen/test/js-test-data/unicode.bin b/doxygen/test/js-test-data/unicode.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed2bd5051a6b5edf84a1ef12a6f27dede4a1c005 GIT binary patch literal 122 zcmW-Zp$ddR6h+VYR@ktZ76g+ZY%rNN*@VU5CYw!}VD@(=&4*|i{VyLAPC55*|J}$0 z3`$}^cz_e0a0(e%;SGmSz?B%I#Rt4lLl7F+QcBpr(YDe4V14OhuINS|_Uo{xqni8R Ix8SLk8"), "NS‎:‎:Class<int‎>‎"); } +/* UTF-8 en/decoding */ +{ + assert.equal(Search.fromUtf8(Search.toUtf8("hýždě bříza")), "hýždě bříza"); +} + /* Simple base85 decoding */ { let b85 = 'Xk~0{Zy-ZbL0VZLcW-iRWFa9T'; @@ -150,7 +155,7 @@ const { StringDecoder } = require('string_decoder'); /* UTF-8 decoding */ assert.deepEqual(Search.search('su'), [ - { name: 'Page » Subpage', + { name: Search.toUtf8('Page » Subpage'), url: 'subpage.html', suffixLength: 5 }]); } @@ -193,4 +198,27 @@ const { StringDecoder } = require('string_decoder'); suffixLength: 8 }]); } +/* Search, Unicode */ +{ + let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin")); + assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3)); + assert.equal(Search.dataSize, 0.1); + assert.equal(Search.symbolCount, 2); + assert.deepEqual(Search.search('h'), [ + { name: Search.toUtf8('Hýždě'), + url: '#a', + suffixLength: 7 }, + { name: Search.toUtf8('Hárá'), + url: '#b', + suffixLength: 5 }]); + assert.deepEqual(Search.search('hý'), [ + { name: Search.toUtf8('Hýždě'), + url: '#a', + suffixLength: 5 }]); + assert.deepEqual(Search.search('há'), [ + { name: Search.toUtf8('Hárá'), + url: '#b', + suffixLength: 3 }]); +} + /* Not testing Search.download() because the xmlhttprequest npm package is *crap* */ diff --git a/doxygen/test/test_search.py b/doxygen/test/test_search.py index 7f8ba91f..7f6b3e22 100755 --- a/doxygen/test/test_search.py +++ b/doxygen/test/test_search.py @@ -64,10 +64,15 @@ def _pretty_print_trie(serialized: bytearray, hashtable, stats, base_offset, ind if child_count or value_count: out += '\n' out += indent - out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8') + char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0] + if char <= 127: + out += chr(char) + else: + out += hex(char) if Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000: - out += '$\n' - out += indent + ' ' + out += '$' + if char > 127 or Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000: + out += '\n' + indent + ' ' child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset) offset += Trie.child_struct.size @@ -223,6 +228,28 @@ range [2] """) self.assertEqual(len(serialized), 340) + def test_unicode(self): + trie = Trie() + + trie.insert("hýždě", 0) + trie.insert("hárá", 1) + + serialized = trie.serialize() + self.compare(serialized, """ +h0xc3 + 0xbd + 0xc5 + | 0xbe + | d0xc4 + | 0x9b + | [0] + 0xa1 + r0xc3 + | 0xa1 + | [1] +""") + self.assertEqual(len(serialized), 82) + class MapSerialization(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -- 2.30.2
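
The indexing-side idea, as a minimal hypothetical sketch: names are encoded to UTF-8 before insertion, so every trie edge carries a single byte and a multi-byte character simply spans several nodes, which is what the new test_unicode expectations show (h, 0xc3, 0xbd, ... on separate levels). ByteTrie below is a stand-in for illustration only, not the Trie class from dox2html5.py:

    class ByteTrie:
        # Toy byte-keyed trie; each edge is one UTF-8 byte.
        def __init__(self):
            self.results = []
            self.children = {}  # byte value (0-255) -> ByteTrie

        def insert(self, name: str, result: int):
            # Encode up front; from here on everything is plain bytes,
            # so 'ý' (0xc3 0xbd) becomes two consecutive edges.
            node = self
            for byte in name.encode('utf-8'):
                node = node.children.setdefault(byte, ByteTrie())
            node.results.append(result)

    trie = ByteTrie()
    trie.insert("hýždě", 0)
    trie.insert("hárá", 1)
    # Both names share the edges 0x68 ('h') and 0xc3, then branch on
    # 0xbd vs. 0xa1, which matches the tree printed by test_unicode.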
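
For the serialization side, a sketch of the child record that the second dox2html5.py hunk writes and the updated _pretty_print_trie reads: a 32-bit value holding a 23-bit child offset, the lookahead-barrier flag in bit 23, and the edge's single UTF-8 byte in the top byte, written separately at offset + 3. The struct format strings are not visible in the hunk context, so the '<I'/'<B' formats below are assumptions; storing the raw byte is what lets the old char.encode('utf-8') call go away:

    import struct

    # Assumed formats: little-endian uint32 record, one unsigned char byte on top.
    child_struct = struct.Struct('<I')
    child_char_struct = struct.Struct('<B')

    def pack_child(char_byte: int, child_offset: int, barrier: bool) -> bytes:
        record = bytearray(child_struct.pack(child_offset | (barrier << 23)))
        # One raw byte of the UTF-8 sequence, no per-character encoding needed.
        child_char_struct.pack_into(record, 3, char_byte)
        return bytes(record)

    def unpack_child(record: bytes):
        value = child_struct.unpack_from(record)[0]
        char_byte = child_char_struct.unpack_from(record, 3)[0]
        return char_byte, value & 0x007fffff, bool(value & 0x00800000)

    assert unpack_child(pack_child(0xc3, 1234, True)) == (0xc3, 1234, True)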
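
On the search side, suffixLength is likewise a byte count, which is why search.js now converts the query to UTF-8 (toUtf8) before matching and only decodes with fromUtf8 when rendering; slicing the untouched UTF-16 string by byte counts would cut multi-byte characters apart. A hypothetical Python check of the numbers asserted against unicode.bin in the JS test, ignoring the per-result suffix that the map can add on top:

    def suffix_length(name: str, query: str) -> int:
        # Both operands counted in UTF-8 bytes, like search.js after toUtf8().
        return len(name.encode('utf-8')) - len(query.encode('utf-8'))

    assert suffix_length('Hýždě', 'h') == 7
    assert suffix_length('Hýždě', 'hý') == 5
    assert suffix_length('Hárá', 'h') == 5
    assert suffix_length('Hárá', 'há') == 3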