header_struct = struct.Struct('<BB')
value_struct = struct.Struct('<H')
child_struct = struct.Struct('<I')
- child_char_struct = struct.Struct('<c')
+ child_char_struct = struct.Struct('<B')
def __init__(self):
self.values = []
self.children = {}
- def insert(self, path: str, value, lookahead_barriers=[]):
+ def _insert(self, path: bytes, value, lookahead_barriers):
+ # Recursive worker operating on raw bytes: consumes `path` one byte per
+ # level, descending into (and creating, if missing) the child keyed by
+ # that byte. Once `path` is exhausted, `value` is appended to this node.
+ # `lookahead_barriers` holds offsets relative to the current position in
+ # `path`; they are decremented by one on every level of recursion.
if not path:
self.values += [value]
return
char = path[0]
- assert not char.isupper() # to avoid unnecessary duplicates
if not char in self.children:
self.children[char] = (False, Trie())
+ # A barrier at offset 0 applies to this child: drop it from the list and
+ # set the child's flag (first tuple element) to True
if lookahead_barriers and lookahead_barriers[0] == 0:
lookahead_barriers = lookahead_barriers[1:]
self.children[char] = (True, self.children[char][1])
- self.children[char][1].insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+ self.children[char][1]._insert(path[1:], value, [b - 1 for b in lookahead_barriers])
+
+ def insert(self, path: str, value, lookahead_barriers=[]):
+ # Public entry point: validates `path` and hands its UTF-8 encoding
+ # over to `_insert()`. Because `_insert()` consumes one *byte* per trie
+ # level, `lookahead_barriers` offsets are effectively counted in bytes
+ # of the encoded path, not in characters.
+ #
+ # Checking every character separately: `str.isupper()` on the whole
+ # string is true only if *all* cased characters are uppercase, so a
+ # mixed-case path such as "Hýždě" would pass unnoticed.
+ assert not any(char.isupper() for char in path) # to avoid unnecessary duplicates
+ self._insert(path.encode('utf-8'), value, lookahead_barriers)
# Returns offset of the serialized thing in `output`
def _serialize(self, hashtable, output: bytearray) -> int:
# a 24 bit field
offset = len(serialized)
serialized += self.child_struct.pack(abs_offset | ((1 if lookahead_barrier else 0) << 23))
- self.child_char_struct.pack_into(serialized, offset + 3, char.encode('utf-8'))
+ self.child_char_struct.pack_into(serialized, offset + 3, char)
assert size == len(serialized)
return this.init(this.base85decode(base85string));
},
+ /* http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
+ /* Encodes a JS string as UTF-8. The result is again a JS string, but
+ each of its characters holds a single UTF-8 byte (char code 0-255),
+ so .length and .substr() on it operate on bytes. */
+ toUtf8: function(string) { return unescape(encodeURIComponent(string)); },
+ /* Inverse of toUtf8(): turns a byte-per-character UTF-8 string back
+ into a regular JS string. decodeURIComponent() throws an URIError if
+ the bytes don't form valid UTF-8. */
+ fromUtf8: function(string) { return decodeURIComponent(escape(string)); },
+
+ /* Takes the search string as a regular (UTF-16) JS string. Names in
+ the returned results are kept UTF-8-encoded, one byte per character */
search: function(searchString) {
- /* Normalize the search string first */
- searchString = searchString.toLowerCase().trim();
+ /* Normalize the search string first, convert to UTF-8 */
+ searchString = this.toUtf8(searchString.toLowerCase().trim());
/* TODO: maybe i could make use of InputEvent.data and others here */
url += String.fromCharCode(this.map.getUint8(j));
}
- /* Properly decode UTF-8 in the name
- http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html */
- results.push({name: decodeURIComponent(escape(name)),
+ /* Keeping in UTF-8, as we need that for proper slicing */
+ results.push({name: name,
url: url,
suffixLength: suffixLength + resultSuffixLength});
},
renderResults: /* istanbul ignore next */ function(value, results) {
- /* Normalize the value length so the slicing works properly */
- value = value.trim();
+ /* Normalize the value and encode as UTF-8 so the slicing works
+ properly */
+ value = this.toUtf8(value.trim());
if(!value.length) {
document.getElementById('search-help').style.display = 'block';
var list = '';
for(let i = 0; i != results.length; ++i) {
- list += '<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>';
+ list += this.fromUtf8('<li' + (i ? '' : ' id="search-current"') + '><a href="' + results[i].url + '" onmouseover="selectResult(event)"><div><span class="m-text m-dim">' + this.escapeForRtl(results[i].name.substr(0, results[i].name.length - value.length - results[i].suffixLength)) + '</span><span class="m-dox-search-typed">' + this.escapeForRtl(results[i].name.substr(results[i].name.length - value.length - results[i].suffixLength, value.length)) + '</span>' + this.escapeForRtl(results[i].name.substr(results[i].name.length - results[i].suffixLength)) + '</div></a></li>');
}
document.getElementById('search-results').innerHTML = list;
document.getElementById('search-current').scrollIntoView(true);
f.write(serialize_search_data(trie, map))
with open(basedir/'searchdata.b85', 'wb') as f:
f.write(base64.b85encode(serialize_search_data(trie, map), True))
+
+# Search data with non-ASCII names, written to unicode.bin. Trie keys are
+# lowercase, the result map keeps the original capitalization.
+trie = Trie()
+map = ResultMap()
+
+trie.insert("hýždě", map.add("Hýždě", "#a"))
+trie.insert("hárá", map.add("Hárá", "#b"))
+
+with open(basedir/'unicode.bin', 'wb') as f:
+ f.write(serialize_search_data(trie, map))
assert.equal(Search.escapeForRtl("NS::Class<int>"), "NS‎:‎:Class<int‎>‎");
}
+/* UTF-8 en/decoding */
+{
+ assert.equal(Search.fromUtf8(Search.toUtf8("hýždě bříza")), "hýždě bříza");
+}
+
/* Simple base85 decoding */
{
let b85 = 'Xk~0{Zy-ZbL0VZLcW-iRWFa9T';
/* UTF-8 decoding */
assert.deepEqual(Search.search('su'), [
- { name: 'Page » Subpage',
+ { name: Search.toUtf8('Page » Subpage'),
url: 'subpage.html',
suffixLength: 5 }]);
}
suffixLength: 8 }]);
}
+/* Search, Unicode */
+{
+ let buffer = fs.readFileSync(path.join(__dirname, "js-test-data/unicode.bin"));
+ assert.ok(Search.init(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength), 3));
+ assert.equal(Search.dataSize, 0.1);
+ assert.equal(Search.symbolCount, 2);
+ assert.deepEqual(Search.search('h'), [
+ { name: Search.toUtf8('Hýždě'),
+ url: '#a',
+ suffixLength: 7 },
+ { name: Search.toUtf8('Hárá'),
+ url: '#b',
+ suffixLength: 5 }]);
+ assert.deepEqual(Search.search('hý'), [
+ { name: Search.toUtf8('Hýždě'),
+ url: '#a',
+ suffixLength: 5 }]);
+ assert.deepEqual(Search.search('há'), [
+ { name: Search.toUtf8('Hárá'),
+ url: '#b',
+ suffixLength: 3 }]);
+}
+
/* Not testing Search.download() because the xmlhttprequest npm package is *crap* */
if child_count or value_count:
out += '\n'
out += indent
- out += Trie.child_char_struct.unpack_from(serialized, offset + 3)[0].decode('utf-8')
+ char = Trie.child_char_struct.unpack_from(serialized, offset + 3)[0]
+ if char <= 127:
+ out += chr(char)
+ else:
+ out += hex(char)
if Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
- out += '$\n'
- out += indent + ' '
+ out += '$'
+ if char > 127 or Trie.child_struct.unpack_from(serialized, offset)[0] & 0x00800000:
+ out += '\n' + indent + ' '
child_offset = Trie.child_struct.unpack_from(serialized, offset)[0] & 0x007fffff
stats.max_node_child_offset = max(child_offset, stats.max_node_child_offset)
offset += Trie.child_struct.size
""")
self.assertEqual(len(serialized), 340)
+ def test_unicode(self):
+ # Non-ASCII characters are stored as their individual UTF-8 bytes, each
+ # byte being one trie level; bytes above 127 are printed in hex.
+ # "hýždě" is h, c3 bd, c5 be, d, c4 9b and "hárá" is h, c3 a1, r, c3 a1,
+ # so both words share the `h` and `0xc3` nodes before branching.
+ trie = Trie()
+
+ trie.insert("hýždě", 0)
+ trie.insert("hárá", 1)
+
+ serialized = trie.serialize()
+ self.compare(serialized, """
+h0xc3
+ 0xbd
+ 0xc5
+ | 0xbe
+ | d0xc4
+ | 0x9b
+ | [0]
+ 0xa1
+ r0xc3
+ | 0xa1
+ | [1]
+""")
+ self.assertEqual(len(serialized), 82)
+
class MapSerialization(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)