From 2176e02c9036e79cad6c0061e18e603b69762582 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 12 Jan 2024 12:43:53 +0000 Subject: [PATCH] Fix toot length tracking with nontrivial Unicode. Somehow I totally forgot to distinguish chars from bytes in Composer::make_regions(), _and_ forgot to test it. So I had a crash this morning when editing a toot containing Unicode put the 500-byte mark midway through a UTF-8 character. That's wrong for _two_ reasons - firstly, no endpoint in the regions array ought to be anywhere other than a character boundary, and secondly, the 500 _byte_ boundary has nothing to do with the toot length limit in the first place! The 500 _character_ boundary would have been more to the point. --- src/editor.rs | 61 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/src/editor.rs b/src/editor.rs index 1ed6e04..395cade 100644 --- a/src/editor.rs +++ b/src/editor.rs @@ -24,6 +24,8 @@ impl EditorCore { fn is_char_boundary(&self, pos: usize) -> bool { if !self.text.is_char_boundary(pos) { false + } else if pos == 0 { + true // beginning of string } else { match self.text[pos..].chars().next() { None => true, // end of string @@ -859,6 +861,8 @@ impl Composer { // and may use it to adjust the colours we give to // add_region_inner. let mut add_region = |start: usize, end: usize, colour: char| { + // Count the actual characters in the region. + let region_chars = self.core.text[start..end].chars().count(); // Determine the total cost of the current region. let cost = match colour { 'u' => self.conf.characters_reserved_per_url, @@ -866,7 +870,7 @@ impl Composer { Some(pos) => pos + 1, // just the part before the @domain None => end - start, // otherwise the whole thing counts } - _ => end - start, + _ => region_chars, }; // Maybe recolour the region as 'out of bounds', or partly so. @@ -879,8 +883,28 @@ impl Composer { } else { // Break the region into an ok and a bad subsection let nbad_chars = nchars + cost - self.conf.max_characters; - let nok_chars = end.saturating_sub(start + nbad_chars); - let midpoint = start + nok_chars; + let nok_chars = region_chars.saturating_sub(nbad_chars); + let nok_bytes = { + let slice = &core.text[start..]; + let mut char_iter = slice.char_indices(); + for _ in 0..nok_chars { + char_iter.next(); + } + let mut pos = char_iter.next() + .map_or_else(|| slice.len(), |(i,_)| i); + + // If we've landed on a Unicode char boundary but + // not on an _editor_ char boundary (i.e. between + // a printing character and a combining one), move + // the position backwards so that the printable + // character is marked as overlong. + while pos > start && !core.is_char_boundary(start + pos) { + pos -= 1; + } + + pos + }; + let midpoint = start + nok_bytes; add_region_inner(start, midpoint, colour); add_region_inner(midpoint, end, '!'); } @@ -1318,6 +1342,37 @@ fn test_regions() { ComposeBufferRegion { start: 44, end: 51-1, colour: ' ' }, ComposeBufferRegion { start: 51-1, end: 91, colour: '!' }, }); + + // Test handling of non-single-byte Unicode characters. Note here + // that ² and ³ take two bytes each in UTF-8 (they live in the ISO + // 8859-1 top half), but all the other superscript digits need + // three bytes (U+2070 onwards). + let unicode_sample_text = "⁰ⁱ²³⁴⁵⁶⁷⁸⁹⁰ⁱ²³⁴⁵⁶⁷⁸⁹⁰ⁱ²³⁴⁵⁶⁷⁸⁹"; + let composer = Composer::test_new(InstanceStatusConfig { + max_characters: 23, + max_media_attachments: 4, + characters_reserved_per_url: 23, + }, unicode_sample_text); + assert_eq!(composer.make_regions(&composer.core), vec! { + // 28 bytes for a full ⁰ⁱ²³⁴⁵⁶⁷⁸⁹, 3+3+2=8 bytes for ⁰ⁱ² + ComposeBufferRegion { start: 0, end: 2*28+8, colour: ' ' }, + ComposeBufferRegion { start: 2*28+8, end: 3*28, colour: '!' }, + }); + + // An even more awkward case, where there's a combining mark at + // the join. + let unicode_sample_text = "Besźel"; + let composer = Composer::test_new(InstanceStatusConfig { + max_characters: 4, + max_media_attachments: 4, + characters_reserved_per_url: 23, + }, unicode_sample_text); + assert_eq!(composer.make_regions(&composer.core), vec! { + // We expect only the "Bes" to be marked as ok, even though + // the 'z' part of "ź" is within bounds in principle + ComposeBufferRegion { start: 0, end: 3, colour: ' ' }, + ComposeBufferRegion { start: 3, end: 8, colour: '!' }, + }); } #[test] -- 2.30.2