Fix toot length tracking with nontrivial Unicode.

author Simon Tatham <anakin@pobox.com>

Fri, 12 Jan 2024 12:43:53 +0000 (12:43 +0000)

committer Simon Tatham <anakin@pobox.com>

Fri, 12 Jan 2024 12:52:04 +0000 (12:52 +0000)
author Simon Tatham <anakin@pobox.com>
Fri, 12 Jan 2024 12:43:53 +0000 (12:43 +0000)
committer Simon Tatham <anakin@pobox.com>
Fri, 12 Jan 2024 12:52:04 +0000 (12:52 +0000)
diff --git a/src/editor.rs b/src/editor.rs

index 1ed6e0442fe039602f5baf5ec0587676a6f64de4..395cade64a022db3455c8ecfd6e2a917e0d66b22 100644 (file)
--- a/src/editor.rs
+++ b/src/editor.rs
@@ -24,6 +24,8 @@ impl EditorCore {
      fn is_char_boundary(&self, pos: usize) -> bool {
          if !self.text.is_char_boundary(pos) {
              false
+        } else if pos == 0 {
+            true // beginning of string
          } else {
              match self.text[pos..].chars().next() {
                  None => true, // end of string
@@ -859,6 +861,8 @@ impl Composer {
          // and may use it to adjust the colours we give to
          // add_region_inner.
          let mut add_region = |start: usize, end: usize, colour: char| {
+            // Count the actual characters in the region.
+            let region_chars = self.core.text[start..end].chars().count();
              // Determine the total cost of the current region.
              let cost = match colour {
                  'u' => self.conf.characters_reserved_per_url,
@@ -866,7 +870,7 @@ impl Composer {
                      Some(pos) => pos + 1, // just the part before the @domain
                      None => end - start, // otherwise the whole thing counts
                  }
-                _ => end - start,
+                _ => region_chars,
              };
  
              // Maybe recolour the region as 'out of bounds', or partly so.
@@ -879,8 +883,28 @@ impl Composer {
              } else {
                  // Break the region into an ok and a bad subsection
                  let nbad_chars = nchars + cost - self.conf.max_characters;
-                let nok_chars = end.saturating_sub(start + nbad_chars);
-                let midpoint = start + nok_chars;
+                let nok_chars = region_chars.saturating_sub(nbad_chars);
+                let nok_bytes = {
+                    let slice = &core.text[start..];
+                    let mut char_iter = slice.char_indices();
+                    for _ in 0..nok_chars {
+                        char_iter.next();
+                    }
+                    let mut pos = char_iter.next()
+                        .map_or_else(|| slice.len(), |(i,_)| i);
+
+                    // If we've landed on a Unicode char boundary but
+                    // not on an _editor_ char boundary (i.e. between
+                    // a printing character and a combining one), move
+                    // the position backwards so that the printable
+                    // character is marked as overlong.
+                    while pos > start && !core.is_char_boundary(start + pos) {
+                        pos -= 1;
+                    }
+
+                    pos
+                };
+                let midpoint = start + nok_bytes;
                  add_region_inner(start, midpoint, colour);
                  add_region_inner(midpoint, end, '!');
              }
@@ -1318,6 +1342,37 @@ fn test_regions() {
          ComposeBufferRegion { start: 44, end: 51-1, colour: ' ' },
          ComposeBufferRegion { start: 51-1, end: 91, colour: '!' },
      });
+
+    // Test handling of non-single-byte Unicode characters. Note here
+    // that ² and ³ take two bytes each in UTF-8 (they live in the ISO
+    // 8859-1 top half), but all the other superscript digits need
+    // three bytes (U+2070 onwards).
+    let unicode_sample_text = "⁰ⁱ²³⁴⁵⁶⁷⁸⁹⁰ⁱ²³⁴⁵⁶⁷⁸⁹⁰ⁱ²³⁴⁵⁶⁷⁸⁹";
+    let composer = Composer::test_new(InstanceStatusConfig {
+        max_characters: 23,
+        max_media_attachments: 4,
+        characters_reserved_per_url: 23,
+    }, unicode_sample_text);
+    assert_eq!(composer.make_regions(&composer.core), vec! {
+        // 28 bytes for a full ⁰ⁱ²³⁴⁵⁶⁷⁸⁹, 3+3+2=8 bytes for ⁰ⁱ²
+        ComposeBufferRegion { start: 0, end: 2*28+8, colour: ' ' },
+        ComposeBufferRegion { start: 2*28+8, end: 3*28, colour: '!' },
+    });
+
+    // An even more awkward case, where there's a combining mark at
+    // the join.
+    let unicode_sample_text = "Besźel";
+    let composer = Composer::test_new(InstanceStatusConfig {
+        max_characters: 4,
+        max_media_attachments: 4,
+        characters_reserved_per_url: 23,
+    }, unicode_sample_text);
+    assert_eq!(composer.make_regions(&composer.core), vec! {
+        // We expect only the "Bes" to be marked as ok, even though
+        // the 'z' part of "ź" is within bounds in principle
+        ComposeBufferRegion { start: 0, end: 3, colour: ' ' },
+        ComposeBufferRegion { start: 3, end: 8, colour: '!' },
+    });
  }
  
  #[test]
author	Simon Tatham <anakin@pobox.com>
	Fri, 12 Jan 2024 12:43:53 +0000 (12:43 +0000)
committer	Simon Tatham <anakin@pobox.com>
	Fri, 12 Jan 2024 12:52:04 +0000 (12:52 +0000)