chiark / gitweb /
First-draft HTML parser
author Simon Tatham <anakin@pobox.com>
Mon, 25 Dec 2023 16:16:11 +0000 (16:16 +0000)
committer Simon Tatham <anakin@pobox.com>
Mon, 25 Dec 2023 18:41:53 +0000 (18:41 +0000)
src/text.rs

index 6cbcf3ed1c608093bbc5d92e7fd8563a2daa0335..6164b5bef833f1f6970b95e210a4468a360a6ed0 100644 (file)
@@ -1,6 +1,8 @@
 use chrono::{DateTime,Utc,Local};
 use core::cmp::max;
+use std::collections::{HashMap, HashSet};
 
+use super::html;
 use super::coloured_string::{ColouredString, ColouredStringSlice};
 
 pub trait TextFragment {
@@ -227,16 +229,23 @@ impl Paragraph {
         self
     }
 
-    pub fn add(mut self, text: &ColouredString) -> Self {
+    pub fn push_text(&mut self, text: &ColouredString, squash_spaces: bool) { // Appends `text` one char at a time, extending the last word or starting a new one.
         for ch in text.chars() {
             if let Some(curr_word) = self.words.last_mut() {
-                if ch.is_space() == curr_word.is_space() {
-                    curr_word.push_str(&ch);
+                let is_space = ch.is_space();
+                if is_space == curr_word.is_space() { // char is the same kind (space vs non-space) as the last word
+                    if !(is_space && squash_spaces) { // when squashing, a space is not appended to a word that is already space
+                        curr_word.push_str(&ch);
+                    }
                     continue;
                 }
             }
             self.words.push(ch.to_owned());
         }
+    }
+
+    pub fn add(mut self, text: &ColouredString) -> Self { // Builder form of push_text: appends without squashing spaces, then returns self.
+        self.push_text(text, false);
         self
     }
 
@@ -255,6 +264,13 @@ impl Paragraph {
     }
 
     pub fn into_box(self) -> Box<dyn TextFragment> { Box::new(self) }
+
+    pub fn is_empty(&self) -> bool {
+        match self.words.first() {
+            None => true,
+            Some(word) => word.nchars() == 0,
+        }
+    }
 }
 
 #[test]
@@ -488,6 +504,216 @@ fn test_fileheader() {
         });
 }
 
+struct HTMLFormatter {
+    paras: Vec<Paragraph>,
+    colourstack: Vec<char>,
+    bad_tags: HashSet<String>,
+    indent: usize,
+    pre_tag: usize,
+}
+
+impl HTMLFormatter {
+    fn new() -> Self {
+        HTMLFormatter {
+            paras: vec! { Paragraph::new() },
+            colourstack: vec! { ' ' },
+            bad_tags: HashSet::new(),
+            indent: 0,
+            pre_tag: 0,
+        }
+    }
+
+    fn finish(mut self) -> Vec<Paragraph> {
+        let first_nonempty = match self.paras.iter().enumerate()
+            .find(|(_, p)| !p.is_empty()) {
+            Some((i, _)) => i,
+            None => self.paras.len(),
+        };
+        self.paras.splice(..first_nonempty, vec![]);
+
+        while match self.paras.last() {
+            Some(p) => p.is_empty(),
+            None => false
+        } {
+            self.paras.pop();
+        }
+
+        self.paras
+    }
+
+    fn new_para(&self) -> Paragraph {
+        Paragraph::new().set_indent(self.indent, self.indent)
+    }
+    fn last_para(&self) -> &Paragraph {
+        // self.paras always contains at least one paragraph, so unwrap is OK
+        self.paras.last().unwrap()
+    }
+    fn last_para_mut(&mut self) -> &mut Paragraph {
+        self.paras.last_mut().unwrap()
+    }
+}
+
+impl html::Receiver for HTMLFormatter {
+    fn start_tag(&mut self, tag: &str, attrs: &HashMap<String, String>) {
+        if tag == "a" {
+            let mut colour = ' ';
+            if attrs.get("href").is_some() {
+                colour = 'u';
+            }
+            if let Some(classes) = attrs.get("class") {
+                if classes.split(' ').any(|x| x == "hashtag") {
+                    colour = '#';
+                } else if classes.split(' ').any(|x| x == "mention") {
+                    colour = '@';
+                }
+            }
+            self.colourstack.push(colour);
+        } else if tag == "p" {
+            if !self.last_para().is_empty() {
+                self.paras.push(Paragraph::new());
+            }
+            self.paras.push(self.new_para());
+        } else if tag == "pre" {
+            if !self.last_para().is_empty() {
+                self.paras.push(Paragraph::new());
+            }
+            self.paras.push(self.new_para());
+            self.pre_tag += 1;
+            self.colourstack.push('c');
+        } else if tag == "br" {
+            self.paras.push(self.new_para());
+        } else if tag == "blockquote" {
+            self.indent += 2;
+            self.paras.push(self.new_para());
+        } else if tag == "code" {
+            self.colourstack.push('c');
+        } else if tag == "strong" {
+            self.colourstack.push('s');
+        } else if tag == "em" || tag == "i" {
+            self.colourstack.push('_');
+        } else if tag == "span" {
+            // do nothing, except don't report this as an unknown tag
+        } else {
+            self.bad_tags.insert(tag.to_string());
+        }
+    }
+    fn end_tag(&mut self, tag: &str, _attrs: &HashMap<String, String>) {
+        if tag == "a" || tag == "code" || tag == "strong" || tag == "em" ||
+            tag == "i" {
+            self.colourstack.pop();
+        } else if tag == "p" {
+            if !self.last_para().is_empty() {
+                self.paras.push(Paragraph::new());
+            }
+        } else if tag == "pre" {
+            self.pre_tag -= 1;
+            self.colourstack.pop();
+            if !self.last_para().is_empty() {
+                self.paras.push(Paragraph::new());
+            }
+        } else if tag == "blockquote" {
+            if !self.last_para().is_empty() {
+                self.paras.push(Paragraph::new());
+            }
+            self.indent -= 2;
+            self.paras.push(self.new_para());
+        }
+    }
+    fn text(&mut self, text: &str) {
+        let colour = *self.colourstack.last().unwrap();
+        if self.pre_tag > 0 {
+            for (i, line) in text.split('\n').enumerate() {
+                if i > 0 {
+                    self.paras.push(self.new_para());
+                }
+                self.last_para_mut().push_text(&ColouredString::uniform(
+                        line, colour), false);
+            }
+        } else {
+            self.last_para_mut().push_text(&ColouredString::uniform(
+                    text, colour), true);
+        }
+    }
+}
+
+pub fn parse_html(html: &str) -> Vec<Paragraph> {
+    let mut recv = HTMLFormatter::new();
+    html::render(html, &mut recv);
+    recv.finish()
+}
+
+#[cfg(test)]
+fn render_html(html: &str, width: usize) -> Vec<ColouredString> {
+    parse_html(html)
+        .into_iter()
+        .map(|para| para.render(width))
+        .flatten()
+        .collect()
+}
+
+#[test]
+fn test_html() {
+    assert_eq!(render_html("<p>Testing, testing, 1, 2, 3</p>", 50),
+               vec! {
+            ColouredString::plain("Testing, testing, 1, 2, 3"),
+        });
+
+    assert_eq!(render_html("<p>First para</p><p>Second para</p>", 50),
+               vec! {
+            ColouredString::plain("First para"),
+            ColouredString::plain(""),
+            ColouredString::plain("Second para"),
+        });
+
+    assert_eq!(render_html("<p>First line<br>Second line</p>", 50),
+               vec! {
+            ColouredString::plain("First line"),
+            ColouredString::plain("Second line"),
+        });
+
+    assert_eq!(render_html("<p>Pease porridge hot, pease porridge cold, pease porridge in the pot, nine days old</p>", 50),
+               vec! {
+            ColouredString::plain("Pease porridge hot, pease porridge cold, pease"),
+            ColouredString::plain("porridge in the pot, nine days old"),
+        });
+
+    assert_eq!(render_html("<p>Test of some <code>literal code</code></p>", 50),
+               vec! {
+            ColouredString::general("Test of some literal code",
+                                    "             cccccccccccc"),
+        });
+
+    assert_eq!(render_html("<p>Test of some <strong>strong text</strong></p>", 50),
+               vec! {
+            ColouredString::general("Test of some strong text",
+                                    "             sssssssssss"),
+        });
+
+    assert_eq!(render_html("<p>Test of a <a href=\"https://some.instance/tags/hashtag\" class=\"mention hashtag\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">#<span>hashtag</span></a></p>", 50),
+               vec! {
+            ColouredString::general("Test of a #hashtag",
+                                    "          ########"),
+        });
+
+    assert_eq!(render_html("<p>Test of a <span class=\"h-card\" translate=\"no\"><a href=\"https://some.instance/@username\" class=\"u-url mention\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">@<span>username</span></a></span></p>", 50),
+               vec! {
+            ColouredString::general("Test of a @username",
+                                    "          @@@@@@@@@"),
+        });
+
+    assert_eq!(render_html("<p>Test of a <span class=\"h-card\" translate=\"no\"><a href=\"https://some.instance/@username\" class=\"u-url mention\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">@<span>username</span></a></span></p>", 50),
+               vec! {
+            ColouredString::general("Test of a @username",
+                                    "          @@@@@@@@@"),
+        });
+
+    assert_eq!(render_html("<p>URL to <a href=\"https://www.example.com/stuff/\" target=\"_blank\" rel=\"nofollow noopener noreferrer\" translate=\"no\"><span class=\"invisible\">https://www.</span><span class=\"ellipsis\">example.com/st</span><span class=\"invisible\">uff/</span></a>.</p>", 50),
+               vec! {
+            ColouredString::general("URL to https://www.example.com/stuff/.",
+                                    "       uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu "),
+        });
+}
+
 // TODO:
 // ExtendableIndicator at file header (with an 'active' flag)
 // InReplyToLine, with first line of included paragraph
@@ -496,4 +722,3 @@ fn test_fileheader() {
 // Media
 // FileStatusLine, with priorities
 // MenuKeypressLine
-// And, of course, the HTML parsing