From: Simon Tatham Date: Mon, 25 Dec 2023 16:16:11 +0000 (+0000) Subject: First-draft HTML parser X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ian/git?a=commitdiff_plain;h=5d1a8dedc0e164d98906271a020febc71418dc73;p=mastodonochrome.git First-draft HTML parser --- diff --git a/src/text.rs b/src/text.rs index 6cbcf3e..6164b5b 100644 --- a/src/text.rs +++ b/src/text.rs @@ -1,6 +1,8 @@ use chrono::{DateTime,Utc,Local}; use core::cmp::max; +use std::collections::{HashMap, HashSet}; +use super::html; use super::coloured_string::{ColouredString, ColouredStringSlice}; pub trait TextFragment { @@ -227,16 +229,23 @@ impl Paragraph { self } - pub fn add(mut self, text: &ColouredString) -> Self { + pub fn push_text(&mut self, text: &ColouredString, squash_spaces: bool) { for ch in text.chars() { if let Some(curr_word) = self.words.last_mut() { - if ch.is_space() == curr_word.is_space() { - curr_word.push_str(&ch); + let is_space = ch.is_space(); + if is_space == curr_word.is_space() { + if !(is_space && squash_spaces) { + curr_word.push_str(&ch); + } continue; } } self.words.push(ch.to_owned()); } + } + + pub fn add(mut self, text: &ColouredString) -> Self { + self.push_text(text, false); self } @@ -255,6 +264,13 @@ impl Paragraph { } pub fn into_box(self) -> Box { Box::new(self) } + + pub fn is_empty(&self) -> bool { + match self.words.first() { + None => true, + Some(word) => word.nchars() == 0, + } + } } #[test] @@ -488,6 +504,216 @@ fn test_fileheader() { }); } +struct HTMLFormatter { + paras: Vec, + colourstack: Vec, + bad_tags: HashSet, + indent: usize, + pre_tag: usize, +} + +impl HTMLFormatter { + fn new() -> Self { + HTMLFormatter { + paras: vec! { Paragraph::new() }, + colourstack: vec! { ' ' }, + bad_tags: HashSet::new(), + indent: 0, + pre_tag: 0, + } + } + + fn finish(mut self) -> Vec { + let first_nonempty = match self.paras.iter().enumerate() + .find(|(_, p)| !p.is_empty()) { + Some((i, _)) => i, + None => self.paras.len(), + }; + self.paras.splice(..first_nonempty, vec![]); + + while match self.paras.last() { + Some(p) => p.is_empty(), + None => false + } { + self.paras.pop(); + } + + self.paras + } + + fn new_para(&self) -> Paragraph { + Paragraph::new().set_indent(self.indent, self.indent) + } + fn last_para(&self) -> &Paragraph { + // self.paras always contains at least one paragraph, so unwrap is OK + self.paras.last().unwrap() + } + fn last_para_mut(&mut self) -> &mut Paragraph { + self.paras.last_mut().unwrap() + } +} + +impl html::Receiver for HTMLFormatter { + fn start_tag(&mut self, tag: &str, attrs: &HashMap) { + if tag == "a" { + let mut colour = ' '; + if attrs.get("href").is_some() { + colour = 'u'; + } + if let Some(classes) = attrs.get("class") { + if classes.split(' ').any(|x| x == "hashtag") { + colour = '#'; + } else if classes.split(' ').any(|x| x == "mention") { + colour = '@'; + } + } + self.colourstack.push(colour); + } else if tag == "p" { + if !self.last_para().is_empty() { + self.paras.push(Paragraph::new()); + } + self.paras.push(self.new_para()); + } else if tag == "pre" { + if !self.last_para().is_empty() { + self.paras.push(Paragraph::new()); + } + self.paras.push(self.new_para()); + self.pre_tag += 1; + self.colourstack.push('c'); + } else if tag == "br" { + self.paras.push(self.new_para()); + } else if tag == "blockquote" { + self.indent += 2; + self.paras.push(self.new_para()); + } else if tag == "code" { + self.colourstack.push('c'); + } else if tag == "strong" { + self.colourstack.push('s'); + } else if tag == "em" || tag == "i" { + self.colourstack.push('_'); + } else if tag == "span" { + // do nothing, except don't report this as an unknown tag + } else { + self.bad_tags.insert(tag.to_string()); + } + } + fn end_tag(&mut self, tag: &str, _attrs: &HashMap) { + if tag == "a" || tag == "code" || tag == "strong" || tag == "em" || + tag == "i" { + self.colourstack.pop(); + } else if tag == "p" { + if !self.last_para().is_empty() { + self.paras.push(Paragraph::new()); + } + } else if tag == "pre" { + self.pre_tag -= 1; + self.colourstack.pop(); + if !self.last_para().is_empty() { + self.paras.push(Paragraph::new()); + } + } else if tag == "blockquote" { + if !self.last_para().is_empty() { + self.paras.push(Paragraph::new()); + } + self.indent -= 2; + self.paras.push(self.new_para()); + } + } + fn text(&mut self, text: &str) { + let colour = *self.colourstack.last().unwrap(); + if self.pre_tag > 0 { + for (i, line) in text.split('\n').enumerate() { + if i > 0 { + self.paras.push(self.new_para()); + } + self.last_para_mut().push_text(&ColouredString::uniform( + line, colour), false); + } + } else { + self.last_para_mut().push_text(&ColouredString::uniform( + text, colour), true); + } + } +} + +pub fn parse_html(html: &str) -> Vec { + let mut recv = HTMLFormatter::new(); + html::render(html, &mut recv); + recv.finish() +} + +#[cfg(test)] +fn render_html(html: &str, width: usize) -> Vec { + parse_html(html) + .into_iter() + .map(|para| para.render(width)) + .flatten() + .collect() +} + +#[test] +fn test_html() { + assert_eq!(render_html("

Testing, testing, 1, 2, 3

", 50), + vec! { + ColouredString::plain("Testing, testing, 1, 2, 3"), + }); + + assert_eq!(render_html("

First para

Second para

", 50), + vec! { + ColouredString::plain("First para"), + ColouredString::plain(""), + ColouredString::plain("Second para"), + }); + + assert_eq!(render_html("

First line
Second line

", 50), + vec! { + ColouredString::plain("First line"), + ColouredString::plain("Second line"), + }); + + assert_eq!(render_html("

Pease porridge hot, pease porridge cold, pease porridge in the pot, nine days old

", 50), + vec! { + ColouredString::plain("Pease porridge hot, pease porridge cold, pease"), + ColouredString::plain("porridge in the pot, nine days old"), + }); + + assert_eq!(render_html("

Test of some literal code

", 50), + vec! { + ColouredString::general("Test of some literal code", + " cccccccccccc"), + }); + + assert_eq!(render_html("

Test of some strong text

", 50), + vec! { + ColouredString::general("Test of some strong text", + " sssssssssss"), + }); + + assert_eq!(render_html("

Test of a #hashtag

", 50), + vec! { + ColouredString::general("Test of a #hashtag", + " ########"), + }); + + assert_eq!(render_html("

Test of a @username

", 50), + vec! { + ColouredString::general("Test of a @username", + " @@@@@@@@@"), + }); + + assert_eq!(render_html("

Test of a @username

", 50), + vec! { + ColouredString::general("Test of a @username", + " @@@@@@@@@"), + }); + + assert_eq!(render_html("

URL to https://www.example.com/stuff/.

", 50), + vec! { + ColouredString::general("URL to https://www.example.com/stuff/.", + " uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu "), + }); +} + // TODO: // ExtendableIndicator at file header (with an 'active' flag) // InReplyToLine, with first line of included paragraph @@ -496,4 +722,3 @@ fn test_fileheader() { // Media // FileStatusLine, with priorities // MenuKeypressLine -// And, of course, the HTML parsing