From: Simon Tatham Date: Fri, 29 Dec 2023 20:01:04 +0000 (+0000) Subject: Migrate back to html2text. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ian/git?a=commitdiff_plain;h=61fcc610a5e1522e743cf9f245a793e6720dc890;p=mastodonochrome.git Migrate back to html2text. I gave up on using it previously because it lacked some important features - in particular, it didn't support CSS on elements, which is necessary for identifying hashtags and mentions in Mastodon's output HTML. In 0.9.2 that feature is present, so I can try again! Also, 0.9.2 has the neat ability to save the RenderTree that's an intermediate product of the pipeline, just before we depend on the width. So we can keep that after parsing a toot, and then only re-run the necessary back end of the pipeline to re-render at a different width. The amount of code is extremely reduced. --- diff --git a/Cargo.toml b/Cargo.toml index 440b7fc..44f2525 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,7 @@ edition = "2021" [dependencies] chrono = { version = "0.4.31", features = ["serde"] } crossterm = "0.27.0" -html2text = { version = "0.9.0", features = ["css"] } -html5ever = "0.26.0" +html2text = { version = "0.9.2", features = ["css"] } itertools = "0.12.0" ratatui = "0.25.0" regex = "1.10.2" diff --git a/src/html.rs b/src/html.rs index b7639c6..697fd93 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,305 +1,185 @@ -use html5ever::{Attribute, ExpandedName, ParseOpts, parse_document}; -use html5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; -pub use html5ever::QualName; -use html5ever::tendril::{StrTendril, TendrilSink}; -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; - -#[derive(Debug)] -enum TreeNodeContents { - Text { text: StrTendril }, - Element { - name: QualName, - attrs: Vec, - children: Vec, - }, - Boring, -} - -#[derive(Debug)] -struct TreeNode { - contents: TreeNodeContents, - parent: Option, -} +use html2text::{config, Colour}; +pub use 
html2text::RenderTree; +use html2text::render::text_renderer::{ + TextDecorator, TaggedLine, TaggedLineElement +}; -type Handle = usize; +use super::coloured_string::ColouredString; -#[derive(Debug, Default)] -struct Tree { - nodes: Vec, - error: Option, +#[derive(Clone, Debug, Default)] +pub struct OurDecorator { } -pub trait Receiver { - fn start_tag(&mut self, tag: &str, attrs: &HashMap); - fn end_tag(&mut self, tag: &str, attrs: &HashMap); - fn text(&mut self, text: &str); +impl OurDecorator { + pub fn new() -> OurDecorator { + OurDecorator { } + } } -fn qualname_to_string(qn: &QualName) -> String { - if qn.ns == ns!(html) || qn.ns == ns!() { - qn.local.to_string() - } else { - dbg!(&qn); - format!("{}:{}", qn.ns.to_string(), qn.local.to_string()) +impl TextDecorator for OurDecorator { + type Annotation = char; + + /// Return an annotation and rendering prefix for a link. + fn decorate_link_start(&mut self, _url: &str) + -> (String, Self::Annotation) { + ("".to_string(), 'u') } -} -impl Tree { - fn new_node(&mut self, contents: TreeNodeContents) -> Handle { - let handle = self.nodes.len(); - self.nodes.push(TreeNode { - contents, - parent: None, - }); - handle + /// Return a suffix for after a link. + fn decorate_link_end(&mut self) -> String { "".to_string() } + + /// Return an annotation and rendering prefix for em + fn decorate_em_start(&mut self) -> (String, Self::Annotation) { + ("".to_string(), '_') } - fn not2handle(&mut self, not: NodeOrText) -> Handle { - match not { - NodeOrText::AppendNode(h) => h, - NodeOrText::AppendText(s) => self.new_node( - TreeNodeContents::Text { text: s }), - } + /// Return a suffix for after an em. 
+ fn decorate_em_end(&mut self) -> String { "".to_string() } + + /// Return an annotation and rendering prefix for strong + fn decorate_strong_start(&mut self) -> (String, Self::Annotation) { + ("".to_string(), 's') } - fn merge_text_node_into_prev(&mut self, parent: Handle, index2: usize) { - let indices = match self.nodes[parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref children - } => { - if index2 > 0 && index2 < children.len() { - Some((children[index2 - 1], children[index2])) - } else { - None - } - }, - _ => panic!("merge_text_node_into_prev with a bogus parent type"), - }; - - if let Some((child1, child2)) = indices { - let oldtext = match self.nodes[child2].contents { - TreeNodeContents::Text { ref text } => Some(text.clone()), - _ => None, - }; - let merged = match self.nodes[child1].contents { - TreeNodeContents::Text { text: ref mut text1 } => { - match oldtext { - Some(text2) => { text1.push_slice(&text2); true }, - _ => false, - } - }, - _ => false, - }; - if merged { - self.nodes[child2].contents = TreeNodeContents::Boring; - self.nodes[child2].parent = None; - match self.nodes[parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => { - children.remove(index2); - }, - _ => panic!("we already checked this!"), - } - } - } + /// Return a suffix for after a strong. 
+ fn decorate_strong_end(&mut self) -> String { "".to_string() } + + /// Return an annotation and rendering prefix for strikeout + fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) { + ("~".to_string(), ' ') } - fn walk_recurse(&self, node: Handle, receiver: &mut R) { - match &self.nodes[node].contents { - TreeNodeContents::Text { text } => receiver.text(&text), - TreeNodeContents::Element { name, attrs, children } => { - let mut attrmap = HashMap::new(); - for attr in attrs { - attrmap.insert(qualname_to_string(&attr.name), - attr.value.to_string()); - } - let tagname = qualname_to_string(&name); - receiver.start_tag(&tagname, &attrmap); - for child in children { - self.walk_recurse(*child, receiver); - } - receiver.end_tag(&tagname, &attrmap); - }, - _ => (), - }; + /// Return a suffix for after a strikeout. + fn decorate_strikeout_end(&mut self) -> String { "~".to_string() } + + /// Return an annotation and rendering prefix for code + fn decorate_code_start(&mut self) -> (String, Self::Annotation) { + ("".to_string(), 'c') } -} -impl TreeSink for Tree { - type Handle = Handle; - type Output = Self; + /// Return a suffix for after a code. + fn decorate_code_end(&mut self) -> String { "".to_string() } - fn finish(self) -> Self { self } - fn get_document(&mut self) -> Handle { 0 } + /// Return an annotation for the initial part of a preformatted line + fn decorate_preformat_first(&mut self) -> Self::Annotation { 'c' } - fn parse_error(&mut self, msg: Cow<'static, str>) { - self.error = Some(msg.to_string()); - } - - fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> { - match self.nodes[*target].contents { - TreeNodeContents::Element { ref name, .. } => name.expanded(), - _ => panic!("html5ever promised to only call elem_name on Element"), - } + /// Return an annotation for a continuation line when a preformatted + /// line doesn't fit. 
+ fn decorate_preformat_cont(&mut self) -> Self::Annotation { 'c' } + + /// Return an annotation and rendering prefix for a link. + fn decorate_image(&mut self, _src: &str, _title: &str) + -> (String, Self::Annotation) { + ("".to_string(), 'm') } - fn create_element(&mut self, name: QualName, - attrs: Vec, _flags: ElementFlags) - -> Handle { - self.new_node(TreeNodeContents::Element { - name: name, - attrs: attrs, - children: Vec::new(), - }) + /// Return prefix string of header in specific level. + fn header_prefix(&mut self, level: usize) -> String { + "#".repeat(level) + " " } - fn create_comment(&mut self, _text: StrTendril) - -> Handle { self.new_node(TreeNodeContents::Boring) } - fn create_pi(&mut self, _target: StrTendril, - _data: StrTendril) - -> Handle { self.new_node(TreeNodeContents::Boring) } - - fn append(&mut self, parent: &Handle, child: NodeOrText) { - let parent = *parent; - let child = self.not2handle(child); - if parent == child { return } - let check_index = { - let children = match self.nodes[parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => children, - _ => panic!("append with a bogus parent type"), - }; - children.push(child); - children.len() - 1 - }; - self.nodes[child].parent = Some(parent); - self.merge_text_node_into_prev(parent, check_index); + + /// Return prefix string of quoted block. + fn quote_prefix(&mut self) -> String { "> ".to_string() } + + /// Return prefix string of unordered list item. + fn unordered_item_prefix(&mut self) -> String { " - ".to_string() } + + /// Return prefix string of ith ordered list item. + fn ordered_item_prefix(&mut self, i: i64) -> String { + format!(" {}. 
", i) } - fn remove_from_parent(&mut self, target: &Handle) { - let target = *target; - if let Some(parent) = self.nodes[target].parent { - match self.nodes[parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => { - children.retain(|&h| h != target); - }, - _ => (), - } - self.nodes[target].parent = None; - } + + /// Return a new decorator of the same type which can be used + /// for sub blocks. + fn make_subblock_decorator(&self) -> Self { + OurDecorator::new() } - fn append_before_sibling(&mut self, sibling: &Handle, - new_node: NodeOrText) { - let child = self.not2handle(new_node); - let sibling = *sibling; - - self.remove_from_parent(&child); - - // Link to new parent next to sibling - match self.nodes[sibling].parent { - None => panic!( - "html5ever tried to append_before_sibling to the root"), - Some(parent) => { - match self.nodes[parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => { - match children.iter().position(|h| *h == sibling) { - Some(i) => { - children.insert(i, child); - self.nodes[child].parent = Some(parent); - self.merge_text_node_into_prev(parent, i+1); - } - None => panic!("node not a child of its parent"), - } - }, - _ => panic!("node had a wrong parent type"), - } - } + + /// Return an annotation corresponding to adding colour, or none. + fn push_colour(&mut self, col: Colour) -> Option { + match col.r { + 1 => Some('@'), + 4 => Some('#'), + _ => None, } } - fn append_based_on_parent_node(&mut self, element: &Handle, - prev_element: &Handle, - child: NodeOrText) { - if self.nodes[*element].parent.is_some() { - self.append_before_sibling(element, child); - } else { - self.append(prev_element, child); - } + + /// Pop the last colour pushed if we pushed one. 
+ fn pop_colour(&mut self) -> bool { + true } - fn add_attrs_if_missing(&mut self, target: &Handle, - mut attrs: Vec) { - let target = *target; - let target_attrs = match self.nodes[target].contents { - TreeNodeContents::Element { - name: _, ref mut attrs, .. - } => attrs, - _ => panic!("add_attrs_if_missing to a bogus node type"), - }; - - let mut present = HashSet::new(); - for attr in &mut *target_attrs { - present.insert(attr.name.clone()); - } - for attr in attrs.drain(0..) { - if !present.contains(&attr.name) { - present.insert(attr.name.clone()); - target_attrs.push(attr); - } - } + + /// Finish with a document, and return extra lines (eg footnotes) + /// to add to the rendered text. + fn finalise(&mut self, _links: Vec) + -> Vec> { + Vec::new() } - fn reparent_children(&mut self, old_parent: &Handle, - new_parent: &Handle) { - let old_parent = *old_parent; - let new_parent = *new_parent; - - let mut old_children = match self.nodes[old_parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => { - let mut old_children = Vec::new(); - old_children.append(children); - old_children - }, - _ => panic!("reparent_children from a bogus node type"), - }; +} - for i in old_children.iter() { - self.nodes[*i].parent = Some(new_parent); - } +pub fn parse(html: &str) -> Result { + let cfg = config::plain().add_css(r##" +.mention { color: #010203; } +.hashtag { color: #040506; } +"##)?; + let dom = cfg.parse_html(html.as_bytes())?; + cfg.dom_to_render_tree(&dom) +} - match self.nodes[old_parent].contents { - TreeNodeContents::Element { - name: _, attrs: _, ref mut children - } => { - children.append(&mut old_children); - }, - _ => panic!("reparent_children to a bogus node type"), - }; +fn try_render(rt: &RenderTree, wrapwidth: usize, fullwidth: usize) -> + Result>>, html2text::Error> +{ + let cfg = config::with_decorator(OurDecorator::new()) + .max_wrap_width(wrapwidth); + cfg.render_to_lines(rt.clone(), fullwidth) +} + +fn render_tl(rt: 
&RenderTree, width: usize) -> Vec>> { + if let Ok(lines) = try_render(rt, width, width) { + return lines; } - fn append_doctype_to_document(&mut self, _name: StrTendril, - _public_id: StrTendril, - _system_id: StrTendril) {} + let mut wbad = width; + let mut wgood = { + let mut w = width; + loop { + w += w / 2; + if let Ok(_) = try_render(rt, width, w) { + break w; + } + } + }; - fn get_template_contents(&mut self, _target: &Handle) - -> Handle { self.new_node(TreeNodeContents::Boring) } + while wgood - wbad > 1 { + let wmid = wbad + (wgood - wbad) / 2; + if let Ok(_) = try_render(rt, width, wmid) { + wgood = wmid; + } else { + wbad = wmid; + } + } - fn same_node(&self, x: &Handle, y: &Handle) -> bool { x == y } - fn set_quirks_mode(&mut self, _mode: QuirksMode) {} + try_render(rt, width, wgood + wgood / 4) + .expect("Shouldn't be too narrow now") } -fn render_read(mut input: impl std::io::Read) -> Tree { - parse_document(Tree::default(), ParseOpts::default()) - .from_utf8() - .read_from(&mut input) - .unwrap() +fn to_coloured_string(tl: &TaggedLine>) -> ColouredString { + let mut cs = ColouredString::plain(""); + for e in tl.iter() { + match e { + TaggedLineElement::Str(ts) => { + let c: char = match ts.tag.first() { + Some(c) => *c, + None => ' ', + }; + cs.push_str(&ColouredString::uniform(&ts.s, c).slice()); + }, + _ => (), + } + } + cs } -pub fn render(html: &str, receiver: &mut R) { - let tree = render_read(html.as_bytes()); - tree.walk_recurse(0, receiver); +pub fn render(rt: &RenderTree, width: usize) -> Vec { + render_tl(rt, width).iter().map(to_coloured_string).collect() } diff --git a/src/lib.rs b/src/lib.rs index aa2a62d..0b185a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,3 @@ -#[macro_use] -extern crate html5ever; - pub mod types; pub mod auth; pub mod html; diff --git a/src/text.rs b/src/text.rs index 04f8d6b..38777bd 100644 --- a/src/text.rs +++ b/src/text.rs @@ -2,7 +2,7 @@ use chrono::{DateTime, Local, Utc}; #[cfg(test)] use 
chrono::NaiveDateTime; use core::cmp::{min, max}; -use std::collections::{HashMap, BTreeMap, BTreeSet}; +use std::collections::BTreeMap; use unicode_width::UnicodeWidthStr; use super::html; @@ -273,12 +273,16 @@ impl Paragraph { self } - pub fn push_para(&mut self, para: &Paragraph) { + pub fn end_word(&mut self) { if let Some(word) = self.words.last() { if !word.is_space() { self.push_text(&ColouredString::plain(" "), false); } } + } + + pub fn push_para(&mut self, para: &Paragraph) { + self.end_word(); self.words.extend_from_slice(&para.words); } @@ -540,161 +544,48 @@ fn trim_para_list(paras: &mut Vec) { } } -struct HTMLFormatter { - paras: Vec, - colourstack: Vec, - bad_tags: BTreeSet, // so we report more than 1 in consistent order - indent: usize, - pre_tag: usize, +pub enum Html { + Rt(html::RenderTree), + Bad(String), } -impl HTMLFormatter { - fn new() -> Self { - HTMLFormatter { - paras: vec! { Paragraph::new() }, - colourstack: vec! { ' ' }, - bad_tags: BTreeSet::new(), - indent: 0, - pre_tag: 0, +impl Html { + pub fn new(html: &str) -> Self { + match html::parse(html) { + Ok(rt) => Html::Rt(rt), + Err(e) => Html::Bad(e.to_string()), } } - fn finish(mut self) -> Vec { - trim_para_list(&mut self.paras); + pub fn to_para(&self) -> Paragraph { + let mut para = Paragraph::new(); - if !self.bad_tags.is_empty() { - let mut para = Paragraph::new().add( - &ColouredString::uniform("Unsupported markup tags:", '!')); - for tag in self.bad_tags.iter() { - para.push_text(&ColouredString::uniform( - &format!(" <{}>", tag), '!'), false); - } - self.paras.splice(0..0, vec!{ - para, - Paragraph::new(), - }); + // With ordinary wrapping paragraphs it shouldn't matter what + // width we pick here. I pick a nice big one _just_ in case of + // rogue table cells. 
+ for line in self.render(1024) { + para.end_word(); + para.push_text(&line, true); } - self.paras - } - - fn new_para(&self) -> Paragraph { - Paragraph::new().set_indent(self.indent, self.indent) - } - fn last_para(&self) -> &Paragraph { - // self.paras always contains at least one paragraph, so unwrap is OK - self.paras.last().unwrap() - } - fn last_para_mut(&mut self) -> &mut Paragraph { - self.paras.last_mut().unwrap() + para } } -impl html::Receiver for HTMLFormatter { - fn start_tag(&mut self, tag: &str, attrs: &HashMap) { - match tag { - "a" => { - let mut colour = ' '; - if attrs.get("href").is_some() { - colour = 'u'; - } - if let Some(classes) = attrs.get("class") { - if classes.split(' ').any(|x| x == "hashtag") { - colour = '#'; - } else if classes.split(' ').any(|x| x == "mention") { - colour = '@'; - } - } - self.colourstack.push(colour); - }, - "p" => { - if !self.last_para().is_empty() { - self.paras.push(Paragraph::new()); - } - self.paras.push(self.new_para()); - }, - "pre" => { - if !self.last_para().is_empty() { - self.paras.push(Paragraph::new()); - } - self.paras.push(self.new_para()); - self.pre_tag += 1; - self.colourstack.push('c'); - }, - "br" => self.paras.push(self.new_para()), - "blockquote" => { - self.indent += 2; - self.paras.push(self.new_para()); - }, - "code" => self.colourstack.push('c'), - "strong" => self.colourstack.push('s'), - "em" | "i" => self.colourstack.push('_'), - - // do nothing, except don't report these as unknown tags - "span" | "html" | "head" | "body" => (), - - _ => { - self.bad_tags.insert(tag.to_owned()); - }, - } - } - fn end_tag(&mut self, tag: &str, _attrs: &HashMap) { - match tag { - "p" => { - if !self.last_para().is_empty() { - self.paras.push(Paragraph::new()); - } - }, - "pre" => { - self.pre_tag -= 1; - self.colourstack.pop(); - if !self.last_para().is_empty() { - self.paras.push(Paragraph::new()); - } - }, - "blockquote" => { - if !self.last_para().is_empty() { - self.paras.push(Paragraph::new()); - 
} - self.indent -= 2; - self.paras.push(self.new_para()); - }, - "a" | "code" | "strong" | "em" | "i" => { - self.colourstack.pop(); +impl TextFragment for Html { + fn render(&self, width: usize) -> Vec { + match self { + Html::Rt(ref rt) => html::render(rt, width - min(width, 1)), + Html::Bad(e) => vec! { + ColouredString::uniform(e, '!'), }, - _ => (), - } - } - fn text(&mut self, text: &str) { - let colour = *self.colourstack.last().unwrap(); - if self.pre_tag > 0 { - for (i, line) in text.split('\n').enumerate() { - if i > 0 { - self.paras.push(self.new_para()); - } - self.last_para_mut().push_text(&ColouredString::uniform( - line, colour), false); - } - } else { - self.last_para_mut().push_text(&ColouredString::uniform( - text, colour), true); } } } -pub fn parse_html(html: &str) -> Vec { - let mut recv = HTMLFormatter::new(); - html::render(html, &mut recv); - recv.finish() -} - #[cfg(test)] fn render_html(html: &str, width: usize) -> Vec { - parse_html(html) - .into_iter() - .map(|para| para.render(width)) - .flatten() - .collect() + Html::new(html).render(width) } #[test] @@ -758,14 +649,6 @@ fn test_html() { ColouredString::general("URL to https://www.example.com/stuff/.", " uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu "), }); - - assert_eq!(render_html("

Test of some unsupported HTML tags

", 50), - vec! { - ColouredString::general("Unsupported markup tags: ", - "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"), - ColouredString::plain(""), - ColouredString::plain("Test of some unsupported HTML tags"), - }); } pub struct ExtendableIndicator { @@ -834,12 +717,10 @@ pub struct InReplyToLine { } impl InReplyToLine { - pub fn new(post: &Vec) -> Self { + pub fn new(post: Paragraph) -> Self { let mut para = Paragraph::new().add(&ColouredString::plain("Re: ")); let currlen = para.words.len(); - for cpara in post { - para.push_para(cpara); - } + para.push_para(&post); para.delete_mention_words_from(currlen); InReplyToLine { para: para @@ -864,16 +745,10 @@ impl TextFragment for InReplyToLine { #[test] fn test_in_reply_to() { - let post = vec! { - Paragraph::new().add(&ColouredString::general( - "@stoat @weasel take a look at this otter!", - "@@@@@@ @@@@@@@ ")), - Paragraph::new().add(&ColouredString::general( - "@badger might also like it", - "@@@@@@@ ")), - }; + let post = Html::new( + "

@stoat @weasel take a look at this otter!

@badger might also like it

"); - let irt = InReplyToLine::new(&post); + let irt = InReplyToLine::new(post.to_para()); assert_eq!(irt.render(48), vec!{ ColouredString::general( "Re: take a look at this otter! @badger might...", @@ -1495,7 +1370,7 @@ pub struct StatusDisplay { from: UsernameHeader, via: Option, irt: Option, - content: Vec, + content: Html, media: Vec, blank: BlankLine, } @@ -1525,18 +1400,16 @@ impl StatusDisplay { None => None, Some(id) => { let parent_text = match client.status_by_id(id) { - Ok(st) => parse_html(&st.content), - Err(e) => { - vec! { Paragraph::new().add(&ColouredString::plain( - &format!("[unavailable: {}]", e) - )) } - }, + Ok(st) => Html::new(&st.content).to_para(), + Err(e) => Paragraph::new().add(&ColouredString::plain( + &format!("[unavailable: {}]", e) + )), }; - Some(InReplyToLine::new(&parent_text)) + Some(InReplyToLine::new(parent_text)) }, }; - let content = parse_html(&st.content); + let content = Html::new(&st.content); let media = st.media_attachments.iter().map(|m| { let desc_ref = match &m.description { @@ -1575,10 +1448,10 @@ impl TextFragment for StatusDisplay { push_fragment(&mut lines,irt.render(width)); } push_fragment(&mut lines, self.blank.render(width)); - for para in &self.content { - push_fragment(&mut lines, para.render(width)); - } - if self.content.len() > 0 { + let rendered_content = self.content.render(width); + let content_empty = rendered_content.len() == 0; + push_fragment(&mut lines, rendered_content); + if !content_empty { push_fragment(&mut lines, self.blank.render(width)); } for m in &self.media {