From: Simon Tatham Date: Sat, 23 Dec 2023 18:26:56 +0000 (+0000) Subject: Abandon html2text and try html5ever. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ian/git?a=commitdiff_plain;h=2cd9c0c91939c7f763db6a9ba0a7003a4947a9e9;p=mastodonochrome.git Abandon html2text and try html5ever. I _think_ this gets me a tree structure containing the HTML DOM. Now I have to do the same formatting and wrapping that I'm doing in the Python version, but that really should just be a matter of conversion. --- diff --git a/Cargo.toml b/Cargo.toml index 7d7387d..9877f9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" chrono = { version = "0.4.31", features = ["serde"] } crossterm = "0.27.0" html2text = { version = "0.9.0", features = ["css"] } +html5ever = "0.26.0" ratatui = "0.25.0" regex = "1.10.2" reqwest = { version = "0.11.23", features = ["blocking"] } diff --git a/src/html.rs b/src/html.rs index 11b6a1b..c663ea3 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,124 +1,270 @@ -use html2text::{config, Colour}; -use html2text::render::text_renderer::{TextDecorator, TaggedLine}; +use html5ever::{Attribute, ExpandedName, ParseOpts, parse_document}; +use html5ever::interface::{ElementFlags, NodeOrText, QualName, QuirksMode, + TreeSink}; +use html5ever::tendril::{StrTendril, TendrilSink}; +use std::borrow::Cow; +use std::collections::HashSet; -#[derive(Clone, Debug, Default)] -pub struct OurDecorator { +#[derive(Debug)] +enum TreeNodeContents { + Text { text: StrTendril }, + Element { + name: QualName, + attrs: Vec, + children: Vec, + }, + Boring, } -impl OurDecorator { - pub fn new() -> OurDecorator { - OurDecorator { } - } +#[derive(Debug)] +struct TreeNode { + contents: TreeNodeContents, + parent: Option, } -impl TextDecorator for OurDecorator { - type Annotation = char; - - /// Return an annotation and rendering prefix for a link. - fn decorate_link_start(&mut self, _url: &str) - -> (String, Self::Annotation) { - ("".to_string(), 'U') - } +type Handle = usize; - /// Return a suffix for after a link. - fn decorate_link_end(&mut self) -> String { "".to_string() } +#[derive(Debug, Default)] +struct Tree { + nodes: Vec, + error: Option, +} - /// Return an annotation and rendering prefix for em - fn decorate_em_start(&mut self) -> (String, Self::Annotation) { - ("".to_string(), '_') +impl Tree { + fn new_node(&mut self, contents: TreeNodeContents) -> Handle { + let handle = self.nodes.len(); + self.nodes.push(TreeNode { + contents, + parent: None, + }); + handle } - /// Return a suffix for after an em. - fn decorate_em_end(&mut self) -> String { "".to_string() } - - /// Return an annotation and rendering prefix for strong - fn decorate_strong_start(&mut self) -> (String, Self::Annotation) { - ("".to_string(), 's') + fn not2handle(&mut self, not: NodeOrText) -> Handle { + match not { + NodeOrText::AppendNode(h) => h, + NodeOrText::AppendText(s) => self.new_node( + TreeNodeContents::Text { text: s }), + } } - /// Return a suffix for after a strong. - fn decorate_strong_end(&mut self) -> String { "".to_string() } + fn merge_text_node_into_prev(&mut self, parent: Handle, index2: usize) { + let indices = match self.nodes[parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref children + } => { + if index2 > 0 && index2 < children.len() { + Some((children[index2 - 1], children[index2])) + } else { + None + } + }, + _ => panic!("merge_text_node_into_prev with a bogus parent type"), + }; - /// Return an annotation and rendering prefix for strikeout - fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) { - ("~".to_string(), ' ') + if let Some((child1, child2)) = indices { + let oldtext = match self.nodes[child2].contents { + TreeNodeContents::Text { ref text } => Some(text.clone()), + _ => None, + }; + let merged = match self.nodes[child1].contents { + TreeNodeContents::Text { text: ref mut text1 } => { + match oldtext { + Some(text2) => { text1.push_slice(&text2); true }, + _ => false, + } + }, + _ => false, + }; + if merged { + self.nodes[child2].contents = TreeNodeContents::Boring; + self.nodes[child2].parent = None; + match self.nodes[parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => { + children.remove(index2); + }, + _ => panic!("we already checked this!"), + } + } + } } +} + +impl TreeSink for Tree { + type Handle = Handle; + type Output = Self; - /// Return a suffix for after a strikeout. - fn decorate_strikeout_end(&mut self) -> String { "~".to_string() } + fn finish(self) -> Self { self } + fn get_document(&mut self) -> Handle { 0 } - /// Return an annotation and rendering prefix for code - fn decorate_code_start(&mut self) -> (String, Self::Annotation) { - ("".to_string(), 'c') + fn parse_error(&mut self, msg: Cow<'static, str>) { + self.error = Some(msg.to_string()); + } + + fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> { + match self.nodes[*target].contents { + TreeNodeContents::Element { ref name, .. } => name.expanded(), + _ => panic!("html5ever promised to only call elem_name on Element"), + } } - /// Return a suffix for after a code. - fn decorate_code_end(&mut self) -> String { "".to_string() } + fn create_element(&mut self, name: QualName, + attrs: Vec, _flags: ElementFlags) + -> Handle { + self.new_node(TreeNodeContents::Element { + name: name, + attrs: attrs, + children: Vec::new(), + }) + } + fn create_comment(&mut self, _text: StrTendril) + -> Handle { self.new_node(TreeNodeContents::Boring) } + fn create_pi(&mut self, _target: StrTendril, + _data: StrTendril) + -> Handle { self.new_node(TreeNodeContents::Boring) } - /// Return an annotation for the initial part of a preformatted line - fn decorate_preformat_first(&mut self) -> Self::Annotation { 'c' } + fn append(&mut self, parent: &Handle, child: NodeOrText) { + let parent = *parent; + let child = self.not2handle(child); + let check_index = { + let children = match self.nodes[parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => children, + _ => panic!("append with a bogus parent type"), + }; + children.push(child); + children.len() - 1 + }; + self.nodes[child].parent = Some(parent); + self.merge_text_node_into_prev(parent, check_index); + } + fn remove_from_parent(&mut self, target: &Handle) { + let target = *target; + if let Some(parent) = self.nodes[target].parent { + match self.nodes[parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => { + children.retain(|&h| h != target); + }, + _ => (), + } + self.nodes[target].parent = None; + } + } + fn append_before_sibling(&mut self, sibling: &Handle, + new_node: NodeOrText) { + let child = self.not2handle(new_node); + let sibling = *sibling; - /// Return an annotation for a continuation line when a preformatted - /// line doesn't fit. - fn decorate_preformat_cont(&mut self) -> Self::Annotation { 'c' } + self.remove_from_parent(&child); - /// Return an annotation and rendering prefix for a link. - fn decorate_image(&mut self, _src: &str, _title: &str) - -> (String, Self::Annotation) { - ("".to_string(), 'm') + // Link to new parent next to sibling + match self.nodes[sibling].parent { + None => panic!( + "html5ever tried to append_before_sibling to the root"), + Some(parent) => { + match self.nodes[parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => { + match children.iter().position(|h| *h == sibling) { + Some(i) => { + children.insert(i, child); + self.nodes[child].parent = Some(parent); + self.merge_text_node_into_prev(parent, i+1); + } + None => panic!("node not a child of its parent"), + } + }, + _ => panic!("node had a wrong parent type"), + } + } + } + } + fn append_based_on_parent_node(&mut self, element: &Handle, + prev_element: &Handle, + child: NodeOrText) { + if self.nodes[*element].parent.is_some() { + self.append_before_sibling(element, child); + } else { + self.append(prev_element, child); + } } + fn add_attrs_if_missing(&mut self, target: &Handle, + mut attrs: Vec) { + let target = *target; + let target_attrs = match self.nodes[target].contents { + TreeNodeContents::Element { + name: _, ref mut attrs, .. + } => attrs, + _ => panic!("add_attrs_if_missing to a bogus node type"), + }; - /// Return prefix string of header in specific level. - fn header_prefix(&mut self, level: usize) -> String { - "#".repeat(level) + " " + let mut present = HashSet::new(); + for attr in &mut *target_attrs { + present.insert(attr.name.clone()); + } + for attr in attrs.drain(0..) { + if !present.contains(&attr.name) { + present.insert(attr.name.clone()); + target_attrs.push(attr); + } + } } + fn reparent_children(&mut self, old_parent: &Handle, + new_parent: &Handle) { + let old_parent = *old_parent; + let new_parent = *new_parent; - /// Return prefix string of quoted block. - fn quote_prefix(&mut self) -> String { "> ".to_string() } + let mut old_children = match self.nodes[old_parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => { + let mut old_children = Vec::new(); + old_children.append(children); + old_children + }, + _ => panic!("reparent_children from a bogus node type"), + }; - /// Return prefix string of unordered list item. - fn unordered_item_prefix(&mut self) -> String { " - ".to_string() } + for i in old_children.iter() { + self.nodes[*i].parent = Some(new_parent); + } - /// Return prefix string of ith ordered list item. - fn ordered_item_prefix(&mut self, i: i64) -> String { - format!(" {}. ", i) + match self.nodes[old_parent].contents { + TreeNodeContents::Element { + name: _, attrs: _, ref mut children + } => { + children.append(&mut old_children); + }, + _ => panic!("reparent_children to a bogus node type"), + }; } - /// Return a new decorator of the same type which can be used - /// for sub blocks. - fn make_subblock_decorator(&self) -> Self { - OurDecorator::new() - } + fn append_doctype_to_document(&mut self, _name: StrTendril, + _public_id: StrTendril, + _system_id: StrTendril) {} - /// Return an annotation corresponding to adding colour, or none. - fn push_colour(&mut self, col: Colour) -> Option { - dbg!(col); - match col.r { - 1 => Some('@'), - 4 => Some('#'), - _ => None, - } - } + fn get_template_contents(&mut self, _target: &Handle) + -> Handle { self.new_node(TreeNodeContents::Boring) } - /// Pop the last colour pushed if we pushed one. - fn pop_colour(&mut self) -> bool { - true - } + fn same_node(&self, x: &Handle, y: &Handle) -> bool { x == y } + fn set_quirks_mode(&mut self, _mode: QuirksMode) {} +} - /// Finish with a document, and return extra lines (eg footnotes) - /// to add to the rendered text. - fn finalise(&mut self, _links: Vec) - -> Vec> { - Vec::new() - } +pub fn render_read(mut input: impl std::io::Read) { + let root = parse_document(Tree::default(), ParseOpts::default()) + .from_utf8() + .read_from(&mut input) + .unwrap(); + + dbg!(root); } -pub fn render(html: &str) - -> Result>>, html2text::Error> { - config::with_decorator(OurDecorator::new()) - .add_css(r##" -.mention { color: #010203; } -.hashtag { color: #040506; } -"##)? - .lines_from_read(html.as_bytes(), 80) +pub fn render(html: &str) { + render_read(html.as_bytes()) } diff --git a/src/main.rs b/src/main.rs index 04d66cb..d49d314 100644 --- a/src/main.rs +++ b/src/main.rs @@ -118,6 +118,6 @@ fn main() -> std::io::Result<()> { "https://hachyderm.io/api/v1/statuses/111602135142646031") .send().unwrap().text().unwrap(); let st: Status = serde_json::from_str(&body).unwrap(); - dbg!(render(&st.content).unwrap()); + render(&st.content); Ok(()) }