-use html5ever::{Attribute, ExpandedName, ParseOpts, parse_document};
-use html5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
-pub use html5ever::QualName;
-use html5ever::tendril::{StrTendril, TendrilSink};
-use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
-
-#[derive(Debug)]
-enum TreeNodeContents {
- Text { text: StrTendril },
- Element {
- name: QualName,
- attrs: Vec<Attribute>,
- children: Vec<Handle>,
- },
- Boring,
-}
-
-#[derive(Debug)]
-struct TreeNode {
- contents: TreeNodeContents,
- parent: Option<Handle>,
-}
+use html2text::{config, Colour};
+pub use html2text::RenderTree;
+use html2text::render::text_renderer::{
+ TextDecorator, TaggedLine, TaggedLineElement
+};
-type Handle = usize;
+use super::coloured_string::ColouredString;
-#[derive(Debug, Default)]
-struct Tree {
- nodes: Vec<TreeNode>,
- error: Option<String>,
+/// Field-less decorator used when rendering HTML through html2text; its
+/// `TextDecorator` impl tags text with single-character annotations.
+#[derive(Debug, Default, Clone)]
+pub struct OurDecorator {
}
-pub trait Receiver {
- fn start_tag(&mut self, tag: &str, attrs: &HashMap<String, String>);
- fn end_tag(&mut self, tag: &str, attrs: &HashMap<String, String>);
- fn text(&mut self, text: &str);
+impl OurDecorator {
+    /// Build a decorator. The type has no fields, so there is nothing to
+    /// configure.
+    pub fn new() -> Self {
+        Self {}
+    }
}
-fn qualname_to_string(qn: &QualName) -> String {
- if qn.ns == ns!(html) || qn.ns == ns!() {
- qn.local.to_string()
- } else {
- dbg!(&qn);
- format!("{}:{}", qn.ns.to_string(), qn.local.to_string())
+impl TextDecorator for OurDecorator {
+ type Annotation = char;
+
+ /// Start of a link: no prefix text, annotation `'u'`.
+ fn decorate_link_start(&mut self, _url: &str)
+     -> (String, Self::Annotation)
+ {
+     let prefix = String::new();
+     (prefix, 'u')
}
-}
-impl Tree {
- fn new_node(&mut self, contents: TreeNodeContents) -> Handle {
- let handle = self.nodes.len();
- self.nodes.push(TreeNode {
- contents,
- parent: None,
- });
- handle
+ /// End of a link: no suffix text.
+ fn decorate_link_end(&mut self) -> String {
+     String::new()
+ }
+
+ /// Start of emphasis: no prefix text, annotation `'_'`.
+ fn decorate_em_start(&mut self) -> (String, Self::Annotation) {
+     let prefix = String::new();
+     (prefix, '_')
}
- fn not2handle(&mut self, not: NodeOrText<Handle>) -> Handle {
- match not {
- NodeOrText::AppendNode(h) => h,
- NodeOrText::AppendText(s) => self.new_node(
- TreeNodeContents::Text { text: s }),
- }
+ /// End of emphasis: no suffix text.
+ fn decorate_em_end(&mut self) -> String {
+     String::new()
+ }
+
+ /// Start of strong text: no prefix text, annotation `'s'`.
+ fn decorate_strong_start(&mut self) -> (String, Self::Annotation) {
+     let prefix = String::new();
+     (prefix, 's')
}
- fn merge_text_node_into_prev(&mut self, parent: Handle, index2: usize) {
- let indices = match self.nodes[parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref children
- } => {
- if index2 > 0 && index2 < children.len() {
- Some((children[index2 - 1], children[index2]))
- } else {
- None
- }
- },
- _ => panic!("merge_text_node_into_prev with a bogus parent type"),
- };
-
- if let Some((child1, child2)) = indices {
- let oldtext = match self.nodes[child2].contents {
- TreeNodeContents::Text { ref text } => Some(text.clone()),
- _ => None,
- };
- let merged = match self.nodes[child1].contents {
- TreeNodeContents::Text { text: ref mut text1 } => {
- match oldtext {
- Some(text2) => { text1.push_slice(&text2); true },
- _ => false,
- }
- },
- _ => false,
- };
- if merged {
- self.nodes[child2].contents = TreeNodeContents::Boring;
- self.nodes[child2].parent = None;
- match self.nodes[parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => {
- children.remove(index2);
- },
- _ => panic!("we already checked this!"),
- }
- }
- }
+ /// End of strong text: no suffix text.
+ fn decorate_strong_end(&mut self) -> String {
+     String::new()
+ }
+
+ /// Start of struck-out text: a literal `~` prefix, annotation `' '`.
+ fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) {
+     let prefix = String::from("~");
+     (prefix, ' ')
}
- fn walk_recurse<R: Receiver>(&self, node: Handle, receiver: &mut R) {
- match &self.nodes[node].contents {
- TreeNodeContents::Text { text } => receiver.text(&text),
- TreeNodeContents::Element { name, attrs, children } => {
- let mut attrmap = HashMap::new();
- for attr in attrs {
- attrmap.insert(qualname_to_string(&attr.name),
- attr.value.to_string());
- }
- let tagname = qualname_to_string(&name);
- receiver.start_tag(&tagname, &attrmap);
- for child in children {
- self.walk_recurse(*child, receiver);
- }
- receiver.end_tag(&tagname, &attrmap);
- },
- _ => (),
- };
+ /// End of struck-out text: a literal `~` suffix.
+ fn decorate_strikeout_end(&mut self) -> String {
+     String::from("~")
+ }
+
+ /// Start of inline code: no prefix text, annotation `'c'`.
+ fn decorate_code_start(&mut self) -> (String, Self::Annotation) {
+     let prefix = String::new();
+     (prefix, 'c')
}
-}
-impl TreeSink for Tree {
- type Handle = Handle;
- type Output = Self;
+ /// End of inline code: no suffix text.
+ fn decorate_code_end(&mut self) -> String {
+     String::new()
+ }
- fn finish(self) -> Self { self }
- fn get_document(&mut self) -> Handle { 0 }
+ /// Annotation for the first part of a preformatted line: `'c'`, the
+ /// same annotation returned by `decorate_code_start`.
+ fn decorate_preformat_first(&mut self) -> Self::Annotation {
+     'c'
+ }
- fn parse_error(&mut self, msg: Cow<'static, str>) {
- self.error = Some(msg.to_string());
- }
-
- fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> {
- match self.nodes[*target].contents {
- TreeNodeContents::Element { ref name, .. } => name.expanded(),
- _ => panic!("html5ever promised to only call elem_name on Element"),
- }
+ /// Annotation for the continuation of a preformatted line that did not
+ /// fit: `'c'`, the same as for its first part.
+ fn decorate_preformat_cont(&mut self) -> Self::Annotation {
+     'c'
+ }
+
+ /// Return an annotation and rendering prefix for an image.
+ ///
+ /// Neither the source URL nor the title is used: the prefix is empty
+ /// and the annotation is always `'m'`.
+ fn decorate_image(&mut self, _src: &str, _title: &str)
+ -> (String, Self::Annotation) {
+     ("".to_string(), 'm')
}
- fn create_element(&mut self, name: QualName,
- attrs: Vec<Attribute>, _flags: ElementFlags)
- -> Handle {
- self.new_node(TreeNodeContents::Element {
- name: name,
- attrs: attrs,
- children: Vec::new(),
- })
+ /// Prefix for a header: `level` copies of `#` followed by one space.
+ fn header_prefix(&mut self, level: usize) -> String {
+     let mut prefix = "#".repeat(level);
+     prefix.push(' ');
+     prefix
}
- fn create_comment(&mut self, _text: StrTendril)
- -> Handle { self.new_node(TreeNodeContents::Boring) }
- fn create_pi(&mut self, _target: StrTendril,
- _data: StrTendril)
- -> Handle { self.new_node(TreeNodeContents::Boring) }
-
- fn append(&mut self, parent: &Handle, child: NodeOrText<Handle>) {
- let parent = *parent;
- let child = self.not2handle(child);
- if parent == child { return }
- let check_index = {
- let children = match self.nodes[parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => children,
- _ => panic!("append with a bogus parent type"),
- };
- children.push(child);
- children.len() - 1
- };
- self.nodes[child].parent = Some(parent);
- self.merge_text_node_into_prev(parent, check_index);
+
+ /// Prefix for each line of a quoted block.
+ fn quote_prefix(&mut self) -> String {
+     String::from("> ")
+ }
+
+ /// Prefix for an item of an unordered list.
+ fn unordered_item_prefix(&mut self) -> String {
+     String::from(" - ")
+ }
+
+ /// Prefix for item number `i` of an ordered list: the number between a
+ /// leading space and a trailing `". "`.
+ fn ordered_item_prefix(&mut self, i: i64) -> String {
+     let prefix = format!(" {}. ", i);
+     prefix
}
- fn remove_from_parent(&mut self, target: &Handle) {
- let target = *target;
- if let Some(parent) = self.nodes[target].parent {
- match self.nodes[parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => {
- children.retain(|&h| h != target);
- },
- _ => (),
- }
- self.nodes[target].parent = None;
- }
+
+ /// Decorator for a sub-block. The type carries no state, so this is
+ /// just a freshly constructed instance.
+ fn make_subblock_decorator(&self) -> Self {
+     Self::new()
}
- fn append_before_sibling(&mut self, sibling: &Handle,
- new_node: NodeOrText<Handle>) {
- let child = self.not2handle(new_node);
- let sibling = *sibling;
-
- self.remove_from_parent(&child);
-
- // Link to new parent next to sibling
- match self.nodes[sibling].parent {
- None => panic!(
- "html5ever tried to append_before_sibling to the root"),
- Some(parent) => {
- match self.nodes[parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => {
- match children.iter().position(|h| *h == sibling) {
- Some(i) => {
- children.insert(i, child);
- self.nodes[child].parent = Some(parent);
- self.merge_text_node_into_prev(parent, i+1);
- }
- None => panic!("node not a child of its parent"),
- }
- },
- _ => panic!("node had a wrong parent type"),
- }
- }
+
+ /// Map a colour to an annotation, or `None` for any other colour.
+ ///
+ /// Only the red component is inspected: 1 yields `'@'` and 4 yields
+ /// `'#'`. These match the red components of the colours that `parse`
+ /// assigns to `.mention` (`#010203`) and `.hashtag` (`#040506`).
+ fn push_colour(&mut self, col: Colour) -> Option<Self::Annotation> {
+     let annotation = match col.r {
+         1 => '@',
+         4 => '#',
+         _ => return None,
+     };
+     Some(annotation)
}
- fn append_based_on_parent_node(&mut self, element: &Handle,
- prev_element: &Handle,
- child: NodeOrText<Handle>) {
- if self.nodes[*element].parent.is_some() {
- self.append_before_sibling(element, child);
- } else {
- self.append(prev_element, child);
- }
+
+ /// Pop the last colour pushed; always reports `true`.
+ ///
+ /// NOTE(review): this returns `true` even when the matching
+ /// `push_colour` call returned `None` — confirm the renderer only
+ /// ever pushes the two colours that `push_colour` recognises.
+ fn pop_colour(&mut self) -> bool {
+     let popped = true;
+     popped
}
- fn add_attrs_if_missing(&mut self, target: &Handle,
- mut attrs: Vec<Attribute>) {
- let target = *target;
- let target_attrs = match self.nodes[target].contents {
- TreeNodeContents::Element {
- name: _, ref mut attrs, ..
- } => attrs,
- _ => panic!("add_attrs_if_missing to a bogus node type"),
- };
-
- let mut present = HashSet::new();
- for attr in &mut *target_attrs {
- present.insert(attr.name.clone());
- }
- for attr in attrs.drain(0..) {
- if !present.contains(&attr.name) {
- present.insert(attr.name.clone());
- target_attrs.push(attr);
- }
- }
+
+ /// Finish a document. The collected link list is ignored and no extra
+ /// lines (e.g. footnotes) are appended to the rendered text.
+ fn finalise(&mut self, _links: Vec<String>)
+     -> Vec<TaggedLine<Self::Annotation>>
+ {
+     vec![]
}
- fn reparent_children(&mut self, old_parent: &Handle,
- new_parent: &Handle) {
- let old_parent = *old_parent;
- let new_parent = *new_parent;
-
- let mut old_children = match self.nodes[old_parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => {
- let mut old_children = Vec::new();
- old_children.append(children);
- old_children
- },
- _ => panic!("reparent_children from a bogus node type"),
- };
+}
- for i in old_children.iter() {
- self.nodes[*i].parent = Some(new_parent);
- }
+/// Parse an HTML fragment into an html2text [`RenderTree`].
+///
+/// The built-in stylesheet gives `.mention` and `.hashtag` elements marker
+/// colours, whose red components `OurDecorator::push_colour` turns into
+/// annotations.
+///
+/// # Errors
+/// Propagates any `html2text::Error` from adding the CSS, parsing the
+/// HTML, or building the render tree.
+pub fn parse(html: &str) -> Result<RenderTree, html2text::Error> {
+    const MARKER_CSS: &str = r##"
+.mention { color: #010203; }
+.hashtag { color: #040506; }
+"##;
+    let cfg = config::plain().add_css(MARKER_CSS)?;
+    let dom = cfg.parse_html(html.as_bytes())?;
+    cfg.dom_to_render_tree(&dom)
+}
- match self.nodes[old_parent].contents {
- TreeNodeContents::Element {
- name: _, attrs: _, ref mut children
- } => {
- children.append(&mut old_children);
- },
- _ => panic!("reparent_children to a bogus node type"),
- };
+/// Make one rendering attempt on a clone of `rt`, using `wrapwidth` as the
+/// maximum wrap width and `fullwidth` as the width handed to
+/// `render_to_lines`. Any `html2text::Error` is returned to the caller.
+fn try_render(rt: &RenderTree, wrapwidth: usize, fullwidth: usize)
+    -> Result<Vec<TaggedLine<Vec<char>>>, html2text::Error>
+{
+    config::with_decorator(OurDecorator::new())
+        .max_wrap_width(wrapwidth)
+        .render_to_lines(rt.clone(), fullwidth)
+}
+
+/// Render `rt` to tagged lines, wrapping text at `width` columns.
+///
+/// The first attempt uses `width` for the overall width as well. If that
+/// fails, the overall width is grown until a render succeeds, then
+/// narrowed by bisection between the last failing and first succeeding
+/// widths. The final render uses that width plus a quarter.
+///
+/// # Panics
+/// Panics if the final render still returns an error.
+fn render_tl(rt: &RenderTree, width: usize) -> Vec<TaggedLine<Vec<char>>> {
+    if let Ok(lines) = try_render(rt, width, width) {
+        return lines;
}
- fn append_doctype_to_document(&mut self, _name: StrTendril,
- _public_id: StrTendril,
- _system_id: StrTendril) {}
+    let mut wbad = width;
+    let mut wgood = {
+        let mut w = width;
+        loop {
+            // Grow by half, but always by at least one column: for a
+            // width of 0 or 1, `w / 2` is zero and the loop would
+            // otherwise retry the same failing width forever.
+            w += (w / 2).max(1);
+            if try_render(rt, width, w).is_ok() {
+                break w;
+            }
+        }
+    };
- fn get_template_contents(&mut self, _target: &Handle)
- -> Handle { self.new_node(TreeNodeContents::Boring) }
+    // Bisect: `wbad` is known to fail and `wgood` is known to succeed.
+    while wgood - wbad > 1 {
+        let wmid = wbad + (wgood - wbad) / 2;
+        if try_render(rt, width, wmid).is_ok() {
+            wgood = wmid;
+        } else {
+            wbad = wmid;
+        }
+    }
- fn same_node(&self, x: &Handle, y: &Handle) -> bool { x == y }
- fn set_quirks_mode(&mut self, _mode: QuirksMode) {}
+    try_render(rt, width, wgood + wgood / 4)
+        .expect("Shouldn't be too narrow now")
}
-fn render_read(mut input: impl std::io::Read) -> Tree {
- parse_document(Tree::default(), ParseOpts::default())
- .from_utf8()
- .read_from(&mut input)
- .unwrap()
+/// Flatten one tagged line into a [`ColouredString`].
+///
+/// Each string element is coloured uniformly with the first annotation in
+/// its tag list, or `' '` when the list is empty. Elements that are not
+/// strings are skipped.
+fn to_coloured_string(tl: &TaggedLine<Vec<char>>) -> ColouredString {
+    let mut out = ColouredString::plain("");
+    for element in tl.iter() {
+        if let TaggedLineElement::Str(ts) = element {
+            let colour: char = ts.tag.first().copied().unwrap_or(' ');
+            out.push_str(&ColouredString::uniform(&ts.s, colour).slice());
+        }
+    }
+    out
}
-pub fn render<R: Receiver>(html: &str, receiver: &mut R) {
- let tree = render_read(html.as_bytes());
- tree.walk_recurse(0, receiver);
+/// Render `rt` at `width` and convert every resulting tagged line into a
+/// [`ColouredString`].
+pub fn render(rt: &RenderTree, width: usize) -> Vec<ColouredString> {
+    let tagged_lines = render_tl(rt, width);
+    tagged_lines.iter().map(to_coloured_string).collect()
}
#[cfg(test)]
use chrono::NaiveDateTime;
use core::cmp::{min, max};
-use std::collections::{HashMap, BTreeMap, BTreeSet};
+use std::collections::BTreeMap;
use unicode_width::UnicodeWidthStr;
use super::html;
self
}
- pub fn push_para(&mut self, para: &Paragraph) {
+ pub fn end_word(&mut self) {
if let Some(word) = self.words.last() {
if !word.is_space() {
self.push_text(&ColouredString::plain(" "), false);
}
}
+ }
+
+ pub fn push_para(&mut self, para: &Paragraph) {
+ self.end_word();
self.words.extend_from_slice(¶.words);
}
}
}
-struct HTMLFormatter {
- paras: Vec<Paragraph>,
- colourstack: Vec<char>,
- bad_tags: BTreeSet<String>, // so we report more than 1 in consistent order
- indent: usize,
- pre_tag: usize,
+/// Outcome of parsing a piece of HTML, kept for later rendering.
+pub enum Html {
+    /// The render tree produced by a successful `html::parse`.
+    Rt(html::RenderTree),
+    /// The text of the error from a failed parse.
+    Bad(String),
}
-impl HTMLFormatter {
- fn new() -> Self {
- HTMLFormatter {
- paras: vec! { Paragraph::new() },
- colourstack: vec! { ' ' },
- bad_tags: BTreeSet::new(),
- indent: 0,
- pre_tag: 0,
+impl Html {
+    /// Parse `html`; on failure the error's text is kept in `Html::Bad`
+    /// instead of being returned.
+    pub fn new(html: &str) -> Self {
+        match html::parse(html) {
+            Err(err) => Self::Bad(err.to_string()),
+            Ok(tree) => Self::Rt(tree),
}
}
- fn finish(mut self) -> Vec<Paragraph> {
- trim_para_list(&mut self.paras);
+    /// Join the content into one [`Paragraph`]: render at width 1024 and,
+    /// for each resulting line, call `end_word` and then push the line.
+    pub fn to_para(&self) -> Paragraph {
+        let mut joined = Paragraph::new();
- if !self.bad_tags.is_empty() {
- let mut para = Paragraph::new().add(
- &ColouredString::uniform("Unsupported markup tags:", '!'));
- for tag in self.bad_tags.iter() {
- para.push_text(&ColouredString::uniform(
- &format!(" <{}>", tag), '!'), false);
- }
- self.paras.splice(0..0, vec!{
- para,
- Paragraph::new(),
- });
+        // Ordinarily wrapped paragraphs should not care which width is
+        // used here. A generous one is picked purely as insurance against
+        // rogue table cells.
+        for line in self.render(1024) {
+            joined.end_word();
+            joined.push_text(&line, true);
}
- self.paras
- }
-
- fn new_para(&self) -> Paragraph {
- Paragraph::new().set_indent(self.indent, self.indent)
- }
- fn last_para(&self) -> &Paragraph {
- // self.paras always contains at least one paragraph, so unwrap is OK
- self.paras.last().unwrap()
- }
- fn last_para_mut(&mut self) -> &mut Paragraph {
- self.paras.last_mut().unwrap()
+        joined
}
}
-impl html::Receiver for HTMLFormatter {
- fn start_tag(&mut self, tag: &str, attrs: &HashMap<String, String>) {
- match tag {
- "a" => {
- let mut colour = ' ';
- if attrs.get("href").is_some() {
- colour = 'u';
- }
- if let Some(classes) = attrs.get("class") {
- if classes.split(' ').any(|x| x == "hashtag") {
- colour = '#';
- } else if classes.split(' ').any(|x| x == "mention") {
- colour = '@';
- }
- }
- self.colourstack.push(colour);
- },
- "p" => {
- if !self.last_para().is_empty() {
- self.paras.push(Paragraph::new());
- }
- self.paras.push(self.new_para());
- },
- "pre" => {
- if !self.last_para().is_empty() {
- self.paras.push(Paragraph::new());
- }
- self.paras.push(self.new_para());
- self.pre_tag += 1;
- self.colourstack.push('c');
- },
- "br" => self.paras.push(self.new_para()),
- "blockquote" => {
- self.indent += 2;
- self.paras.push(self.new_para());
- },
- "code" => self.colourstack.push('c'),
- "strong" => self.colourstack.push('s'),
- "em" | "i" => self.colourstack.push('_'),
-
- // do nothing, except don't report these as unknown tags
- "span" | "html" | "head" | "body" => (),
-
- _ => {
- self.bad_tags.insert(tag.to_owned());
- },
- }
- }
- fn end_tag(&mut self, tag: &str, _attrs: &HashMap<String, String>) {
- match tag {
- "p" => {
- if !self.last_para().is_empty() {
- self.paras.push(Paragraph::new());
- }
- },
- "pre" => {
- self.pre_tag -= 1;
- self.colourstack.pop();
- if !self.last_para().is_empty() {
- self.paras.push(Paragraph::new());
- }
- },
- "blockquote" => {
- if !self.last_para().is_empty() {
- self.paras.push(Paragraph::new());
- }
- self.indent -= 2;
- self.paras.push(self.new_para());
- },
- "a" | "code" | "strong" | "em" | "i" => {
- self.colourstack.pop();
+impl TextFragment for Html {
+    /// Render the content. A parsed tree is rendered one column narrower
+    /// than `width` (never below zero); a parse failure is shown as its
+    /// error text in the `'!'` colour.
+    fn render(&self, width: usize) -> Vec<ColouredString> {
+        match self {
+            Html::Rt(tree) => html::render(tree, width.saturating_sub(1)),
+            Html::Bad(message) => vec! {
+                ColouredString::uniform(message, '!'),
},
- _ => (),
- }
- }
- fn text(&mut self, text: &str) {
- let colour = *self.colourstack.last().unwrap();
- if self.pre_tag > 0 {
- for (i, line) in text.split('\n').enumerate() {
- if i > 0 {
- self.paras.push(self.new_para());
- }
- self.last_para_mut().push_text(&ColouredString::uniform(
- line, colour), false);
- }
- } else {
- self.last_para_mut().push_text(&ColouredString::uniform(
- text, colour), true);
}
}
}
-pub fn parse_html(html: &str) -> Vec<Paragraph> {
- let mut recv = HTMLFormatter::new();
- html::render(html, &mut recv);
- recv.finish()
-}
-
#[cfg(test)]
fn render_html(html: &str, width: usize) -> Vec<ColouredString> {
- parse_html(html)
- .into_iter()
- .map(|para| para.render(width))
- .flatten()
- .collect()
+ Html::new(html).render(width)
}
#[test]
ColouredString::general("URL to https://www.example.com/stuff/.",
" uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu "),
});
-
- assert_eq!(render_html("<p>Test of some <nonsense>unsupported</nonsense> <blither>HTML tags</blither></p>", 50),
- vec! {
- ColouredString::general("Unsupported markup tags: <blither> <nonsense>",
- "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"),
- ColouredString::plain(""),
- ColouredString::plain("Test of some unsupported HTML tags"),
- });
}
pub struct ExtendableIndicator {
}
impl InReplyToLine {
- pub fn new(post: &Vec<Paragraph>) -> Self {
+ pub fn new(post: Paragraph) -> Self {
let mut para = Paragraph::new().add(&ColouredString::plain("Re: "));
let currlen = para.words.len();
- for cpara in post {
- para.push_para(cpara);
- }
+ para.push_para(&post);
para.delete_mention_words_from(currlen);
InReplyToLine {
para: para
#[test]
fn test_in_reply_to() {
- let post = vec! {
- Paragraph::new().add(&ColouredString::general(
- "@stoat @weasel take a look at this otter!",
- "@@@@@@ @@@@@@@ ")),
- Paragraph::new().add(&ColouredString::general(
- "@badger might also like it",
- "@@@@@@@ ")),
- };
+ let post = Html::new(
+ "<p><span class=\"h-card\" translate=\"no\"><a href=\"https://some.instance/@stoat\" class=\"u-url mention\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">@<span>stoat</span></a></span> <span class=\"h-card\" translate=\"no\"><a href=\"https://some.instance/@weasel\" class=\"u-url mention\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">@<span>weasel</span></a></span> take a look at this otter!</p><p><span class=\"h-card\" translate=\"no\"><a href=\"https://some.instance/@badger\" class=\"u-url mention\" rel=\"nofollow noopener noreferrer\" target=\"_blank\">@<span>badger</span></a></span> might also like it</p>");
- let irt = InReplyToLine::new(&post);
+ let irt = InReplyToLine::new(post.to_para());
assert_eq!(irt.render(48), vec!{
ColouredString::general(
"Re: take a look at this otter! @badger might...",
from: UsernameHeader,
via: Option<UsernameHeader>,
irt: Option<InReplyToLine>,
- content: Vec<Paragraph>,
+ content: Html,
media: Vec<Media>,
blank: BlankLine,
}
None => None,
Some(id) => {
let parent_text = match client.status_by_id(id) {
- Ok(st) => parse_html(&st.content),
- Err(e) => {
- vec! { Paragraph::new().add(&ColouredString::plain(
- &format!("[unavailable: {}]", e)
- )) }
- },
+ Ok(st) => Html::new(&st.content).to_para(),
+ Err(e) => Paragraph::new().add(&ColouredString::plain(
+ &format!("[unavailable: {}]", e)
+ )),
};
- Some(InReplyToLine::new(&parent_text))
+ Some(InReplyToLine::new(parent_text))
},
};
- let content = parse_html(&st.content);
+ let content = Html::new(&st.content);
let media = st.media_attachments.iter().map(|m| {
let desc_ref = match &m.description {
push_fragment(&mut lines,irt.render(width));
}
push_fragment(&mut lines, self.blank.render(width));
- for para in &self.content {
- push_fragment(&mut lines, para.render(width));
- }
- if self.content.len() > 0 {
+ let rendered_content = self.content.render(width);
+ let content_empty = rendered_content.len() == 0;
+ push_fragment(&mut lines, rendered_content);
+ if !content_empty {
push_fragment(&mut lines, self.blank.render(width));
}
for m in &self.media {