chiark / gitweb /
First cut at parsing HTML of a toot.
author Simon Tatham <anakin@pobox.com>
Sat, 23 Dec 2023 15:22:29 +0000 (15:22 +0000)
committer Simon Tatham <anakin@pobox.com>
Sat, 23 Dec 2023 17:12:34 +0000 (17:12 +0000)
But it doesn't work right, because the CSS is ignored. I think that's
because html2text only processes CSS classes on <span> elements, not
on <a> where the Mastodon server puts them.

Cargo.toml
src/html.rs [new file with mode: 0644]
src/lib.rs
src/main.rs
src/types.rs

index 164d99fbea6b4016c9a37be2c24cfebfcf42fac5..a59e453b0142f4d2fcda0f52ab8adc063eb92be6 100644 (file)
@@ -7,7 +7,7 @@ edition = "2021"
 [dependencies]
 chrono = { version = "0.4.31", features = ["serde"] }
 crossterm = "0.27.0"
-html2text = "0.9.0"
+html2text = { version = "0.9.0", features = ["css"] }
 ratatui = "0.25.0"
 reqwest = { version = "0.11.23", features = ["blocking"] }
 serde = { version = "1.0.193", features = ["derive"] }
diff --git a/src/html.rs b/src/html.rs
new file mode 100644 (file)
index 0000000..11b6a1b
--- /dev/null
@@ -0,0 +1,124 @@
+use html2text::{config, Colour};
+use html2text::render::text_renderer::{TextDecorator, TaggedLine};
+
+#[derive(Clone, Debug, Default)]
+pub struct OurDecorator { // Decorator type handed to html2text; it has no fields, so it holds no state.
+}
+
+impl OurDecorator { // inherent methods: just the constructor
+    pub fn new() -> OurDecorator { // Build a fresh decorator; there is nothing to initialise.
+        OurDecorator { }
+    }
+}
+
+impl TextDecorator for OurDecorator {
+    type Annotation = char; // every decoration kind is tagged with a single char
+
+    /// Return an empty rendering prefix and the annotation 'U' for a link.
+    fn decorate_link_start(&mut self, _url: &str) // the URL itself is not used
+                           -> (String, Self::Annotation) {
+        ("".to_string(), 'U')
+    }
+
+    /// Return the suffix emitted after a link (always empty here).
+    fn decorate_link_end(&mut self) -> String { "".to_string() }
+
+    /// Return an empty rendering prefix and the annotation '_' for em.
+    fn decorate_em_start(&mut self) -> (String, Self::Annotation) {
+        ("".to_string(), '_')
+    }
+
+    /// Return the suffix emitted after an em (always empty here).
+    fn decorate_em_end(&mut self) -> String { "".to_string() }
+
+    /// Return an empty rendering prefix and the annotation 's' for strong.
+    fn decorate_strong_start(&mut self) -> (String, Self::Annotation) {
+        ("".to_string(), 's')
+    }
+
+    /// Return the suffix emitted after a strong (always empty here).
+    fn decorate_strong_end(&mut self) -> String { "".to_string() }
+
+    /// Return the rendering prefix "~" and the annotation ' ' (a space) for strikeout.
+    fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) {
+        ("~".to_string(), ' ')
+    }
+
+    /// Return the suffix "~" emitted after a strikeout, matching the "~" prefix.
+    fn decorate_strikeout_end(&mut self) -> String { "~".to_string() }
+
+    /// Return an empty rendering prefix and the annotation 'c' for code.
+    fn decorate_code_start(&mut self) -> (String, Self::Annotation) {
+        ("".to_string(), 'c')
+    }
+
+    /// Return the suffix emitted after a code span (always empty here).
+    fn decorate_code_end(&mut self) -> String { "".to_string() }
+
+    /// Return the annotation for the initial part of a preformatted line ('c', same as code).
+    fn decorate_preformat_first(&mut self) -> Self::Annotation { 'c' }
+
+    /// Return the annotation for a continuation line when a preformatted
+    /// line does not fit ('c', same as the first part).
+    fn decorate_preformat_cont(&mut self) -> Self::Annotation { 'c' }
+
+    /// Return an empty rendering prefix and the annotation 'm' for an image.
+    fn decorate_image(&mut self, _src: &str, _title: &str) // source and title are not used
+                      -> (String, Self::Annotation) {
+        ("".to_string(), 'm')
+    }
+
+    /// Return the header prefix: `level` copies of '#' followed by one space.
+    fn header_prefix(&mut self, level: usize) -> String {
+        "#".repeat(level) + " "
+    }
+
+    /// Return the prefix string of a quoted block ("> ").
+    fn quote_prefix(&mut self) -> String { "> ".to_string() }
+
+    /// Return the prefix string of an unordered list item (" - ").
+    fn unordered_item_prefix(&mut self) -> String { " - ".to_string() }
+
+    /// Return the prefix string of the ith ordered list item: space, number, dot, space.
+    fn ordered_item_prefix(&mut self, i: i64) -> String {
+        format!(" {}. ", i)
+    }
+
+    /// Return a new decorator of the same type which can be used
+    /// for sub blocks. With no state to carry over, this is just a fresh one.
+    fn make_subblock_decorator(&self) -> Self {
+        OurDecorator::new()
+    }
+
+    /// Return an annotation chosen from col.r alone, or None for any other colour.
+    fn push_colour(&mut self, col: Colour) -> Option<Self::Annotation> {
+        dbg!(col); // NOTE(review): leftover debug print of every colour seen; consider removing
+        match col.r {
+            1 => Some('@'), // presumably the leading 01 of the .mention colour #010203 set in render()
+            4 => Some('#'), // presumably the leading 04 of the .hashtag colour #040506 set in render()
+            _ => None, // no annotation for anything else
+        }
+    }
+
+    /// Report whether a colour annotation was popped; this implementation always says yes.
+    fn pop_colour(&mut self) -> bool {
+        true // NOTE(review): true even after push_colour returned None; confirm html2text tolerates this
+    }
+
+    /// Finish with a document, and return extra lines (eg footnotes)
+    /// to add to the rendered text. This implementation adds none.
+    fn finalise(&mut self, _links: Vec<String>) // the collected links are discarded
+                -> Vec<TaggedLine<Self::Annotation>> {
+        Vec::new()
+    }
+}
+
+pub fn render(html: &str) // Convert an HTML string into html2text tagged lines using OurDecorator.
+              -> Result<Vec<TaggedLine<Vec<char>>>, html2text::Error> {
+    config::with_decorator(OurDecorator::new()) // start an html2text config around our decorator
+        .add_css(r##"
+.mention { color: #010203; }
+.hashtag { color: #040506; }
+"##)? // an error from add_css is returned to the caller; push_colour keys on these two colours
+        .lines_from_read(html.as_bytes(), 80) // 80: presumably the wrap width in columns; confirm against html2text
+}
index bf2f4571a110ec308af7b871800b572d85e55c73..976d77abe060b166345432290c24700b5defc0a9 100644 (file)
@@ -1,5 +1,6 @@
 pub mod types;
 pub mod auth;
+pub mod html;
 
 #[derive(Debug)]
 pub enum OurError {
index d73eb863a612fc6803c568f16a9773d5f660c339..04d66cb4d3a60999ef53dcf07e430da2af11c0e4 100644 (file)
@@ -1,6 +1,7 @@
-// use mastodonochrome::types::*;
+use mastodonochrome::types::*;
 use mastodonochrome::OurError;
 use mastodonochrome::auth::AuthConfig;
+use mastodonochrome::html::render;
 use std::io::Read;
 use std::io::Write;
 
@@ -19,7 +20,7 @@ use ratatui::{
 use std::io::stdout;
 
 #[allow(unused)]
-fn streaming(auth: &AuthConfig) -> Result<(), mastodonochrome::OurError> {
+fn streaming() -> Result<(), mastodonochrome::OurError> {
     let auth = AuthConfig::load()?;
 
     let client = reqwest::blocking::Client::new();
@@ -61,7 +62,8 @@ fn streaming(auth: &AuthConfig) -> Result<(), mastodonochrome::OurError> {
     Ok(())
 }
 
-fn main() -> std::io::Result<()> {
+#[allow(unused)]
+fn tui() -> std::io::Result<()> {
     stdout().execute(EnterAlternateScreen)?;
     enable_raw_mode()?;
     let mut terminal = Terminal::new(CrosstermBackend::new(stdout()))?;
@@ -109,3 +111,13 @@ fn main() -> std::io::Result<()> {
     disable_raw_mode()?;
     Ok(())
 }
+
+fn main() -> std::io::Result<()> { // Fetch one hard-coded status and debug-print its rendered content.
+    let client = reqwest::blocking::Client::new();
+    let body = client.get( // blocking HTTP GET; the response body is read as text
+        "https://hachyderm.io/api/v1/statuses/111602135142646031")
+        .send().unwrap().text().unwrap(); // NOTE(review): panics on any request or body-read failure
+    let st: Status = serde_json::from_str(&body).unwrap(); // panics if the JSON does not deserialize as Status
+    dbg!(render(&st.content).unwrap()); // dump the tagged lines produced from the content field
+    Ok(())
+}
index 4da4ea186e0de5105f3e082995f24dfddfeb244f..bedb6923aa1a9e4945332a06374ff461eff17d20 100644 (file)
@@ -5,45 +5,45 @@ use std::option::Option;
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct AccountField {
-    name: String,
-    value: String,
-    verified_at: Option<DateTime<Utc>>,
+    pub name: String,
+    pub value: String,
+    pub verified_at: Option<DateTime<Utc>>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct Account {
-    id: String,
-    username: String,
-    acct: String,
-    url: String,
-    display_name: String,
-    note: String,
-    avatar: String,
-    avatar_static: String,
-    header: String,
-    header_static: String,
-    locked: bool,
-    fields: Vec<AccountField>,
-    // emojis: Vec<Emoji>,
-    bot: bool,
-    group: bool,
-    discoverable: Option<bool>,
-    noindex: Option<bool>,
-    moved: Option<Box<Account>>,
-    suspended: Option<bool>,
-    limited: Option<bool>,
-    created_at: DateTime<Utc>,
-    last_status_at: Option<String>, // this lacks a timezone, so serde
-                                    // can't deserialize it in the obvious way
-    statuses_count: u64,
-    followers_count: u64,
-    following_count: u64,
+    pub id: String,
+    pub username: String,
+    pub acct: String,
+    pub url: String,
+    pub display_name: String,
+    pub note: String,
+    pub avatar: String,
+    pub avatar_static: String,
+    pub header: String,
+    pub header_static: String,
+    pub locked: bool,
+    pub fields: Vec<AccountField>,
+    // pub emojis: Vec<Emoji>,
+    pub bot: bool,
+    pub group: bool,
+    pub discoverable: Option<bool>,
+    pub noindex: Option<bool>,
+    pub moved: Option<Box<Account>>,
+    pub suspended: Option<bool>,
+    pub limited: Option<bool>,
+    pub created_at: DateTime<Utc>,
+    pub last_status_at: Option<String>, // lacks a timezone, so serde can't
+                                        // deserialize it in the obvious way
+    pub statuses_count: u64,
+    pub followers_count: u64,
+    pub following_count: u64,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct Application {
-    name: String,
-    website: Option<String>,
+    pub name: String,
+    pub website: Option<String>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -65,53 +65,53 @@ pub enum MediaType {
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct MediaAttachment {
-    id: String,
-    #[serde(rename="type")] mediatype: MediaType,
-    url: String,
-    preview_url: String,
-    remote_url: Option<String>,
-    description: Option<String>,
+    pub id: String,
+    #[serde(rename="type")] pub mediatype: MediaType,
+    pub url: String,
+    pub preview_url: String,
+    pub remote_url: Option<String>,
+    pub description: Option<String>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct StatusMention {
-    id: String,
-    username: String,
-    url: String,
-    acct: String,
+    pub id: String,
+    pub username: String,
+    pub url: String,
+    pub acct: String,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct Status {
-    id: String,
-    uri: String,
-    created_at: DateTime<Utc>,
-    account: Account,
-    content: String,
-    visibility: Visibility,
-    sensitive: bool,
-    spoiler_text: String,
-    media_attachments: Vec<MediaAttachment>,
-    application: Option<Application>,
-    mentions: Vec<StatusMention>,
-    // tags: Vec<Hashtag>,
-    // emojis: Vec<Emoji>,
-    reblogs_count: u64,
-    favourites_count: u64,
-    replies_count: u64,
-    url: String,
-    in_reply_to_id: Option<String>,
-    in_reply_to_account_id: Option<String>,
-    reblog: Option<Box<Status>>,
-    // poll: Option<Poll>,
-    // card: Option<PreviewCard>,
-    language: Option<String>,
-    text: Option<String>,
-    edited_at: Option<DateTime<Utc>>,
-    favourited: Option<bool>,
-    reblogged: Option<bool>,
-    muted: Option<bool>,
-    bookmarked: Option<bool>,
-    pinned: Option<bool>,
-    filtered: Option<bool>,
+    pub id: String,
+    pub uri: String,
+    pub created_at: DateTime<Utc>,
+    pub account: Account,
+    pub content: String,
+    pub visibility: Visibility,
+    pub sensitive: bool,
+    pub spoiler_text: String,
+    pub media_attachments: Vec<MediaAttachment>,
+    pub application: Option<Application>,
+    pub mentions: Vec<StatusMention>,
+    // pub tags: Vec<Hashtag>,
+    // pub emojis: Vec<Emoji>,
+    pub reblogs_count: u64,
+    pub favourites_count: u64,
+    pub replies_count: u64,
+    pub url: String,
+    pub in_reply_to_id: Option<String>,
+    pub in_reply_to_account_id: Option<String>,
+    pub reblog: Option<Box<Status>>,
+    // pub poll: Option<Poll>,
+    // pub card: Option<PreviewCard>,
+    pub language: Option<String>,
+    pub text: Option<String>,
+    pub edited_at: Option<DateTime<Utc>>,
+    pub favourited: Option<bool>,
+    pub reblogged: Option<bool>,
+    pub muted: Option<bool>,
+    pub bookmarked: Option<bool>,
+    pub pinned: Option<bool>,
+    pub filtered: Option<bool>,
 }