From 73796295ad667e0af5681c65774f3afbe323aa90 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Tue, 19 Dec 2017 16:46:32 +0100 Subject: [PATCH] m.htmlsanity: improve detection for applying typography improvements. Previously smart quotes were applied to arbitrary fields and even field names. Ugh. And author was hyphenated + smart quoted but a category/tag not. Also, have fun spotting the differences in the test files :P --- doc/plugins/htmlsanity.rst | 5 +- pelican-plugins/m/htmlsanity.py | 62 +++++++++++-------- .../htmlsanity_typography/article-jumbo.html | 12 ++-- .../htmlsanity_typography/article-lang.html | 4 +- .../htmlsanity_typography/articles/jumbo.rst | 2 +- .../author-an-author.html | 14 ++--- .../category-a-category.html | 6 +- .../tag-tagging-a-name.html | 6 +- 8 files changed, 60 insertions(+), 51 deletions(-) diff --git a/doc/plugins/htmlsanity.rst b/doc/plugins/htmlsanity.rst index bfc786c2..ace16bdb 100644 --- a/doc/plugins/htmlsanity.rst +++ b/doc/plugins/htmlsanity.rst @@ -112,8 +112,9 @@ French-style quotes. This plugin contains a patched version of `smart_quotes option `_ -from Docutils, which is based off SmartyPants, but with proper language awareness -on top. See for yourself: +from Docutils, which is based off SmartyPants, but with proper language +awareness on top. It is applied to whole document contents and fields that are +included in the :py:`FORMATTED_FIELDS`. See for yourself: .. code-figure:: diff --git a/pelican-plugins/m/htmlsanity.py b/pelican-plugins/m/htmlsanity.py index 6419314a..42046a98 100644 --- a/pelican-plugins/m/htmlsanity.py +++ b/pelican-plugins/m/htmlsanity.py @@ -72,6 +72,28 @@ def extract_document_language(document): return language +def can_apply_typography(txtnode): + # Exclude: + # - literals and spans inside literals + # - raw code (such as SVG) + # - field names + # - bibliographic elements (author, date, ... fields) + if isinstance(txtnode.parent, nodes.literal) or \ + isinstance(txtnode.parent.parent, nodes.literal) or \ + isinstance(txtnode.parent, nodes.raw) or \ + isinstance(txtnode.parent, nodes.field_name) or \ + isinstance(txtnode.parent, nodes.Bibliographic): + return False + + # From fields include only the ones that are in FORMATTED_FIELDS + if isinstance(txtnode.parent.parent, nodes.field_body): + field_name_index = txtnode.parent.parent.parent.first_child_matching_class(nodes.field_name) + if txtnode.parent.parent.parent[field_name_index][0] in settings['FORMATTED_FIELDS']: + return True + return False + + return True + class SmartQuotes(docutils.transforms.universal.SmartQuotes): """Smart quote transform @@ -109,15 +131,14 @@ class SmartQuotes(docutils.transforms.universal.SmartQuotes): continue # list of text nodes in the "text block": - # Patched here to exclude text spans inside literal nodes. - # Hopefully two nesting levels are enough. - txtnodes = [txtnode for txtnode in node.traverse(nodes.Text) - if not isinstance(txtnode.parent, - nodes.option_string) and - not isinstance(txtnode.parent, - nodes.literal) and - not isinstance(txtnode.parent.parent, - nodes.literal)] + # Patched here to exclude more stuff. + txtnodes = [] + for txtnode in node.traverse(nodes.Text): + if not can_apply_typography(txtnode): continue + # Don't convert -- in option strings + if isinstance(txtnode.parent, nodes.option_string): continue + + txtnodes += [txtnode] # language: use typographical quotes for language "lang" lang = node.get_language_code(document_language) @@ -169,27 +190,14 @@ class Pyphen(Transform): # Go through all text words and hyphenate them for node in self.document.traverse(nodes.TextElement): - # Skip preformatted text blocks, special elements and field names - if isinstance(node, (nodes.FixedTextElement, nodes.Special, nodes.field_name)): + # Skip preformatted text blocks and special elements + if isinstance(node, (nodes.FixedTextElement, nodes.Special)): continue for txtnode in node.traverse(nodes.Text): - # Exclude: - # - document title - # - literals and spans inside literals - # - raw code (such as SVG) - if isinstance(txtnode.parent, nodes.title) or \ - isinstance(txtnode.parent, nodes.literal) or \ - isinstance(txtnode.parent.parent, nodes.literal) or \ - isinstance(txtnode.parent, nodes.raw): - continue - - # From fields include only the ones that are in - # FORMATTED_FIELDS - if isinstance(txtnode.parent.parent, nodes.field_body): - field_name_index = txtnode.parent.parent.parent.first_child_matching_class(nodes.field_name) - if txtnode.parent.parent.parent[field_name_index][0] not in settings['FORMATTED_FIELDS']: - continue + if not can_apply_typography(txtnode): continue + # Don't hyphenate document title + if isinstance(txtnode.parent, nodes.title): continue # Useful for debugging, don't remove ;) #print(repr(txtnode.parent), repr(txtnode.parent.parent), repr(txtnode.parent.parent.parent)) diff --git a/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html b/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html index c06c07bf..5a8f029b 100644 --- a/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html +++ b/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html @@ -13,8 +13,8 @@ - - + + @@ -29,12 +29,12 @@
-
+
Dec 10, 2017
- +
@@ -64,7 +64,7 @@ @@ -80,7 +80,7 @@

Au­thors

    -
  1. An Au­thor
  2. +
  3. An Author
diff --git a/pelican-plugins/m/test/htmlsanity_typography/article-lang.html b/pelican-plugins/m/test/htmlsanity_typography/article-lang.html index 96b273ae..49fe820d 100644 --- a/pelican-plugins/m/test/htmlsanity_typography/article-lang.html +++ b/pelican-plugins/m/test/htmlsanity_typography/article-lang.html @@ -44,7 +44,7 @@ te­dy mít čes­ké dě­le­ní slov. „A ta­ké čes­ké uvo­zov­ky.“