From: Vladimír Vondruš Date: Tue, 19 Dec 2017 15:46:32 +0000 (+0100) Subject: m.htmlsanity: improve detection for applying typography improvements. X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~cjwatson/git?a=commitdiff_plain;h=73796295ad667e0af5681c65774f3afbe323aa90;p=blog.git m.htmlsanity: improve detection for applying typography improvements. Previously smart quotes were applied to arbitrary fields and even field names. Ugh. And author was hyphenated + smart quoted but a category/tag not. Also, have fun spotting the differences in the test files :P --- diff --git a/doc/plugins/htmlsanity.rst b/doc/plugins/htmlsanity.rst index bfc786c2..ace16bdb 100644 --- a/doc/plugins/htmlsanity.rst +++ b/doc/plugins/htmlsanity.rst @@ -112,8 +112,9 @@ French-style quotes. This plugin contains a patched version of `smart_quotes option `_ -from Docutils, which is based off SmartyPants, but with proper language awareness -on top. See for yourself: +from Docutils, which is based off SmartyPants, but with proper language +awareness on top. It is applied to whole document contents and fields that are +included in the :py:`FORMATTED_FIELDS`. See for yourself: .. code-figure:: diff --git a/pelican-plugins/m/htmlsanity.py b/pelican-plugins/m/htmlsanity.py index 6419314a..42046a98 100644 --- a/pelican-plugins/m/htmlsanity.py +++ b/pelican-plugins/m/htmlsanity.py @@ -72,6 +72,28 @@ def extract_document_language(document): return language +def can_apply_typography(txtnode): + # Exclude: + # - literals and spans inside literals + # - raw code (such as SVG) + # - field names + # - bibliographic elements (author, date, ... fields) + if isinstance(txtnode.parent, nodes.literal) or \ + isinstance(txtnode.parent.parent, nodes.literal) or \ + isinstance(txtnode.parent, nodes.raw) or \ + isinstance(txtnode.parent, nodes.field_name) or \ + isinstance(txtnode.parent, nodes.Bibliographic): + return False + + # From fields include only the ones that are in FORMATTED_FIELDS + if isinstance(txtnode.parent.parent, nodes.field_body): + field_name_index = txtnode.parent.parent.parent.first_child_matching_class(nodes.field_name) + if txtnode.parent.parent.parent[field_name_index][0] in settings['FORMATTED_FIELDS']: + return True + return False + + return True + class SmartQuotes(docutils.transforms.universal.SmartQuotes): """Smart quote transform @@ -109,15 +131,14 @@ class SmartQuotes(docutils.transforms.universal.SmartQuotes): continue # list of text nodes in the "text block": - # Patched here to exclude text spans inside literal nodes. - # Hopefully two nesting levels are enough. - txtnodes = [txtnode for txtnode in node.traverse(nodes.Text) - if not isinstance(txtnode.parent, - nodes.option_string) and - not isinstance(txtnode.parent, - nodes.literal) and - not isinstance(txtnode.parent.parent, - nodes.literal)] + # Patched here to exclude more stuff. + txtnodes = [] + for txtnode in node.traverse(nodes.Text): + if not can_apply_typography(txtnode): continue + # Don't convert -- in option strings + if isinstance(txtnode.parent, nodes.option_string): continue + + txtnodes += [txtnode] # language: use typographical quotes for language "lang" lang = node.get_language_code(document_language) @@ -169,27 +190,14 @@ class Pyphen(Transform): # Go through all text words and hyphenate them for node in self.document.traverse(nodes.TextElement): - # Skip preformatted text blocks, special elements and field names - if isinstance(node, (nodes.FixedTextElement, nodes.Special, nodes.field_name)): + # Skip preformatted text blocks and special elements + if isinstance(node, (nodes.FixedTextElement, nodes.Special)): continue for txtnode in node.traverse(nodes.Text): - # Exclude: - # - document title - # - literals and spans inside literals - # - raw code (such as SVG) - if isinstance(txtnode.parent, nodes.title) or \ - isinstance(txtnode.parent, nodes.literal) or \ - isinstance(txtnode.parent.parent, nodes.literal) or \ - isinstance(txtnode.parent, nodes.raw): - continue - - # From fields include only the ones that are in - # FORMATTED_FIELDS - if isinstance(txtnode.parent.parent, nodes.field_body): - field_name_index = txtnode.parent.parent.parent.first_child_matching_class(nodes.field_name) - if txtnode.parent.parent.parent[field_name_index][0] not in settings['FORMATTED_FIELDS']: - continue + if not can_apply_typography(txtnode): continue + # Don't hyphenate document title + if isinstance(txtnode.parent, nodes.title): continue # Useful for debugging, don't remove ;) #print(repr(txtnode.parent), repr(txtnode.parent.parent), repr(txtnode.parent.parent.parent)) diff --git a/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html b/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html index c06c07bf..5a8f029b 100644 --- a/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html +++ b/pelican-plugins/m/test/htmlsanity_typography/article-jumbo.html @@ -13,8 +13,8 @@ - - + + @@ -29,12 +29,12 @@
-
+
Dec 10, 2017
- +
@@ -64,7 +64,7 @@ @@ -80,7 +80,7 @@

Au­thors

    -
  1. An Au­thor
  2. +
  3. An Author
diff --git a/pelican-plugins/m/test/htmlsanity_typography/article-lang.html b/pelican-plugins/m/test/htmlsanity_typography/article-lang.html index 96b273ae..49fe820d 100644 --- a/pelican-plugins/m/test/htmlsanity_typography/article-lang.html +++ b/pelican-plugins/m/test/htmlsanity_typography/article-lang.html @@ -44,7 +44,7 @@ te­dy mít čes­ké dě­le­ní slov. „A ta­ké čes­ké uvo­zov­ky.“