htmlsanity: hyphenation implementation.

author Vladimír Vondruš <mosra@centrum.cz>

Tue, 4 Jul 2017 09:11:05 +0000 (11:11 +0200)

committer Vladimír Vondruš <mosra@centrum.cz>

Wed, 5 Jul 2017 23:02:07 +0000 (01:02 +0200)
author Vladimír Vondruš <mosra@centrum.cz>
Tue, 4 Jul 2017 09:11:05 +0000 (11:11 +0200)
committer Vladimír Vondruš <mosra@centrum.cz>
Wed, 5 Jul 2017 23:02:07 +0000 (01:02 +0200)
diff --git a/pelican-plugins/m/htmlsanity.py b/pelican-plugins/m/htmlsanity.py

index 67ffc06f175c62d93824fb8f271425c57490c1e6..bbbd61c4e6b45a4d5e0e2869a14deaf34d4a8659 100644 (file)
--- a/pelican-plugins/m/htmlsanity.py
+++ b/pelican-plugins/m/htmlsanity.py
@@ -9,11 +9,19 @@ from docutils.utils import smartquotes
  import pelican.signals
  from pelican.readers import RstReader
  
+try:
+    import pyphen
+except ImportError:
+    pyphen = None
+
  # These come from settings
+enable_hyphenation = False
  smart_quotes = False
  hyphenation_lang = 'en'
  docutils_settings = {}
  
+words_re = re.compile("""\w+""", re.UNICODE|re.X)
+
  class SmartQuotes(docutils.transforms.universal.SmartQuotes):
      """Smart quote transform
  
@@ -96,6 +104,53 @@ class SmartQuotes(docutils.transforms.universal.SmartQuotes):
  
              self.unsupported_languages = set() # reset
  
+class Pyphen(Transform):
+    """Hyphenation using pyphen
+
+    Enabled or disabled using HTMLSANITY_HYPHENATION boolean option, defaulting
+    to ``False``.
+    """
+
+    # Max Docutils priority is 990, be sure that this is applied at the very
+    # last
+    default_priority = 991
+
+    def __init__(self, document, startnode):
+        Transform.__init__(self, document, startnode=startnode)
+
+    def apply(self):
+        global enable_hyphenation
+        if not enable_hyphenation:
+            return
+
+        document_language = self.document.settings.language_code
+
+        pyphen_for_lang = {}
+
+        # Go through all text words and hyphenate them
+        for node in self.document.traverse(nodes.TextElement):
+            # Skip preformatted text blocks, special elements and field names
+            if isinstance(node, (nodes.FixedTextElement, nodes.Special, nodes.field_name)):
+                continue
+
+            # Proper language-dependent hyphenation
+            lang = node.get_language_code(document_language)
+
+            # Create new Pyphen object for given lang, if not yet cached. I'm
+            # assuming this is faster than recreating the instance for every
+            # text node
+            if lang not in pyphen_for_lang:
+                pyphen_for_lang[lang] = pyphen.Pyphen(lang=lang)
+
+            for txtnode in node.traverse(nodes.Text):
+                # Exclude:
+                #  - literals and spans inside literals
+                #  - field bodies (such as various :save_as: etc.)
+                if isinstance(txtnode.parent, nodes.literal) or isinstance(txtnode.parent.parent, nodes.literal) or isinstance(txtnode.parent.parent, nodes.field_body):
+                    continue
+
+                txtnode.parent.replace(txtnode, nodes.Text(words_re.sub(lambda m: pyphen_for_lang[lang].inserted(m.group(0), '\u00AD'), txtnode.astext())))
+
  class SaneHtmlTranslator(HTMLTranslator):
      """Sane HTML translator
  
@@ -103,6 +158,15 @@ class SaneHtmlTranslator(HTMLTranslator):
      further.
      """
  
+    # Overrides the ones from docutils HTML5 writer to enable the soft hyphen
+    # character
+    special_characters = {ord('&'): '&amp;',
+                          ord('<'): '&lt;',
+                          ord('"'): '&quot;',
+                          ord('>'): '&gt;',
+                          ord('@'): '&#64;', # may thwart address harvesters
+                          ord('\u00AD'): '&shy;'}
+
      # Somehow this does the trick and removes docinfo from the body. Was
      # present in the HTML4 translator but not in the HTML5 one, so copying it
      # verbatim over
@@ -368,7 +432,7 @@ class SaneHtmlWriter(docutils.writers.html5_polyglot.Writer):
          self.translator_class = SaneHtmlTranslator
  
      def get_transforms(self):
-        return docutils.writers.html5_polyglot.Writer.get_transforms(self) + [SmartQuotes]
+        return docutils.writers.html5_polyglot.Writer.get_transforms(self) + [SmartQuotes, Pyphen]
  
  class SaneRstReader(RstReader):
      writer_class = SaneHtmlWriter
@@ -397,8 +461,10 @@ def render_rst(value):
  def configure_pelican(pelicanobj):
      pelicanobj.settings['JINJA_FILTERS']['render_rst'] = render_rst
  
-    global enable_smart_quotes, docutils_settings
-    enable_smart_quotes = pelicanobj.settings.get('HTMLSANITY_SMART_QUOTES', False)
+    global enable_hyphenation, smart_quotes, hyphenation_lang, docutils_settings
+    enable_hyphenation = pelicanobj.settings.get('HTMLSANITY_HYPHENATION', False)
+    smart_quotes = pelicanobj.settings.get('HTMLSANITY_SMART_QUOTES', False)
+    hyphenation_lang = pelicanobj.settings['DEFAULT_LANG']
      docutils_settings = pelicanobj.settings['DOCUTILS_SETTINGS']
  
  def add_reader(readers):
author	Vladimír Vondruš <mosra@centrum.cz>
	Tue, 4 Jul 2017 09:11:05 +0000 (11:11 +0200)
committer	Vladimír Vondruš <mosra@centrum.cz>
	Wed, 5 Jul 2017 23:02:07 +0000 (01:02 +0200)