git.scottworley.com Git - paperdoorknob/blobdiff - texify.py
Always have Thread.__init__ fetch the HTML
[paperdoorknob] / texify.py
index 4fb15575832c66cbd99d4d12bbb9644cee1bfdaa..65b4230bc135db2cea845c7ae25e48be8e57f25a 100644 (file)
--- a/texify.py
+++ b/texify.py
@@ -6,6 +6,7 @@
 
 
 from abc import ABC, abstractmethod
+import re
 import subprocess
 
 from bs4.element import Tag
@@ -27,3 +28,45 @@ class PandocTexifier(Texifier):
                               input=html.encode(),
                               stdout=subprocess.PIPE,
                               check=True).stdout
+
+
+class TexifierError(Exception):  # raised when a child node is neither str nor Tag
+    pass
+
+
+class DirectTexifier(Texifier):  # Texifier that builds TeX by walking the tag tree itself
+    def _texify_children(self, html: Tag) -> bytes:  # TeX for html's children, ending in one newline
+        out = b''
+        for c in html.children:
+            if isinstance(c, str):  # text node: emitted as UTF-8 with no TeX escaping
+                out += c.encode('UTF-8')  # NOTE(review): bs4 comment nodes may also be str - confirm
+            elif isinstance(c, Tag):  # nested element: recurse, dropping its outer whitespace
+                out += self.texify(c).strip()
+            else:
+                raise TexifierError(f"Unsupported PageElement: {type(c)}")
+        return re.sub(b'[ \n]+', b' ', out).strip() + b'\n'  # collapse space/newline runs to one space
+
+    def texify(self, html: Tag) -> bytes:  # em -> \emph{..}, a[href] -> \href{..}{..}, else children only
+        if html.name == 'em':
+            return b'\\emph{' + self._texify_children(html).strip() + b'}\n'
+        if html.name == 'a' and 'href' in html.attrs:  # an <a> without href falls through below
+            return b'\\href{' + html.attrs['href'].encode(  # href is inserted verbatim, unescaped
+                'UTF-8') + b'}{' + self._texify_children(html).strip() + b'}\n'
+        return self._texify_children(html)  # any other tag contributes only its children's TeX
+
+
+class TexifierVerificationError(Exception):  # raised by VerifyingTexifier when outputs differ
+    pass
+
+
+class VerifyingTexifier(Texifier):  # runs two Texifiers on the same input and requires equal output
+    def __init__(self, a: Texifier, b: Texifier) -> None:  # a: its output is returned; b: cross-check
+        self._a = a
+        self._b = b
+
+    def texify(self, html: Tag) -> bytes:  # returns a's output; raises on any mismatch with b
+        aout = self._a.texify(html)
+        bout = self._b.texify(html)
+        if aout != bout:
+            raise TexifierVerificationError(aout, bout)  # both outputs are carried in the exception args
+        return aout