# texify.py -- code added by the change
# 79631507ecb69945b7cedb10c3aaed4b8dd788b0..47e940089d73e13a8f7f1d12344c2eb9bcaa3454
# (blob index 4fb1557..83a8c67), recovered from a whitespace-collapsed diff:
# http://git.scottworley.com/paperdoorknob/blobdiff_plain/79631507ecb69945b7cedb10c3aaed4b8dd788b0..47e940089d73e13a8f7f1d12344c2eb9bcaa3454:/texify.py
#
# Only the lines that the diff adds are reproduced as code.  The Texifier
# base class and the beginning of PandocTexifier are outside the diff's
# hunks; the one visible fragment of PandocTexifier is kept as a comment.

from abc import ABC, abstractmethod
import re
import subprocess

from bs4.element import Tag


# Diff context only (tail of a call inside PandocTexifier; the start of the
# statement is not part of the diff, so it is not reconstructed here):
#     input=html.encode(),
#     stdout=subprocess.PIPE,
#     check=True).stdout


class TexifierError(Exception):
    """Raised by DirectTexifier when it meets a child node it cannot handle."""


class DirectTexifier(Texifier):
    """Texifier that walks the parsed HTML tree itself and emits TeX bytes."""

    def _texify_children(self, html: Tag) -> bytes:
        """Convert every child of *html* and return the normalized result.

        String children are encoded as UTF-8 and emitted verbatim (no
        TeX-special characters are escaped here).  Tag children are
        converted with ``self.texify`` and stripped of surrounding
        whitespace.  In the combined output each run of spaces/newlines is
        collapsed to one space, the ends are stripped, and a single
        trailing newline is appended.

        Raises:
            TexifierError: if a child is neither a ``str`` nor a ``Tag``.
        """
        # Collect the pieces and join once; repeated ``bytes +=`` in a loop
        # copies the accumulated buffer on every iteration.
        pieces = []
        for child in html.children:
            if isinstance(child, str):
                # NOTE(review): bs4 text nodes are str subclasses; special
                # string nodes such as comments presumably also pass this
                # test and would be emitted as text -- confirm.
                pieces.append(child.encode('UTF-8'))
            elif isinstance(child, Tag):
                pieces.append(self.texify(child).strip())
            else:
                raise TexifierError(f"Unsupported PageElement: {type(child)}")
        return re.sub(b'[ \n]+', b' ', b''.join(pieces)).strip() + b'\n'

    def texify(self, html: Tag) -> bytes:
        r"""Return TeX for *html*.

        An ``em`` element becomes ``\emph{...}`` followed by a newline; any
        other element contributes only the conversion of its children.
        """
        if html.name == 'em':
            return b'\\emph{' + self._texify_children(html).strip() + b'}\n'
        return self._texify_children(html)


class TexifierVerificationError(Exception):
    """Raised by VerifyingTexifier when its two texifiers disagree.

    The exception arguments are the two differing outputs, in (a, b) order.
    """


class VerifyingTexifier(Texifier):
    """Texifier that runs two texifiers and requires identical output."""

    def __init__(self, a: Texifier, b: Texifier) -> None:
        """Store the two texifiers whose outputs will be compared."""
        self._a = a
        self._b = b

    def texify(self, html: Tag) -> bytes:
        """Convert *html* with both texifiers and return the first's output.

        Raises:
            TexifierVerificationError: if the two outputs are not equal.
        """
        aout = self._a.texify(html)
        bout = self._b.texify(html)
        if aout != bout:
            raise TexifierVerificationError(aout, bout)
        return aout