# texify.py -- excerpt reconstructed from a whitespace-collapsed unified diff.
# X-Git-Url: http://git.scottworley.com/paperdoorknob/blobdiff_plain/79631507ecb69945b7cedb10c3aaed4b8dd788b0..19ed28f37b59e480cebe8f122e07ce18ce536db8:/texify.py?ds=inline
# diff --git a/texify.py b/texify.py (index 4fb1557..65b4230)
#
# Only code that is visible in the diff is reproduced below.  Texifier and
# PandocTexifier are defined in parts of texify.py that the diff elides, so
# they are not repeated here.

from abc import ABC, abstractmethod
import re
import subprocess

from bs4.element import Tag


# Runs of spaces and newlines; collapsed to a single space in texified output.
_WHITESPACE_RUN = re.compile(b'[ \n]+')


class TexifierError(Exception):
    """Raised when DirectTexifier meets a child node it cannot convert."""


class DirectTexifier(Texifier):
    r"""Texifier that walks the parsed HTML tree and emits LaTeX itself.

    ``<em>`` becomes ``\emph{...}`` and ``<a href="...">`` becomes
    ``\href{url}{...}``.  Any other tag contributes only the output of its
    children.  Text and ``href`` values are UTF-8 encoded and emitted
    verbatim; no LaTeX escaping is applied here.
    """

    def _texify_children(self, html: Tag) -> bytes:
        """Return the combined LaTeX for every child of *html*.

        Runs of spaces and newlines are collapsed to a single space, the
        result is stripped, and exactly one trailing newline is appended.

        Raises:
            TexifierError: if a child is neither a ``str`` nor a ``Tag``.
        """
        # Collect the pieces and join once; repeated ``bytes +=`` would copy
        # the accumulated output on every iteration.
        pieces = []
        for child in html.children:
            if isinstance(child, str):
                # Presumably bs4 text nodes (NavigableString subclasses str).
                pieces.append(child.encode('UTF-8'))
            elif isinstance(child, Tag):
                # Nested output is stripped, so spacing around a nested tag
                # comes only from the neighbouring text nodes.
                pieces.append(self.texify(child).strip())
            else:
                raise TexifierError(f"Unsupported PageElement: {type(child)}")
        return _WHITESPACE_RUN.sub(b' ', b''.join(pieces)).strip() + b'\n'

    def texify(self, html: Tag) -> bytes:
        """Convert *html* and its descendants to LaTeX, newline-terminated.

        Raises:
            TexifierError: propagated from ``_texify_children``.
        """
        if html.name == 'em':
            return b'\\emph{' + self._texify_children(html).strip() + b'}\n'
        if html.name == 'a' and 'href' in html.attrs:
            # The URL is encoded before the children are processed, matching
            # the original evaluation order.
            url = html.attrs['href'].encode('UTF-8')
            label = self._texify_children(html).strip()
            return b'\\href{' + url + b'}{' + label + b'}\n'
        # Unrecognised tags (and <a> without href) are transparent.
        return self._texify_children(html)


class TexifierVerificationError(Exception):
    """Raised by VerifyingTexifier when its two texifiers disagree.

    The exception args are the two differing outputs, in (a, b) order.
    """


class VerifyingTexifier(Texifier):
    """Texifier that runs two texifiers and requires identical output."""

    def __init__(self, a: Texifier, b: Texifier) -> None:
        """Store the two texifiers whose outputs will be compared."""
        self._a = a
        self._b = b

    def texify(self, html: Tag) -> bytes:
        """Return the output of texifier ``a`` after checking it against ``b``.

        Raises:
            TexifierVerificationError: if the two outputs differ; both
                outputs are attached as the exception args.
        """
        first = self._a.texify(html)
        second = self._b.texify(html)
        if first != second:
            raise TexifierVerificationError(first, second)
        return first