git.scottworley.com Git - paperdoorknob/commitdiff
Contemplate generating LaTeX directly
author Scott Worley <scottworley@scottworley.com>
Tue, 19 Dec 2023 10:23:48 +0000 (02:23 -0800)
committer Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 04:10:48 +0000 (20:10 -0800)
paperdoorknob_test.py
texify.py

index 453f5a6338dfe5513b3e2cf68160786a4deb72c1..4cead9772866e3425f227c55acb3c32063659627 100644 (file)
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py
@@ -11,7 +11,7 @@ import subprocess
 import paperdoorknob
 from testing.fakeserver import FakeGlowficServer
 from fetch import DirectFetcher
-from texify import PandocTexifier
+from texify import DirectTexifier, PandocTexifier, VerifyingTexifier
 
 TIMEOUT = 8
 
@@ -44,6 +44,14 @@ Pretty sure.
 \\end{document}
 ''')
 
+    def testDirectTexifier(self) -> None:
+        texifier = VerifyingTexifier(
+            PandocTexifier('pandoc'), DirectTexifier())
+        with DirectFetcher(TIMEOUT) as f:
+            buf = io.BytesIO()
+            paperdoorknob.process(
+                f"http://localhost:{self._port}", f, texifier, buf)
+
     def testPDF(self) -> None:
         texifier = PandocTexifier('pandoc')
         with DirectFetcher(TIMEOUT) as f:
index 4fb15575832c66cbd99d4d12bbb9644cee1bfdaa..83a8c671087d47fee49e25553be568032cf2f2c5 100644 (file)
--- a/texify.py
+++ b/texify.py
@@ -6,6 +6,7 @@
 
 
 from abc import ABC, abstractmethod
+import re
 import subprocess
 
 from bs4.element import Tag
@@ -27,3 +28,42 @@ class PandocTexifier(Texifier):
                               input=html.encode(),
                               stdout=subprocess.PIPE,
                               check=True).stdout
+
+
+class TexifierError(Exception):
+    pass
+
+
+class DirectTexifier(Texifier):
+    def _texify_children(self, html: Tag) -> bytes:
+        out = b''
+        for c in html.children:
+            if isinstance(c, str):
+                out += c.encode('UTF-8')
+            elif isinstance(c, Tag):
+                out += self.texify(c).strip()
+            else:
+                raise TexifierError(f"Unsupported PageElement: {type(c)}")
+        return re.sub(b'[ \n]+', b' ', out).strip() + b'\n'
+
+    def texify(self, html: Tag) -> bytes:
+        if html.name == 'em':
+            return b'\\emph{' + self._texify_children(html).strip() + b'}\n'
+        return self._texify_children(html)
+
+
+class TexifierVerificationError(Exception):
+    pass
+
+
+class VerifyingTexifier(Texifier):
+    def __init__(self, a: Texifier, b: Texifier) -> None:
+        self._a = a
+        self._b = b
+
+    def texify(self, html: Tag) -> bytes:
+        aout = self._a.texify(html)
+        bout = self._b.texify(html)
+        if aout != bout:
+            raise TexifierVerificationError(aout, bout)
+        return aout