]> git.scottworley.com Git - paperdoorknob/blob - texify.py
Handle Unicode characters ≈ and ◁
[paperdoorknob] / texify.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 from abc import ABC, abstractmethod
9 import re
10 import subprocess
11
12 from bs4.element import Tag
13
14
class Texifier(ABC):
    """Interface for objects that render an HTML element as LaTeX bytes."""

    @abstractmethod
    def texify(self, html: Tag) -> bytes:
        """Return the rendering of *html* as bytes.

        Concrete subclasses must override this method; the base
        implementation always raises NotImplementedError.
        """
        raise NotImplementedError
19
20
class PandocTexifier(Texifier):
    """Texifier that delegates the HTML-to-LaTeX conversion to pandoc."""

    def __init__(self, pandoc_path: str) -> None:
        """Remember *pandoc_path*, the pandoc executable to invoke."""
        self._pandoc_path = pandoc_path

    def texify(self, html: Tag) -> bytes:
        """Run pandoc on *html* and return whatever it wrote to stdout.

        Raises:
            subprocess.CalledProcessError: if pandoc exits with a
                non-zero status (``check=True``).
        """
        command = [self._pandoc_path, '--from=html', '--to=latex']
        # The encoded element is fed on stdin; only stdout is captured.
        completed = subprocess.run(
            command,
            input=html.encode(),
            stdout=subprocess.PIPE,
            check=True,
        )
        return completed.stdout
31
32
class TexifierError(Exception):
    """Raised when a texifier meets input it cannot convert."""
35
36
class DirectTexifier(Texifier):
    r"""Texifier that emits LaTeX directly from the element tree.

    ``<em>`` becomes ``\emph{...}`` and ``<a href="url">`` becomes
    ``\href{url}{...}``; any other tag contributes only the rendering of
    its children.  Text is copied through as UTF-8 with no LaTeX escaping.
    """

    # Runs of spaces and newlines are collapsed to one space.  Compiled
    # once here rather than looked up on every call.
    _WHITESPACE_RUN = re.compile(b'[ \n]+')

    def _texify_child(self, child: object) -> bytes:
        """Render one child node of an element.

        Raises:
            TexifierError: if *child* is neither a ``str`` nor a ``Tag``.
        """
        if isinstance(child, str):
            return child.encode('UTF-8')
        if isinstance(child, Tag):
            # Nested tags are rendered recursively; their surrounding
            # whitespace (including the trailing newline) is dropped.
            return self.texify(child).strip()
        raise TexifierError(f"Unsupported PageElement: {type(child)}")

    def _texify_children(self, html: Tag) -> bytes:
        """Render every child of *html* and normalise the whitespace.

        The result has whitespace runs collapsed, is stripped at both
        ends, and always finishes with a single newline.
        """
        # One join instead of repeated ``+=``, which recopies the whole
        # buffer for every child.
        joined = b''.join(self._texify_child(c) for c in html.children)
        return self._WHITESPACE_RUN.sub(b' ', joined).strip() + b'\n'

    def texify(self, html: Tag) -> bytes:
        """Return the LaTeX for *html*, terminated by a newline.

        Raises:
            TexifierError: if the tree contains an unsupported node.
        """
        if html.name == 'em':
            return b'\\emph{' + self._texify_children(html).strip() + b'}\n'
        if html.name == 'a' and 'href' in html.attrs:
            # The link target is encoded before the children are rendered,
            # matching the original evaluation order.
            target = html.attrs['href'].encode('UTF-8')
            body = self._texify_children(html).strip()
            return b'\\href{' + target + b'}{' + body + b'}\n'
        return self._texify_children(html)
56
57
class TexifierVerificationError(Exception):
    """Raised when two texifiers produce different output for one input."""
60
61
class VerifyingTexifier(Texifier):
    """Texifier that runs two texifiers and requires identical output."""

    def __init__(self, a: Texifier, b: Texifier) -> None:
        """Store the two texifiers whose results will be compared."""
        self._a = a
        self._b = b

    def texify(self, html: Tag) -> bytes:
        """Return the first texifier's output once the second one agrees.

        Both texifiers are always run, first ``a`` and then ``b``.

        Raises:
            TexifierVerificationError: carrying both outputs, if they
                differ.
        """
        primary = self._a.texify(html)
        secondary = self._b.texify(html)
        if primary != secondary:
            raise TexifierVerificationError(primary, secondary)
        return primary