from xdg_base_dirs import xdg_cache_home
from fetch import CachingFetcher
+from htmlfilter import ApplyHTMLFilters, HTMLFilters
from spec import Spec
from texify import PandocTexifier
metavar='PATH',
help='Where to keep the http cache (instead of %(default)s)',
default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+ parser.add_argument(
+ '--htmlfilters',
+ help='Which HTML filters to use (default: %(default)s)',
+ default=','.join(f[0] for f in HTMLFilters))
parser.add_argument(
'--out',
help='The filename stem at which to write output ' +
args = _command_line_parser().parse_args()
with CachingFetcher(args.cache_path, args.timeout) as fetcher:
with open(args.out + '.tex', 'wb') as texout:
- yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout)
+ yield Spec(
+ args.url,
+ fetcher,
+ lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+ PandocTexifier(args.pandoc or 'pandoc'),
+ texout)
--- /dev/null
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import re
+
+from typing import Callable, List, Tuple
+
+
+# Matches every spelling of a non-breaking space that can appear in raw HTML
+# bytes: the named entity, the decimal and hexadecimal numeric character
+# references, and the literal UTF-8 encoding of U+00A0.  The pattern is kept
+# pure ASCII (using escapes) because bytes literals cannot contain raw
+# non-ASCII characters.
+_NBSP_RE = re.compile(rb"&nbsp;|&#0*160;|&#[xX]0*[aA]0;|\xc2\xa0")
+
+
+# Registry of the available HTML filters as (name, function) pairs.  Each
+# function takes HTML as bytes and returns the filtered bytes.
+# ApplyHTMLFilters looks filters up here by name.
+HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [
+    # Replace every non-breaking space with an ordinary space.
+    ("NoNBSP", lambda x: _NBSP_RE.sub(b" ", x)),
+]
+
+
+class HTMLFilterError(Exception):
+    """Raised when a filter name does not resolve to exactly one filter."""
+
+
+def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes:
+    """Run the named HTML filters over data, in the order listed.
+
+    filter_list is a comma-separated string of filter names, each of which
+    must appear exactly once in HTMLFilters.  Whitespace around a name is
+    ignored and empty entries are skipped, so an empty string applies no
+    filters and returns data unchanged.
+
+    Raises HTMLFilterError if a name matches no registered filter, or if it
+    matches more than one.
+    """
+    for raw_name in filter_list.split(','):
+        filter_name = raw_name.strip()
+        if not filter_name:
+            # ''.split(',') yields [''], so without this check an empty
+            # filter list ("no filters") would be reported as unknown.
+            continue
+        filters = [f for (name, f) in HTMLFilters if name == filter_name]
+        if not filters:
+            raise HTMLFilterError(f"Unknown HTML filter: {filter_name}")
+        if len(filters) > 1:
+            raise HTMLFilterError(
+                f"Multiple HTML filters with the same name!?: {filter_name}")
+        data = filters[0](data)
+    return data
--- /dev/null
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+from htmlfilter import ApplyHTMLFilters
+
+
+class TestHTMLFilters(unittest.TestCase):
+    """Tests for applying named HTML filters through ApplyHTMLFilters."""
+
+    def testStripNBSP(self) -> None:
+        # The input must really contain a non-breaking space, written here
+        # as escaped UTF-8 bytes so it cannot be silently normalised by an
+        # editor or a copy/paste.  With identical input and expected bytes
+        # the assertion would pass even if the filter did nothing.
+        self.assertEqual(
+            ApplyHTMLFilters(
+                "NoNBSP",
+                b"<p>It's <em>really\xc2\xa0</em>cold.</p>"),
+            b"<p>It's <em>really </em>cold.</p>")
+
+    def testUnknownFilter(self) -> None:
+        # Imported here so this test needs no change to the module-level
+        # import line.
+        from htmlfilter import HTMLFilterError
+        with self.assertRaises(HTMLFilterError):
+            ApplyHTMLFilters("NoSuchFilter", b"<p>text</p>")
+
+
+# Run the tests when this file is executed directly as a script.
+if __name__ == '__main__':
+ unittest.main()
def process(spec: Spec) -> None:
spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
- html = clean(parse(spec.fetcher.fetch(spec.url)))
+ html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
for r in replies(html):
spec.texout.write(spec.texifier.texify(r))
spec.texout.write(b'\\end{document}\n')
spec = Spec(
self.url(),
self.fetcher(),
+ lambda x: x,
PandocTexifier('pandoc'),
buf)
paperdoorknob.process(spec)
texifier = VerifyingTexifier(
PandocTexifier('pandoc'), DirectTexifier())
buf = io.BytesIO()
- spec = Spec(self.url(), self.fetcher(), texifier, buf)
+ spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
paperdoorknob.process(spec)
def testPDF(self) -> None:
spec = Spec(
self.url(),
self.fetcher(),
+ lambda x: x,
PandocTexifier('pandoc'),
out)
paperdoorknob.process(spec)
py_modules=[
'args',
'fetch',
+ 'htmlfilter',
'paperdoorknob',
'spec',
'texify',
from dataclasses import dataclass
-from typing import IO
+from typing import Callable, IO
from fetch import Fetcher
from texify import Texifier
class Spec:
url: str
fetcher: Fetcher
+ htmlfilter: Callable[[bytes], bytes]
texifier: Texifier
texout: IO[bytes]