From: Scott Worley Date: Wed, 20 Dec 2023 03:39:16 +0000 (-0800) Subject: Strip all &nbsp; X-Git-Url: http://git.scottworley.com/paperdoorknob/commitdiff_plain/929db57622c6b8756ed341d7ae2862126a7c638f?ds=inline Strip all &nbsp; Project Lawful does not use &nbsp; carefully/thoughtfully. They are all over the place, at the end of every sentence, randomly around s, etc. Just remove them all. But allow this behavior to be flag-controlled, in case this step is not desired when processing other works. --- diff --git a/args.py b/args.py index 2a03836..1931d42 100644 --- a/args.py +++ b/args.py @@ -14,6 +14,7 @@ from typing import Iterator from xdg_base_dirs import xdg_cache_home from fetch import CachingFetcher +from htmlfilter import ApplyHTMLFilters, HTMLFilters from spec import Spec from texify import PandocTexifier @@ -25,6 +26,10 @@ def _command_line_parser() -> ArgumentParser: metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--htmlfilters', + help='Which HTML filters to use (default: %(default)s)', + default=','.join(f[0] for f in HTMLFilters)) parser.add_argument( '--out', help='The filename stem at which to write output ' + @@ -44,4 +49,9 @@ def spec_from_commandline_args() -> Iterator[Spec]: args = _command_line_parser().parse_args() with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: - yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout) + yield Spec( + args.url, + fetcher, + lambda x: ApplyHTMLFilters(args.htmlfilters, x), + PandocTexifier(args.pandoc or 'pandoc'), + texout) diff --git a/htmlfilter.py b/htmlfilter.py new file mode 100644 index 0000000..9bc9b6f --- /dev/null +++ b/htmlfilter.py @@ -0,0 +1,31 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published 
by the +# Free Software Foundation, version 3. + + +import re + +from typing import Callable, List, Tuple + + +HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [ + ("NoNBSP", lambda x: re.sub(b"(&nbsp;|\xc2\xa0)", b" ", x)), +] + + +class HTMLFilterError(Exception): + pass + + +def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes: + for filter_name in filter_list.split(','): + filters = [f for (name, f) in HTMLFilters if name == filter_name] + if len(filters) == 0: + raise HTMLFilterError(f"Unknown HTML filter: {filter_name}") + if len(filters) > 1: + raise HTMLFilterError( + f"Multiple HTML filters with the same name!?: {filter_name}") + data = filters[0](data) + return data diff --git a/htmlfilter_test.py b/htmlfilter_test.py new file mode 100644 index 0000000..6b5584c --- /dev/null +++ b/htmlfilter_test.py @@ -0,0 +1,23 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest +from htmlfilter import ApplyHTMLFilters + + +class TestHTMLFilters(unittest.TestCase): + + def testStripNBSP(self) -> None: + self.assertEqual( + ApplyHTMLFilters( + "NoNBSP", + b"

It's really cold.

"), + b"

It's really cold.

") + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 653d0c6..34223e4 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -46,7 +46,7 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(parse(spec.fetcher.fetch(spec.url))) + html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))) for r in replies(html): spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 425d5e3..14f6dd9 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -46,6 +46,7 @@ class BaseTestProcess(ABC): spec = Spec( self.url(), self.fetcher(), + lambda x: x, PandocTexifier('pandoc'), buf) paperdoorknob.process(spec) @@ -61,7 +62,7 @@ Pretty sure. texifier = VerifyingTexifier( PandocTexifier('pandoc'), DirectTexifier()) buf = io.BytesIO() - spec = Spec(self.url(), self.fetcher(), texifier, buf) + spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf) paperdoorknob.process(spec) def testPDF(self) -> None: @@ -69,6 +70,7 @@ Pretty sure. spec = Spec( self.url(), self.fetcher(), + lambda x: x, PandocTexifier('pandoc'), out) paperdoorknob.process(spec) diff --git a/setup.py b/setup.py index edaa429..e83ec01 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ setup( py_modules=[ 'args', 'fetch', + 'htmlfilter', 'paperdoorknob', 'spec', 'texify', diff --git a/spec.py b/spec.py index 1322897..98832bf 100644 --- a/spec.py +++ b/spec.py @@ -7,7 +7,7 @@ from dataclasses import dataclass -from typing import IO +from typing import Callable, IO from fetch import Fetcher from texify import Texifier @@ -17,5 +17,6 @@ from texify import Texifier class Spec: url: str fetcher: Fetcher + htmlfilter: Callable[[bytes], bytes] texifier: Texifier texout: IO[bytes]