From 8be20b9d36c49271ed392038750dfba981c283b6 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 21:57:13 -0800 Subject: [PATCH] Extensible, flag-controlled DOM filters --- args.py | 6 ++++++ domfilter.py | 31 +++++++++++++++++++++++++++++++ domfilter_test.py | 42 ++++++++++++++++++++++++++++++++++++++++++ paperdoorknob.py | 11 ++--------- paperdoorknob_test.py | 20 ++++++++++++++------ setup.py | 1 + spec.py | 3 +++ 7 files changed, 99 insertions(+), 15 deletions(-) create mode 100644 domfilter.py create mode 100644 domfilter_test.py diff --git a/args.py b/args.py index 1931d42..2198a1d 100644 --- a/args.py +++ b/args.py @@ -13,6 +13,7 @@ from typing import Iterator from xdg_base_dirs import xdg_cache_home +from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher from htmlfilter import ApplyHTMLFilters, HTMLFilters from spec import Spec @@ -26,6 +27,10 @@ def _command_line_parser() -> ArgumentParser: metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--domfilters', + help='Which DOM filters to use (default: %(default)s)', + default=','.join(f[0] for f in DOMFilters)) parser.add_argument( '--htmlfilters', help='Which HTML filters to use (default: %(default)s)', @@ -53,5 +58,6 @@ def spec_from_commandline_args() -> Iterator[Spec]: args.url, fetcher, lambda x: ApplyHTMLFilters(args.htmlfilters, x), + lambda x: ApplyDOMFilters(args.domfilters, x), PandocTexifier(args.pandoc or 'pandoc'), texout) diff --git a/domfilter.py b/domfilter.py new file mode 100644 index 0000000..22f96a1 --- /dev/null +++ b/domfilter.py @@ -0,0 +1,31 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +from typing import Any, Callable, List, Tuple + +from bs4.element import Tag + + +DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [ + ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]), + ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]), +] + + +class DOMFilterError(Exception): + pass + + +def ApplyDOMFilters(filter_list: str, tag: Tag) -> None: + for filter_name in filter_list.split(','): + filters = [f for (name, f) in DOMFilters if name == filter_name] + if len(filters) == 0: + raise DOMFilterError(f"Unknown DOM filter: {filter_name}") + if len(filters) > 1: + raise DOMFilterError( + f"Multiple DOM filters with the same name!?: {filter_name}") + filters[0](tag) diff --git a/domfilter_test.py b/domfilter_test.py new file mode 100644 index 0000000..c3c8ee2 --- /dev/null +++ b/domfilter_test.py @@ -0,0 +1,42 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest + +from bs4 import BeautifulSoup + +from domfilter import ApplyDOMFilters + + +class TestDOMFilters(unittest.TestCase): + + def setUp(self) -> None: + self._html = BeautifulSoup(b''' +
+
This is the edit box
+ This is glowfic + +
''', 'html.parser') + + def testStripFooters(self) -> None: + ApplyDOMFilters("NoFooter", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is the edit box", "This is glowfic"]) + + def testStripEditBoxes(self) -> None: + ApplyDOMFilters("NoEdit", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is glowfic", "This is the footer"]) + + def testStripFootersAndEditBoxes(self) -> None: + ApplyDOMFilters("NoEdit,NoFooter", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is glowfic"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 34223e4..2bcf31a 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -20,14 +20,6 @@ def parse(content: bytes) -> BeautifulSoup: return BeautifulSoup(content, 'html.parser') -def clean(html: BeautifulSoup) -> BeautifulSoup: - for eb in html.find_all("div", class_="post-edit-box"): - eb.decompose() - for footer in html.find_all("div", class_="post-footer"): - footer.decompose() - return html - - def replies(html: BeautifulSoup) -> Iterable[Tag]: def text() -> Tag: body = html.body @@ -46,8 +38,9 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))) + html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) for r in replies(html): + spec.domfilter(r) spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 14f6dd9..aa50340 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -13,6 +13,7 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer +from domfilter import ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier @@ -31,11 +32,10 @@ class BaseTestProcess(ABC): raise NotImplementedError() def testReplies(self) -> None: - replies = paperdoorknob.replies( - paperdoorknob.clean( - paperdoorknob.parse( - self.fetcher().fetch( - self.url())))) + replies = list(paperdoorknob.replies( + paperdoorknob.parse(self.fetcher().fetch(self.url())))) + for r in replies: + ApplyDOMFilters('NoEdit,NoFooter', r) assert [r.text.strip() for r in replies] == [ "This is glowfic", "You sure?", @@ -47,6 +47,7 @@ class BaseTestProcess(ABC): self.url(), self.fetcher(), lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), buf) paperdoorknob.process(spec) @@ -62,7 +63,13 @@ Pretty sure. texifier = VerifyingTexifier( PandocTexifier('pandoc'), DirectTexifier()) buf = io.BytesIO() - spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf) + spec = Spec( + self.url(), + self.fetcher(), + lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + texifier, + buf) paperdoorknob.process(spec) def testPDF(self) -> None: @@ -71,6 +78,7 @@ Pretty sure. self.url(), self.fetcher(), lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), out) paperdoorknob.process(spec) diff --git a/setup.py b/setup.py index e83ec01..948b6bc 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ setup( url="https://git.scottworley.com/paperdoorknob", py_modules=[ 'args', + 'domfilter', 'fetch', 'htmlfilter', 'paperdoorknob', diff --git a/spec.py b/spec.py index 98832bf..75d00c3 100644 --- a/spec.py +++ b/spec.py @@ -9,6 +9,8 @@ from dataclasses import dataclass from typing import Callable, IO +from bs4.element import Tag + from fetch import Fetcher from texify import Texifier @@ -18,5 +20,6 @@ class Spec: url: str fetcher: Fetcher htmlfilter: Callable[[bytes], bytes] + domfilter: Callable[[Tag], None] texifier: Texifier texout: IO[bytes] -- 2.44.1