From 8be20b9d36c49271ed392038750dfba981c283b6 Mon Sep 17 00:00:00 2001
From: Scott Worley <scottworley@scottworley.com>
Date: Tue, 19 Dec 2023 21:57:13 -0800
Subject: [PATCH] Extensible, flag-controlled DOM filters

---
 args.py               |  6 ++++++
 domfilter.py          | 31 +++++++++++++++++++++++++++++++
 domfilter_test.py     | 42 ++++++++++++++++++++++++++++++++++++++++++
 paperdoorknob.py      | 11 ++---------
 paperdoorknob_test.py | 20 ++++++++++++++------
 setup.py              |  1 +
 spec.py               |  3 +++
 7 files changed, 99 insertions(+), 15 deletions(-)
 create mode 100644 domfilter.py
 create mode 100644 domfilter_test.py
diff --git a/args.py b/args.py
index 1931d42..2198a1d 100644
--- a/args.py
+++ b/args.py
@@ -13,6 +13,7 @@ from typing import Iterator
 
 from xdg_base_dirs import xdg_cache_home
 
+from domfilter import ApplyDOMFilters, DOMFilters
 from fetch import CachingFetcher
 from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
@@ -26,6 +27,10 @@ def _command_line_parser() -> ArgumentParser:
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+    parser.add_argument(
+        '--domfilters',
+        help='Which DOM filters to use (default: %(default)s)',
+        default=','.join(f[0] for f in DOMFilters))
     parser.add_argument(
         '--htmlfilters',
         help='Which HTML filters to use (default: %(default)s)',
@@ -53,5 +58,6 @@ def spec_from_commandline_args() -> Iterator[Spec]:
                 args.url,
                 fetcher,
                 lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+                lambda x: ApplyDOMFilters(args.domfilters, x),
                 PandocTexifier(args.pandoc or 'pandoc'),
                 texout)
diff --git a/domfilter.py b/domfilter.py
new file mode 100644
index 0000000..22f96a1
--- /dev/null
+++ b/domfilter.py
@@ -0,0 +1,31 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+from typing import Any, Callable, List, Tuple
+
+from bs4.element import Tag
+
+
+DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [  # (name, in-place filter) pairs
+    ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]),
+    ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]),
+]  # ApplyDOMFilters looks entries up by name and requires each name to be unique.
+
+
+class DOMFilterError(Exception):
+    """Raised by ApplyDOMFilters for an unknown or duplicated filter name."""
+
+
+def ApplyDOMFilters(filter_list: str, tag: Tag) -> None:
+    """Apply each comma-separated filter in filter_list ('' = none) to tag."""
+    for filter_name in (filter_list.split(',') if filter_list else []):
+        filters = [f for (name, f) in DOMFilters if name == filter_name]
+        if not filters:
+            raise DOMFilterError(f"Unknown DOM filter: {filter_name}")
+        if len(filters) > 1:
+            raise DOMFilterError(f"Multiple DOM filters with the same name!?: {filter_name}")
+        filters[0](tag)
diff --git a/domfilter_test.py b/domfilter_test.py
new file mode 100644
index 0000000..c3c8ee2
--- /dev/null
+++ b/domfilter_test.py
@@ -0,0 +1,42 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+
+from bs4 import BeautifulSoup
+
+from domfilter import ApplyDOMFilters
+
+
+class TestDOMFilters(unittest.TestCase):  # checks ApplyDOMFilters by filter name
+
+    def setUp(self) -> None:  # rebuilt before each test; the filters edit it in place
+        self._html = BeautifulSoup(b'''
+            <div class="post-container post-post">
+              <div class="post-edit-box">This is the edit box</div>
+              This is glowfic
+              <div class="post-footer">This is the footer</div>
+            </div>''', 'html.parser')
+
+    def testStripFooters(self) -> None:  # NoFooter removes only the footer div
+        ApplyDOMFilters("NoFooter", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is the edit box", "This is glowfic"])
+
+    def testStripEditBoxes(self) -> None:  # NoEdit removes only the edit-box div
+        ApplyDOMFilters("NoEdit", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is glowfic", "This is the footer"])
+
+    def testStripFootersAndEditBoxes(self) -> None:  # both filters, joined by ','
+        ApplyDOMFilters("NoEdit,NoFooter", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is glowfic"])
+
+
+if __name__ == '__main__':  # run the tests when this file is executed as a script
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py
index 34223e4..2bcf31a 100644
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -20,14 +20,6 @@ def parse(content: bytes) -> BeautifulSoup:
     return BeautifulSoup(content, 'html.parser')
 
 
-def clean(html: BeautifulSoup) -> BeautifulSoup:
-    for eb in html.find_all("div", class_="post-edit-box"):
-        eb.decompose()
-    for footer in html.find_all("div", class_="post-footer"):
-        footer.decompose()
-    return html
-
-
 def replies(html: BeautifulSoup) -> Iterable[Tag]:
     def text() -> Tag:
         body = html.body
@@ -46,8 +38,9 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]:
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
-    html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
+    html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))
     for r in replies(html):
+        spec.domfilter(r)
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
 
diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py
index 14f6dd9..aa50340 100644
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py
@@ -13,6 +13,7 @@ import subprocess
 import paperdoorknob
 
 from testing.fakeserver import FakeGlowficServer
+from domfilter import ApplyDOMFilters
 from fetch import DirectFetcher, FakeFetcher, Fetcher
 from spec import Spec
 from texify import DirectTexifier, PandocTexifier, VerifyingTexifier
@@ -31,11 +32,10 @@ class BaseTestProcess(ABC):
         raise NotImplementedError()
 
     def testReplies(self) -> None:
-        replies = paperdoorknob.replies(
-            paperdoorknob.clean(
-                paperdoorknob.parse(
-                    self.fetcher().fetch(
-                        self.url()))))
+        replies = list(paperdoorknob.replies(
+            paperdoorknob.parse(self.fetcher().fetch(self.url()))))
+        for r in replies:
+            ApplyDOMFilters('NoEdit,NoFooter', r)
         assert [r.text.strip() for r in replies] == [
             "This is glowfic",
             "You sure?",
@@ -47,6 +47,7 @@ class BaseTestProcess(ABC):
             self.url(),
             self.fetcher(),
             lambda x: x,
+            lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
@@ -62,7 +63,13 @@ Pretty sure.
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
-        spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
+        spec = Spec(
+            self.url(),
+            self.fetcher(),
+            lambda x: x,
+            lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
+            texifier,
+            buf)
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
@@ -71,6 +78,7 @@ Pretty sure.
                 self.url(),
                 self.fetcher(),
                 lambda x: x,
+                lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
diff --git a/setup.py b/setup.py
index e83ec01..948b6bc 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@ setup(
     url="https://git.scottworley.com/paperdoorknob",
     py_modules=[
         'args',
+        'domfilter',
         'fetch',
         'htmlfilter',
         'paperdoorknob',
diff --git a/spec.py b/spec.py
index 98832bf..75d00c3 100644
--- a/spec.py
+++ b/spec.py
@@ -9,6 +9,8 @@ from dataclasses import dataclass
 
 from typing import Callable, IO
 
+from bs4.element import Tag
+
 from fetch import Fetcher
 from texify import Texifier
 
@@ -18,5 +20,6 @@ class Spec:
     url: str
     fetcher: Fetcher
     htmlfilter: Callable[[bytes], bytes]
+    domfilter: Callable[[Tag], None]
     texifier: Texifier
     texout: IO[bytes]
-- 
2.44.1