git.scottworley.com Git - paperdoorknob/commitdiff
Extensible, flag-controlled DOM filters
author Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 05:57:13 +0000 (21:57 -0800)
committer Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 05:57:13 +0000 (21:57 -0800)
args.py
domfilter.py [new file with mode: 0644]
domfilter_test.py [new file with mode: 0644]
paperdoorknob.py
paperdoorknob_test.py
setup.py
spec.py

diff --git a/args.py b/args.py
index 1931d4271f242dc918287d28a7bedf2e4ee71f81..2198a1dae5e13a50ce23269093d3414072d66227 100644 (file)
--- a/args.py
+++ b/args.py
@@ -13,6 +13,7 @@ from typing import Iterator
 
 from xdg_base_dirs import xdg_cache_home
 
 
 from xdg_base_dirs import xdg_cache_home
 
+from domfilter import ApplyDOMFilters, DOMFilters
 from fetch import CachingFetcher
 from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
 from fetch import CachingFetcher
 from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
@@ -26,6 +27,10 @@ def _command_line_parser() -> ArgumentParser:
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+    parser.add_argument(
+        '--domfilters',
+        help='Which DOM filters to use (default: %(default)s)',
+        default=','.join(f[0] for f in DOMFilters))
     parser.add_argument(
         '--htmlfilters',
         help='Which HTML filters to use (default: %(default)s)',
     parser.add_argument(
         '--htmlfilters',
         help='Which HTML filters to use (default: %(default)s)',
@@ -53,5 +58,6 @@ def spec_from_commandline_args() -> Iterator[Spec]:
                 args.url,
                 fetcher,
                 lambda x: ApplyHTMLFilters(args.htmlfilters, x),
                 args.url,
                 fetcher,
                 lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+                lambda x: ApplyDOMFilters(args.domfilters, x),
                 PandocTexifier(args.pandoc or 'pandoc'),
                 texout)
                 PandocTexifier(args.pandoc or 'pandoc'),
                 texout)
diff --git a/domfilter.py b/domfilter.py
new file mode 100644 (file)
index 0000000..22f96a1
--- /dev/null
+++ b/domfilter.py
@@ -0,0 +1,31 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+from typing import Any, Callable, List, Tuple
+
+from bs4.element import Tag
+
+
+DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [
+    ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]),
+    ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]),
+]
+
+
+class DOMFilterError(Exception):
+    pass
+
+
+def ApplyDOMFilters(filter_list: str, tag: Tag) -> None:
+    for filter_name in filter_list.split(','):
+        filters = [f for (name, f) in DOMFilters if name == filter_name]
+        if len(filters) == 0:
+            raise DOMFilterError(f"Unknown DOM filter: {filter_name}")
+        if len(filters) > 1:
+            raise DOMFilterError(
+                f"Multiple DOM filters with the same name!?: {filter_name}")
+        filters[0](tag)
diff --git a/domfilter_test.py b/domfilter_test.py
new file mode 100644 (file)
index 0000000..c3c8ee2
--- /dev/null
+++ b/domfilter_test.py
@@ -0,0 +1,42 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+
+from bs4 import BeautifulSoup
+
+from domfilter import ApplyDOMFilters
+
+
+class TestDOMFilters(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._html = BeautifulSoup(b'''
+            <div class="post-container post-post">
+              <div class="post-edit-box">This is the edit box</div>
+              This is glowfic
+              <div class="post-footer">This is the footer</div>
+            </div>''', 'html.parser')
+
+    def testStripFooters(self) -> None:
+        ApplyDOMFilters("NoFooter", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is the edit box", "This is glowfic"])
+
+    def testStripEditBoxes(self) -> None:
+        ApplyDOMFilters("NoEdit", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is glowfic", "This is the footer"])
+
+    def testStripFootersAndEditBoxes(self) -> None:
+        ApplyDOMFilters("NoEdit,NoFooter", self._html)
+        self.assertEqual(list(self._html.stripped_strings),
+                         ["This is glowfic"])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py
index 34223e4c35eb7636370c1032764de6455c075933..2bcf31afe3c1b38407eec51f79babed956847fc9 100644 (file)
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -20,14 +20,6 @@ def parse(content: bytes) -> BeautifulSoup:
     return BeautifulSoup(content, 'html.parser')
 
 
     return BeautifulSoup(content, 'html.parser')
 
 
-def clean(html: BeautifulSoup) -> BeautifulSoup:
-    for eb in html.find_all("div", class_="post-edit-box"):
-        eb.decompose()
-    for footer in html.find_all("div", class_="post-footer"):
-        footer.decompose()
-    return html
-
-
 def replies(html: BeautifulSoup) -> Iterable[Tag]:
     def text() -> Tag:
         body = html.body
 def replies(html: BeautifulSoup) -> Iterable[Tag]:
     def text() -> Tag:
         body = html.body
@@ -46,8 +38,9 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]:
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
-    html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
+    html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))
     for r in replies(html):
     for r in replies(html):
+        spec.domfilter(r)
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
 
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
 
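diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py
index 14f6dd9cde0057e459c3fa7bd961c1fdf02fc436..aa50340ecfd7709ed3437b9ce006d22a5fe1804d 100644 (file)
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py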
@@ -13,6 +13,7 @@ import subprocess
 import paperdoorknob
 
 from testing.fakeserver import FakeGlowficServer
 import paperdoorknob
 
 from testing.fakeserver import FakeGlowficServer
+from domfilter import ApplyDOMFilters
 from fetch import DirectFetcher, FakeFetcher, Fetcher
 from spec import Spec
 from texify import DirectTexifier, PandocTexifier, VerifyingTexifier
 from fetch import DirectFetcher, FakeFetcher, Fetcher
 from spec import Spec
 from texify import DirectTexifier, PandocTexifier, VerifyingTexifier
@@ -31,11 +32,10 @@ class BaseTestProcess(ABC):
         raise NotImplementedError()
 
     def testReplies(self) -> None:
         raise NotImplementedError()
 
     def testReplies(self) -> None:
-        replies = paperdoorknob.replies(
-            paperdoorknob.clean(
-                paperdoorknob.parse(
-                    self.fetcher().fetch(
-                        self.url()))))
+        replies = list(paperdoorknob.replies(
+            paperdoorknob.parse(self.fetcher().fetch(self.url()))))
+        for r in replies:
+            ApplyDOMFilters('NoEdit,NoFooter', r)
         assert [r.text.strip() for r in replies] == [
             "This is glowfic",
             "You sure?",
         assert [r.text.strip() for r in replies] == [
             "This is glowfic",
             "You sure?",
@@ -47,6 +47,7 @@ class BaseTestProcess(ABC):
             self.url(),
             self.fetcher(),
             lambda x: x,
             self.url(),
             self.fetcher(),
             lambda x: x,
+            lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
@@ -62,7 +63,13 @@ Pretty sure.
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
-        spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
+        spec = Spec(
+            self.url(),
+            self.fetcher(),
+            lambda x: x,
+            lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
+            texifier,
+            buf)
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
@@ -71,6 +78,7 @@ Pretty sure.
                 self.url(),
                 self.fetcher(),
                 lambda x: x,
                 self.url(),
                 self.fetcher(),
                 lambda x: x,
+                lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
diff --git a/setup.py b/setup.py
index e83ec01dfe86a4e1c8aca6e6fd3d0d020a245736..948b6bc5f3a79544f2411bc22df6509c27105f4d 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@ setup(
     url="https://git.scottworley.com/paperdoorknob",
     py_modules=[
         'args',
     url="https://git.scottworley.com/paperdoorknob",
     py_modules=[
         'args',
+        'domfilter',
         'fetch',
         'htmlfilter',
         'paperdoorknob',
         'fetch',
         'htmlfilter',
         'paperdoorknob',
diff --git a/spec.py b/spec.py
index 98832bfddfed9978ea67e14edca2b5900d5bc56d..75d00c36f2040557d498eded29fde11dea2ea4e2 100644 (file)
--- a/spec.py
+++ b/spec.py
@@ -9,6 +9,8 @@ from dataclasses import dataclass
 
 from typing import Callable, IO
 
 
 from typing import Callable, IO
 
+from bs4.element import Tag
+
 from fetch import Fetcher
 from texify import Texifier
 
 from fetch import Fetcher
 from texify import Texifier
 
@@ -18,5 +20,6 @@ class Spec:
     url: str
     fetcher: Fetcher
     htmlfilter: Callable[[bytes], bytes]
     url: str
     fetcher: Fetcher
     htmlfilter: Callable[[bytes], bytes]
+    domfilter: Callable[[Tag], None]
     texifier: Texifier
     texout: IO[bytes]
     texifier: Texifier
     texout: IO[bytes]