from xdg_base_dirs import xdg_cache_home
+from domfilter import ApplyDOMFilters, DOMFilters
from fetch import CachingFetcher
from htmlfilter import ApplyHTMLFilters, HTMLFilters
from spec import Spec
metavar='PATH',
help='Where to keep the http cache (instead of %(default)s)',
default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+ parser.add_argument(
+ '--domfilters',
+ help='Which DOM filters to use (default: %(default)s)',
+ default=','.join(f[0] for f in DOMFilters))
parser.add_argument(
'--htmlfilters',
help='Which HTML filters to use (default: %(default)s)',
args.url,
fetcher,
lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+ lambda x: ApplyDOMFilters(args.domfilters, x),
PandocTexifier(args.pandoc or 'pandoc'),
texout)
--- /dev/null
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+from typing import Any, Callable, List, Tuple
+
+from bs4.element import Tag
+
+
+def _decompose_divs(tag: Tag, css_class: str) -> None:
+    """Remove every <div> carrying css_class from tag, in place."""
+    # A plain loop rather than a list comprehension: decompose() is called
+    # purely for its side effect, so there is no result worth collecting.
+    for div in tag.find_all("div", class_=css_class):
+        div.decompose()
+
+
+# Registry of the available DOM filters as (name, function) pairs.  Each
+# function edits the Tag it is given in place.  ApplyDOMFilters looks filters
+# up by exact name, and the names joined in list order form the default value
+# of the --domfilters command-line option.
+DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [
+    ("NoEdit", lambda x: _decompose_divs(x, "post-edit-box")),
+    ("NoFooter", lambda x: _decompose_divs(x, "post-footer")),
+]
+
+
+class DOMFilterError(Exception):
+    """Raised when a DOM filter name does not resolve to exactly one filter."""
+
+
+def ApplyDOMFilters(filter_list: str, tag: Tag) -> None:
+    """Apply the named DOM filters to tag, in the order listed.
+
+    filter_list is a comma-separated string of names from DOMFilters.
+    Whitespace around a name is ignored and blank entries are skipped, so
+    an empty string applies no filters at all.  The filters are called for
+    their effect on tag; nothing is returned.
+
+    Raises DOMFilterError if a name matches no registered filter, or
+    matches more than one.
+    """
+    for raw_name in filter_list.split(','):
+        filter_name = raw_name.strip()
+        if not filter_name:
+            # ''.split(',') is [''], not []: without this check an empty
+            # filter list would be rejected as an unknown filter named ''.
+            continue
+        filters = [f for (name, f) in DOMFilters if name == filter_name]
+        if not filters:
+            raise DOMFilterError(f"Unknown DOM filter: {filter_name}")
+        if len(filters) > 1:
+            raise DOMFilterError(
+                f"Multiple DOM filters with the same name!?: {filter_name}")
+        filters[0](tag)
--- /dev/null
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+
+from bs4 import BeautifulSoup
+
+from domfilter import ApplyDOMFilters
+
+
+class TestDOMFilters(unittest.TestCase):
+ # Exercises the NoEdit and NoFooter DOM filters, alone and combined,
+ # against a small hand-written post fragment.
+
+ # Parse a sample post container holding an edit box, the post text and a
+ # footer, so each test can check which of the three strings survive.
+ def setUp(self) -> None:
+ self._html = BeautifulSoup(b'''
+ <div class="post-container post-post">
+ <div class="post-edit-box">This is the edit box</div>
+ This is glowfic
+ <div class="post-footer">This is the footer</div>
+ </div>''', 'html.parser')
+
+ # NoFooter alone: the footer text goes, the edit box and post text stay.
+ def testStripFooters(self) -> None:
+ ApplyDOMFilters("NoFooter", self._html)
+ self.assertEqual(list(self._html.stripped_strings),
+ ["This is the edit box", "This is glowfic"])
+
+ # NoEdit alone: the edit box text goes, the post text and footer stay.
+ def testStripEditBoxes(self) -> None:
+ ApplyDOMFilters("NoEdit", self._html)
+ self.assertEqual(list(self._html.stripped_strings),
+ ["This is glowfic", "This is the footer"])
+
+ # Both filters given as one comma-separated list: only the post text stays.
+ def testStripFootersAndEditBoxes(self) -> None:
+ ApplyDOMFilters("NoEdit,NoFooter", self._html)
+ self.assertEqual(list(self._html.stripped_strings),
+ ["This is glowfic"])
+
+
+if __name__ == '__main__':
+ # Run the tests in this module when the file is executed directly.
+ unittest.main()
return BeautifulSoup(content, 'html.parser')
-def clean(html: BeautifulSoup) -> BeautifulSoup:
- for eb in html.find_all("div", class_="post-edit-box"):
- eb.decompose()
- for footer in html.find_all("div", class_="post-footer"):
- footer.decompose()
- return html
-
-
def replies(html: BeautifulSoup) -> Iterable[Tag]:
def text() -> Tag:
body = html.body
def process(spec: Spec) -> None:
spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
- html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
+ html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))
for r in replies(html):
+ spec.domfilter(r)
spec.texout.write(spec.texifier.texify(r))
spec.texout.write(b'\\end{document}\n')
import paperdoorknob
from testing.fakeserver import FakeGlowficServer
+from domfilter import ApplyDOMFilters
from fetch import DirectFetcher, FakeFetcher, Fetcher
from spec import Spec
from texify import DirectTexifier, PandocTexifier, VerifyingTexifier
raise NotImplementedError()
def testReplies(self) -> None:
- replies = paperdoorknob.replies(
- paperdoorknob.clean(
- paperdoorknob.parse(
- self.fetcher().fetch(
- self.url()))))
+ replies = list(paperdoorknob.replies(
+ paperdoorknob.parse(self.fetcher().fetch(self.url()))))
+ for r in replies:
+ ApplyDOMFilters('NoEdit,NoFooter', r)
assert [r.text.strip() for r in replies] == [
"This is glowfic",
"You sure?",
self.url(),
self.fetcher(),
lambda x: x,
+ lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
PandocTexifier('pandoc'),
buf)
paperdoorknob.process(spec)
texifier = VerifyingTexifier(
PandocTexifier('pandoc'), DirectTexifier())
buf = io.BytesIO()
- spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
+ spec = Spec(
+ self.url(),
+ self.fetcher(),
+ lambda x: x,
+ lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
+ texifier,
+ buf)
paperdoorknob.process(spec)
def testPDF(self) -> None:
self.url(),
self.fetcher(),
lambda x: x,
+ lambda x: ApplyDOMFilters('NoEdit,NoFooter', x),
PandocTexifier('pandoc'),
out)
paperdoorknob.process(spec)
url="https://git.scottworley.com/paperdoorknob",
py_modules=[
'args',
+ 'domfilter',
'fetch',
'htmlfilter',
'paperdoorknob',
from typing import Callable, IO
+from bs4.element import Tag
+
from fetch import Fetcher
from texify import Texifier
url: str
fetcher: Fetcher
htmlfilter: Callable[[bytes], bytes]
+ domfilter: Callable[[Tag], None]
texifier: Texifier
texout: IO[bytes]