From: Scott Worley Date: Wed, 20 Dec 2023 19:43:09 +0000 (-0800) Subject: More structure and tests around splitting the page into chunks' DOMs. X-Git-Url: http://git.scottworley.com/paperdoorknob/commitdiff_plain/e6adf6ced68d667429975110f2d1a1bd9c8d79b6?ds=inline More structure and tests around splitting the page into chunks' DOMs. --- diff --git a/glowfic.py b/glowfic.py new file mode 100644 index 0000000..901246b --- /dev/null +++ b/glowfic.py @@ -0,0 +1,39 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import itertools + +from typing import Iterable + +from bs4 import BeautifulSoup +from bs4.element import Tag + +# We avoid the name "post" because the Glowfic community uses the term +# inconsistently: +# * The Glowfic software sometimes uses "post" to refer to a whole thread +# (eg: in the URL), but more often uses "post" to refer to just the first +# chunk in a thread. The non-first chunks are "replies". +# * Readers and this software don't need to distinguish first-chunks and +# non-first-chunks. +# * Humans in the community tend to use "posts" to mean "chunks" ("replies" +# in the Glowfic software's lexicon). + + +def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: + def text() -> Tag: + body = html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def the_replies() -> Iterable[Tag]: + rs = html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in rs) + return rs + + return itertools.chain([text()], the_replies()) diff --git a/glowfic_test.py b/glowfic_test.py new file mode 100644 index 0000000..d847333 --- /dev/null +++ b/glowfic_test.py @@ -0,0 +1,50 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest + +from bs4 import BeautifulSoup + +from glowfic import chunkDOMs + + +class TestSplit(unittest.TestCase): + + def testSplit1(self) -> None: + soup = BeautifulSoup(b''' +
+ The "post" +
''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"']]) + + def testSplit2(self) -> None: + soup = BeautifulSoup(b''' + +
The "post"
+
+
The "reply"
+
+ ''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"'], ['The "reply"']]) + + def testSplit3(self) -> None: + soup = BeautifulSoup(b''' + +
The "post"
+
+
1st reply
+
2nd reply
+
+ ''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"'], ['1st reply'], ['2nd reply']]) + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 26eb7be..84ed702 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -5,14 +5,10 @@ # Free Software Foundation, version 3. -import itertools - -from typing import Iterable - from bs4 import BeautifulSoup -from bs4.element import Tag from args import spec_from_commandline_args +from glowfic import chunkDOMs from spec import Spec @@ -20,22 +16,6 @@ def parse(content: bytes) -> BeautifulSoup: return BeautifulSoup(content, 'html.parser') -def replies(html: BeautifulSoup) -> Iterable[Tag]: - def text() -> Tag: - body = html.body - assert body - text = body.find_next("div", class_="post-post") - assert isinstance(text, Tag) - return text - - def the_replies() -> Iterable[Tag]: - rs = html.find_all("div", class_="post-reply") - assert all(isinstance(r, Tag) for r in rs) - return rs - - return itertools.chain([text()], the_replies()) - - def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n') if spec.geometry is not None: @@ -44,7 +24,7 @@ def process(spec: Spec) -> None: b']{geometry}\n') spec.texout.write(b'\\begin{document}\n') html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) - for r in replies(html): + for r in chunkDOMs(html): spec.domfilter(r) spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 65fc2a9..249b6e4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -32,16 +32,6 @@ class BaseTestProcess(ABC): def fetcher(self) -> Fetcher: raise NotImplementedError() - def testReplies(self) -> None: - replies = list(paperdoorknob.replies( - paperdoorknob.parse(self.fetcher().fetch(self.url())))) - for r in replies: - ApplyDOMFilters('NoEdit,NoFooter', r) - assert [r.text.strip() for r in replies] == [ - "This is glowfic", - "You sure?", - "Pretty sure."] - def testProcess(self) -> None: buf = io.BytesIO() spec = Spec( diff --git a/setup.py b/setup.py index 1ef11e6..2152739 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ setup( 'args', 'domfilter', 'fetch', + 'glowfic', 'htmlfilter', 'images', 'paperdoorknob',