More structure and tests around splitting the page into chunks' DOMs.

author Scott Worley <scottworley@scottworley.com>

Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)

committer Scott Worley <scottworley@scottworley.com>

Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)
author Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)
committer Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)
diff --git a/glowfic.py b/glowfic.py

new file mode 100644 (file)

index 0000000..901246b
--- /dev/null
+++ b/glowfic.py
@@ -0,0 +1,39 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import itertools
+
+from typing import Iterable
+
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+
+# We avoid the name "post" because the Glowfic community uses the term
+# inconsistently:
+#  * The Glowfic software sometimes uses "post" to refer to a whole thread
+#    (eg: in the URL), but more often uses "post" to refer to just the first
+#    chunk in a thread.  The non-first chunks are "replies".
+#  * Readers and this software don't need to distinguish first-chunks and
+#    non-first-chunks.
+#  * Humans in the community tend to use "posts" to mean "chunks" ("replies"
+#    in the Glowfic software's lexicon).
+
+
+def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]:
+    """Return the thread's first chunk followed by all of its reply chunks.
+
+    The first chunk is the first "post-post" div located after the <body>
+    element; the replies are every "post-reply" div in the document.  Both
+    lookups (and their assertions) run before this function returns; only
+    the iteration over the results is deferred to the caller.
+    """
+    body = html.body
+    assert body
+    # The opening chunk of the thread.
+    opening = body.find_next("div", class_="post-post")
+    assert isinstance(opening, Tag)
+    # Every subsequent chunk.
+    followups = html.find_all("div", class_="post-reply")
+    assert all(isinstance(chunk, Tag) for chunk in followups)
+    return itertools.chain([opening], followups)
diff --git a/glowfic_test.py b/glowfic_test.py

new file mode 100644 (file)

index 0000000..d847333
--- /dev/null
+++ b/glowfic_test.py
@@ -0,0 +1,50 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+
+from bs4 import BeautifulSoup
+
+from glowfic import chunkDOMs
+
+
+class TestSplit(unittest.TestCase):
+    """Check how chunkDOMs splits a parsed page into per-chunk DOMs."""
+
+    def testSplit1(self) -> None:
+        # A page that holds only the first chunk.
+        markup = b'''
+            <html><body><div class="post-container post-post">
+              The "post"
+            </div></body></html>'''
+        chunks = chunkDOMs(BeautifulSoup(markup, 'html.parser'))
+        found = [list(chunk.stripped_strings) for chunk in chunks]
+        self.assertEqual(found, [['The "post"']])
+
+    def testSplit2(self) -> None:
+        # The first chunk plus a single reply.
+        markup = b'''
+            <html><body>
+              <div class="post-container post-post">The "post"</div>
+              <div class="flat-post-replies">
+                <div class="post-container post-reply">The "reply"</div>
+              </div>
+            </body></html>'''
+        chunks = chunkDOMs(BeautifulSoup(markup, 'html.parser'))
+        found = [list(chunk.stripped_strings) for chunk in chunks]
+        self.assertEqual(found, [['The "post"'], ['The "reply"']])
+
+    def testSplit3(self) -> None:
+        # The first chunk plus two replies, expected in document order.
+        markup = b'''
+            <html><body>
+              <div class="post-container post-post">The "post"</div>
+              <div class="flat-post-replies">
+                <div class="post-container post-reply">1st reply</div>
+                <div class="post-container post-reply">2nd reply</div>
+              </div>
+            </body></html>'''
+        chunks = chunkDOMs(BeautifulSoup(markup, 'html.parser'))
+        found = [list(chunk.stripped_strings) for chunk in chunks]
+        self.assertEqual(
+            found, [['The "post"'], ['1st reply'], ['2nd reply']])
+
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py

index 26eb7be054a823b6183bea6dff3415dfef713a6a..84ed70228d3d9b4040fc765c3ced2be68d1d58ea 100644 (file)
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -5,14 +5,10 @@
  # Free Software Foundation, version 3.
  
  
  # Free Software Foundation, version 3.
  
  
-import itertools
-
-from typing import Iterable
-
  from bs4 import BeautifulSoup
  from bs4 import BeautifulSoup
-from bs4.element import Tag
  
  from args import spec_from_commandline_args
  
  from args import spec_from_commandline_args
+from glowfic import chunkDOMs
  from spec import Spec
  
  
  from spec import Spec
  
  
@@ -20,22 +16,6 @@ def parse(content: bytes) -> BeautifulSoup:
      return BeautifulSoup(content, 'html.parser')
  
  
      return BeautifulSoup(content, 'html.parser')
  
  
-def replies(html: BeautifulSoup) -> Iterable[Tag]:
-    def text() -> Tag:
-        body = html.body
-        assert body
-        text = body.find_next("div", class_="post-post")
-        assert isinstance(text, Tag)
-        return text
-
-    def the_replies() -> Iterable[Tag]:
-        rs = html.find_all("div", class_="post-reply")
-        assert all(isinstance(r, Tag) for r in rs)
-        return rs
-
-    return itertools.chain([text()], the_replies())
-
-
  def process(spec: Spec) -> None:
      spec.texout.write(b'\\documentclass{article}\n')
      if spec.geometry is not None:
  def process(spec: Spec) -> None:
      spec.texout.write(b'\\documentclass{article}\n')
      if spec.geometry is not None:
@@ -44,7 +24,7 @@ def process(spec: Spec) -> None:
                            b']{geometry}\n')
      spec.texout.write(b'\\begin{document}\n')
      html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))
                            b']{geometry}\n')
      spec.texout.write(b'\\begin{document}\n')
      html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))
-    for r in replies(html):
+    for r in chunkDOMs(html):
          spec.domfilter(r)
          spec.texout.write(spec.texifier.texify(r))
      spec.texout.write(b'\\end{document}\n')
          spec.domfilter(r)
          spec.texout.write(spec.texifier.texify(r))
      spec.texout.write(b'\\end{document}\n')
diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py

index 65fc2a9f308fbba82657389a87dd9b0971b312da..249b6e42e936fa5f60a18e021c133dc824ca15a6 100644 (file)
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py
@@ -32,16 +32,6 @@ class BaseTestProcess(ABC):
      def fetcher(self) -> Fetcher:
          raise NotImplementedError()
  
      def fetcher(self) -> Fetcher:
          raise NotImplementedError()
  
-    def testReplies(self) -> None:
-        replies = list(paperdoorknob.replies(
-            paperdoorknob.parse(self.fetcher().fetch(self.url()))))
-        for r in replies:
-            ApplyDOMFilters('NoEdit,NoFooter', r)
-        assert [r.text.strip() for r in replies] == [
-            "This is glowfic",
-            "You sure?",
-            "Pretty sure."]
-
      def testProcess(self) -> None:
          buf = io.BytesIO()
          spec = Spec(
      def testProcess(self) -> None:
          buf = io.BytesIO()
          spec = Spec(
diff --git a/setup.py b/setup.py

index 1ef11e6e9ccebbd71f57eb57453bc64cecc3b1c4..21527392028491df1dfffc0d1b5e870662223d03 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,7 @@ setup(
          'args',
          'domfilter',
          'fetch',
          'args',
          'domfilter',
          'fetch',
+        'glowfic',
          'htmlfilter',
          'images',
          'paperdoorknob',
          'htmlfilter',
          'images',
          'paperdoorknob',
author	Scott Worley <scottworley@scottworley.com>
	Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)
committer	Scott Worley <scottworley@scottworley.com>
	Wed, 20 Dec 2023 19:43:09 +0000 (11:43 -0800)
glowfic.py	[new file with mode: 0644]	patch \| blob
glowfic_test.py	[new file with mode: 0644]	patch \| blob
paperdoorknob.py		patch \| blob \| blame \| history
paperdoorknob_test.py		patch \| blob \| blame \| history
setup.py		patch \| blob \| blame \| history