git.scottworley.com Git - paperdoorknob/commitdiff
Reify Chunk
author Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 21:44:27 +0000 (13:44 -0800)
committer Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 22:55:38 +0000 (14:55 -0800)
glowfic.py
glowfic_test.py
testdata/empty-content.html [new file with mode: 0644]

index 901246b10533f390ff565263fc9d5b1465b0af3c..41697867b2453e0af35175de866f02fe71409c54 100644 (file)
@@ -5,6 +5,7 @@
 # Free Software Foundation, version 3.
 
 
+from dataclasses import dataclass
 import itertools
 
 from typing import Iterable
@@ -12,15 +13,26 @@ from typing import Iterable
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 
+from images import ImageStore
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """The pieces of one chunk (a post or a reply) that makeChunk extracts.
+
+    The dataclass is frozen, so fields cannot be reassigned after
+    construction.  The optional fields are None when makeChunk does not
+    find the corresponding element.
+    """
+    # Result of ImageStore.get_image() for the icon <img>'s src attribute.
+    icon: str | None
+    # Stripped text of the "post-character" div.
+    character: str | None
+    # Stripped text of the "post-screenname" div.
+    screen_name: str | None
+    # Stripped text of the "post-author" div.
+    author: str | None
+    # The "post-content" div itself, kept as a DOM element rather than text.
+    content: Tag
+
 # We avoid the name "post" because the Glowfic community uses the term
 # inconsistently:
 #  * The Glowfic software sometimes uses "post" to refer to a whole thread
-#    (eg: in the URL), but more often uses "post" to refer to just the first
-#    chunk in a thread.  The non-first chunks are "replies".
+#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
+#    but mostly uses "post" to refer to just the first chunk in a thread
+#    (in the HTML and UI).  The non-first chunks are "replies".
 #  * Readers and this software don't need to distinguish first-chunks and
 #    non-first-chunks.
-#  * Humans in the community tend to use "posts" to mean "chunks" ("replies"
-#    in the Glowfic software's lexicon).
+#  * Humans in the community tend to use "posts" to mean chunks.
 
 
 def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]:
@@ -37,3 +49,31 @@ def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]:
         return rs
 
     return itertools.chain([text()], the_replies())
+
+
+def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
+    """Build a Chunk from the DOM of a single post or reply.
+
+    Every lookup uses find(), which searches only the descendants of
+    chunk_dom.  find_next() would keep walking the rest of the document,
+    so a chunk lacking (for example) an icon or a character would silently
+    pick up the one belonging to a later chunk instead of yielding None.
+
+    Args:
+        chunk_dom: The element that contains one chunk, such as the
+            elements yielded by chunkDOMs.
+        image_store: Its get_image() is called with the icon's src
+            attribute; whatever it returns becomes Chunk.icon.
+
+    Returns:
+        A Chunk whose optional fields are None when the corresponding
+        element is absent from chunk_dom.
+
+    Raises:
+        AssertionError: chunk_dom contains no div of class "post-content".
+        KeyError: the icon's <img> element has no src attribute.
+    """
+
+    def getIcon() -> str | None:
+        # The icon is the first <img> inside this chunk's post-icon div.
+        icon_div = chunk_dom.find('div', class_='post-icon')
+        if not isinstance(icon_div, Tag):
+            return None
+        icon_img = icon_div.find('img')
+        if not isinstance(icon_img, Tag):
+            return None
+        return image_store.get_image(icon_img.attrs['src'])
+
+    def getTextByClass(css_class: str) -> str | None:
+        # Text of this chunk's div of the given class; None if there is none.
+        div = chunk_dom.find('div', class_=css_class)
+        if div is None:
+            return None
+        return div.text.strip()
+
+    content = chunk_dom.find('div', class_='post-content')
+    # Narrows the type for the checker and rejects chunks without content.
+    assert isinstance(content, Tag)
+
+    return Chunk(getIcon(),
+                 getTextByClass('post-character'),
+                 getTextByClass('post-screenname'),
+                 getTextByClass('post-author'),
+                 content)
index d84733382a52a4f63692060e1b746d3bf9849796..d3a6d563f8930150eb32296ceb1973afb96d3af8 100644 (file)
@@ -9,7 +9,8 @@ import unittest
 
 from bs4 import BeautifulSoup
 
-from glowfic import chunkDOMs
+from images import FakeImageStore
+from glowfic import chunkDOMs, makeChunk
 
 
 class TestSplit(unittest.TestCase):
@@ -46,5 +47,22 @@ class TestSplit(unittest.TestCase):
                          [['The "post"'], ['1st reply'], ['2nd reply']])
 
 
+class TestMakeChunk(unittest.TestCase):
+    """Exercises makeChunk on a saved page containing a single chunk."""
+
+    def testEmptyContent(self) -> None:
+        # The fixture's post-content div holds nothing but an empty <p>.
+        with open('testdata/empty-content.html', 'rb') as fixture:
+            page = BeautifulSoup(fixture, 'html.parser')
+        first_dom = next(iter(chunkDOMs(page)))
+        chunk = makeChunk(first_dom, FakeImageStore())
+
+        expected_icon = (
+            'stored:https://d1anwqy6ci9o1i.cloudfront.net/'
+            'users%2F366%2Ficons%2Fxqmypqvflgdy28aorw9ml_shock.png')
+        self.assertEqual(chunk.icon, expected_icon)
+        self.assertEqual(chunk.character, 'Keltham')
+        self.assertEqual(chunk.screen_name, 'lawful chaotic')
+        self.assertEqual(chunk.author, 'Iarwain')
+
+        expected_content = '<div class="post-content"><p></p></div>'
+        self.assertEqual(str(chunk.content), expected_content)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/testdata/empty-content.html b/testdata/empty-content.html
new file mode 100644 (file)
index 0000000..e1dc3f5
--- /dev/null
@@ -0,0 +1,32 @@
+<html>
+  <body>
+    <div class="post-container post-post ">
+      <a id="reply-1616124" class="noheight"> </a>
+      <div class="padding-10">
+        <div class="post-info-box">
+          <div class="post-icon">
+            <a href="/icons/200477">
+              <img alt="shock" title="shock" class="icon" src="https://d1anwqy6ci9o1i.cloudfront.net/users%2F366%2Ficons%2Fxqmypqvflgdy28aorw9ml_shock.png">
+            </a>
+          </div>
+          <div class="post-info-text">
+            <div class="post-character">
+              <a href="/characters/11729">Keltham</a>
+            </div>
+            <div class="post-screenname">lawful chaotic</div>
+            <div class="post-author"><a href="/users/366">Iarwain</a></div>
+          </div>
+        </div>
+        <div class="post-edit-box">
+          <a rel="alternate" href="/replies/1616124#reply-1616124">
+            <img title="Permalink" alt="Permalink" src="https://dhtmoj33sf3e0.cloudfront.net/assets/icons/link-bb9df2e290558f33c20c21f4a2a85841eb4ccb1bd09f6266d3e80679f30ccf62.png">
+          </a>
+        </div>
+        <div class="post-content"><p></p></div>
+      </div>
+      <div class="post-footer"><div class="right-align"><div class="padding-5">
+        Posted <span class="post-posted"><time datetime="2021-06-29T02:48:07Z" title="2021-06-29 02:48 UTC">Jun 28, 2021, 7:48 PM</time></span>
+      </div></div></div>
+    </div>
+  </body>
+</html>