From: Scott Worley Date: Fri, 12 Jan 2024 02:17:26 +0000 (-0800) Subject: Reify Thread X-Git-Url: http://git.scottworley.com/paperdoorknob/commitdiff_plain/940270992010ea5b4c912eb02f502923c0487a31?ds=sidebyside Reify Thread --- diff --git a/glowfic.py b/glowfic.py index 012bd06..372b3aa 100644 --- a/glowfic.py +++ b/glowfic.py @@ -54,20 +54,25 @@ class Chunk: # * Humans in the community tend to use "posts" to mean chunks. -def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: - def text() -> Tag: - body = html.body - assert body - text = body.find_next("div", class_="post-post") - assert isinstance(text, Tag) - return text - - def the_replies() -> Iterable[Tag]: - rs = html.find_all("div", class_="post-reply") - assert all(isinstance(r, Tag) for r in rs) - return rs - - return itertools.chain([text()], the_replies()) +class Thread: + + def __init__(self, html: BeautifulSoup) -> None: + self._html = html + + def chunkDOMs(self) -> Iterable[Tag]: + def text() -> Tag: + body = self._html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def the_replies() -> Iterable[Tag]: + rs = self._html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in rs) + return rs + + return itertools.chain([text()], the_replies()) def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: diff --git a/glowfic_test.py b/glowfic_test.py index 2d01c2a..ed32945 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -10,41 +10,41 @@ import unittest from bs4 import BeautifulSoup from images import FakeImageStore -from glowfic import chunkDOMs, makeChunk +from glowfic import makeChunk, Thread from texify import PandocTexifier class TestSplit(unittest.TestCase): def testSplit1(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post" -
''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + ''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"']]) def testSplit2(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post"
The "reply"
- ''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + ''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['The "reply"']]) def testSplit3(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post"
1st reply
2nd reply
- ''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + ''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['1st reply'], ['2nd reply']]) @@ -52,8 +52,8 @@ class TestMakeChunk(unittest.TestCase): def testEmptyContent(self) -> None: with open('testdata/empty-content.html', 'rb') as f: - soup = BeautifulSoup(f, 'html.parser') - c = makeChunk(next(iter(chunkDOMs(soup))), FakeImageStore()) + t = Thread(BeautifulSoup(f, 'html.parser')) + c = makeChunk(next(iter(t.chunkDOMs())), FakeImageStore()) self.assertEqual( c.icon, 'stored:https://d1anwqy6ci9o1i.cloudfront.net/' + diff --git a/paperdoorknob.py b/paperdoorknob.py index 1350784..ade0b88 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from bs4.element import Tag from args import spec_from_commandline_args -from glowfic import chunkDOMs, flatURL, makeChunk, renderChunk +from glowfic import flatURL, makeChunk, renderChunk, Thread from spec import Spec @@ -57,10 +57,11 @@ def process(spec: Spec) -> None: html = spec.fetcher.fetch(url) spec.log('Parsing HTML...\r') dom = parse(spec.htmlfilter(html)) + thread = Thread(dom) spec.log('Counting chunks...\r') - num_chunks = ilen(chunkDOMs(dom)) + num_chunks = ilen(thread.chunkDOMs()) title = get_title(dom) or "chunk" - for i, r in enumerate(chunkDOMs(dom)): + for i, r in enumerate(thread.chunkDOMs()): percent = 100.0 * i / num_chunks spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r)