# * Humans in the community tend to use "posts" to mean chunks.
-def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]:
- def text() -> Tag:
- body = html.body
- assert body
- text = body.find_next("div", class_="post-post")
- assert isinstance(text, Tag)
- return text
-
- def the_replies() -> Iterable[Tag]:
- rs = html.find_all("div", class_="post-reply")
- assert all(isinstance(r, Tag) for r in rs)
- return rs
-
- return itertools.chain([text()], the_replies())
+class Thread:
+    """Wraps the parsed HTML of a thread page and splits it into chunks."""
+
+    def __init__(self, html: BeautifulSoup) -> None:
+        # Keep the parsed document; chunkDOMs() searches it on every call.
+        self._html = html
+
+    def chunkDOMs(self) -> Iterable[Tag]:
+        """Return the top post's element followed by every reply element.
+
+        The top post is the first "post-post" div found with ``find_next``
+        starting from <body>; the replies are all "post-reply" divs in the
+        document.  Asserts that <body> and the top post are present.
+        """
+        def top_post() -> Tag:
+            page_body = self._html.body
+            assert page_body
+            found = page_body.find_next("div", class_="post-post")
+            assert isinstance(found, Tag)
+            return found
+
+        def replies() -> Iterable[Tag]:
+            matches = self._html.find_all("div", class_="post-reply")
+            assert all(isinstance(m, Tag) for m in matches)
+            return matches
+
+        # Look up the top post first, then the replies, then chain them.
+        opening = top_post()
+        return itertools.chain([opening], replies())
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
from bs4 import BeautifulSoup
from images import FakeImageStore
-from glowfic import chunkDOMs, makeChunk
+from glowfic import makeChunk, Thread
from texify import PandocTexifier
class TestSplit(unittest.TestCase):
def testSplit1(self) -> None:
+ # A page holding only the top post must yield exactly one chunk.
+ # The Thread is named `thread` so the comprehension variable `t`
+ # below does not shadow the object it iterates over.
- soup = BeautifulSoup(b'''
+ thread = Thread(BeautifulSoup(b'''
<html><body><div class="post-container post-post">
The "post"
- </div></body></html>''', 'html.parser')
- self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)],
+ </div></body></html>''', 'html.parser'))
+ self.assertEqual([list(t.stripped_strings) for t in thread.chunkDOMs()],
[['The "post"']])
def testSplit2(self) -> None:
+ # A top post plus one reply must yield two chunks, post first.
+ # The Thread is named `thread` so the comprehension variable `t`
+ # below does not shadow the object it iterates over.
- soup = BeautifulSoup(b'''
+ thread = Thread(BeautifulSoup(b'''
<html><body>
<div class="post-container post-post">The "post"</div>
<div class="flat-post-replies">
<div class="post-container post-reply">The "reply"</div>
</div>
- </body></html>''', 'html.parser')
- self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)],
+ </body></html>''', 'html.parser'))
+ self.assertEqual([list(t.stripped_strings) for t in thread.chunkDOMs()],
[['The "post"'], ['The "reply"']])
def testSplit3(self) -> None:
+ # A top post plus two replies must yield three chunks in page order.
+ # The Thread is named `thread` so the comprehension variable `t`
+ # below does not shadow the object it iterates over.
- soup = BeautifulSoup(b'''
+ thread = Thread(BeautifulSoup(b'''
<html><body>
<div class="post-container post-post">The "post"</div>
<div class="flat-post-replies">
<div class="post-container post-reply">1st reply</div>
<div class="post-container post-reply">2nd reply</div>
</div>
- </body></html>''', 'html.parser')
- self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)],
+ </body></html>''', 'html.parser'))
+ self.assertEqual([list(t.stripped_strings) for t in thread.chunkDOMs()],
[['The "post"'], ['1st reply'], ['2nd reply']])
def testEmptyContent(self) -> None:
with open('testdata/empty-content.html', 'rb') as f:
- soup = BeautifulSoup(f, 'html.parser')
- c = makeChunk(next(iter(chunkDOMs(soup))), FakeImageStore())
+ t = Thread(BeautifulSoup(f, 'html.parser'))
+ c = makeChunk(next(iter(t.chunkDOMs())), FakeImageStore())
self.assertEqual(
c.icon,
'stored:https://d1anwqy6ci9o1i.cloudfront.net/' +
from bs4.element import Tag
from args import spec_from_commandline_args
-from glowfic import chunkDOMs, flatURL, makeChunk, renderChunk
+from glowfic import flatURL, makeChunk, renderChunk, Thread
from spec import Spec
html = spec.fetcher.fetch(url)
spec.log('Parsing HTML...\r')
dom = parse(spec.htmlfilter(html))
+ thread = Thread(dom)
spec.log('Counting chunks...\r')
- num_chunks = ilen(chunkDOMs(dom))
+ num_chunks = ilen(thread.chunkDOMs())
title = get_title(dom) or "chunk"
- for i, r in enumerate(chunkDOMs(dom)):
+ for i, r in enumerate(thread.chunkDOMs()):
percent = 100.0 * i / num_chunks
spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
spec.domfilter(r)