# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.

from dataclasses import dataclass

import itertools

from typing import Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag

from images import ImageStore


@dataclass(frozen=True)
class Chunk:
    """Immutable record holding the parts of one chunk of a thread.

    Every field except ``content`` may be ``None``.
    """

    # Icon reference for the chunk, if any.
    icon: str | None
    # Text of the chunk's character label, if any.
    character: str | None
    # Text of the chunk's screen-name label, if any.
    screen_name: str | None
    # Text of the chunk's author label, if any.
    author: str | None
    # DOM element holding the body of the chunk.
    content: Tag


# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#  * The Glowfic software sometimes uses "post" to refer to a whole thread
#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#    but mostly uses "post" to refer to just the first chunk in a thread
#    (in the HTML and UI).  The non-first chunks are "replies".
#  * Readers and this software don't need to distinguish first-chunks and
#    non-first-chunks.
#  * Humans in the community tend to use "posts" to mean chunks.
def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]:
    """Return the DOM element of every chunk in a parsed thread page.

    The first item is the ``div.post-post`` element located after the
    document's ``<body>``; it is followed by every ``div.post-reply``
    element that ``html.find_all`` returns.

    Raises:
        AssertionError: if the document has no body, if no ``post-post``
            div is found, or if a reply match is not a ``Tag``.
    """
    def text() -> Tag:
        body = html.body
        assert body
        first = body.find_next("div", class_="post-post")
        assert isinstance(first, Tag)
        return first

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    # text() is evaluated eagerly here, so a missing first chunk fails at
    # call time rather than when the result is iterated.
    return itertools.chain([text()], the_replies())


def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a ``Chunk`` from the DOM element of a single chunk.

    Every lookup uses ``find``, which searches only the descendants of the
    element it is called on.  ``find_next`` keeps walking the rest of the
    document, so a chunk without an icon, character, screen name or author
    would silently pick up the value belonging to a later chunk.

    NOTE(review): this assumes the icon, label and content divs are nested
    inside ``chunk_dom`` -- confirm against real Glowfic pages.

    Args:
        chunk_dom: element for one chunk, as produced by ``chunkDOMs``.
        image_store: its ``get_image`` is called with the icon's ``src``.

    Returns:
        A ``Chunk`` whose optional fields are ``None`` when the matching
        div is absent from ``chunk_dom``.

    Raises:
        AssertionError: if ``chunk_dom`` contains no ``post-content`` div.
        KeyError: if the icon ``<img>`` has no ``src`` attribute.
    """
    def getIcon() -> str | None:
        icon_div = chunk_dom.find('div', class_='post-icon')
        # The isinstance check covers the "not found" case and narrows the
        # type so that .find below is the Tag method.
        if not isinstance(icon_div, Tag):
            return None
        icon_img = icon_div.find('img')
        if icon_img is None:
            return None
        assert isinstance(icon_img, Tag)
        return image_store.get_image(icon_img.attrs['src'])

    def getTextByClass(css_class: str) -> str | None:
        div = chunk_dom.find('div', class_=css_class)
        if div is None:
            return None
        return div.text.strip()

    content = chunk_dom.find('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(getIcon(),
                 getTextByClass('post-character'),
                 getTextByClass('post-screenname'),
                 getTextByClass('post-author'),
                 content)