git.scottworley.com Git - paperdoorknob/blob - glowfic.py

   1 # paperdoorknob: Print glowfic
   2 #
   3 # This program is free software: you can redistribute it and/or modify it
   4 # under the terms of the GNU General Public License as published by the
   5 # Free Software Foundation, version 3.
   6
   7
   8 import dataclasses
   9 import itertools
  10 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
  11
  12 from typing import Any, Iterable
  13
  14 from bs4 import BeautifulSoup
  15 from bs4.element import Tag
  16
  17 from images import ImageStore
  18 from spec import Spec
  19 from texify import Texifier
  20
  21
  22 def ilen(it: Iterable[Any]) -> int:
  23     return sum(1 for _ in it)
  24
  25
  26 def _removeViewFromURL(url: str) -> str:
  27     u = urlparse(url)
  28     old_qs = parse_qsl(u.query)
  29     new_qs = [(k, v) for k, v in old_qs if k != 'view']
  30     return urlunparse(u._replace(query=urlencode(new_qs)))
  31
  32
  33 def nonFlatURL(url: str) -> str:
  34     return _removeViewFromURL(url)
  35
  36
  37 def flatURL(url: str) -> str:
  38     u = urlparse(_removeViewFromURL(url))
  39     qs = parse_qsl(u.query) + [('view', 'flat')]
  40     return urlunparse(u._replace(query=urlencode(qs)))
  41
  42
@dataclasses.dataclass(frozen=True)
class Chunk:
    """One chunk of a thread: its header fields plus its content.

    Instances are built by makeChunk() and turned into TeX by renderChunk().
    """
    # Value returned by ImageStore.get_image() for the chunk's icon; None
    # when the chunk has no 'post-icon' div or that div has no <img>.
    icon: str | None
    # The chunk's 'post-character' div, or None if it has none.
    character: Tag | None
    # The chunk's 'post-screenname' div, or None if it has none.
    screen_name: Tag | None
    # The chunk's 'post-author' div, or None if it has none.
    author: Tag | None
    # The chunk's 'post-content' div; always present.
    content: Tag
  50
  51 # We avoid the name "post" because the Glowfic community uses the term
  52 # inconsistently:
  53 #  * The Glowfic software sometimes uses "post" to refer to a whole thread
  54 #    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
  55 #    but mostly uses "post" to refer to just the first chunk in a thread
  56 #    (in the HTML and UI).  The non-first chunks are "replies".
  57 #  * Readers and this software don't need to distinguish first-chunks and
  58 #    non-first-chunks.
  59 #  * Humans in the community tend to use "posts" to mean chunks.
  60
  61
  62 class Thread:
  63
  64     def __init__(self, spec: Spec) -> None:
  65         def find_next_thread(dom: BeautifulSoup) -> str | None:
  66             for c in dom.findChildren('div', class_='post-navheader'):
  67                 for a in c.findChildren('a'):
  68                     if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
  69                             a.attrs['href'], str):
  70                         return urljoin(spec.url, a.attrs['href'])
  71             return None
  72
  73         spec.log('Fetching HTML...\r')
  74         html = spec.fetcher.fetch(spec.url)
  75         flat_html = spec.fetcher.fetch(flatURL(spec.url))
  76         spec.log('Parsing HTML...\r')
  77         self._next_thread = find_next_thread(
  78             BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
  79         self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
  80         self._spec = spec
  81
  82     def title(self) -> str | None:
  83         span = self._dom.findChild("span", id="post-title")
  84         if not isinstance(span, Tag):
  85             return None
  86         return span.text.strip()
  87
  88     def next_thread(self) -> str | None:
  89         return self._next_thread
  90
  91     def chunkDOMs(self) -> Iterable[Tag]:
  92         def text() -> Tag:
  93             body = self._dom.body
  94             assert body
  95             text = body.find_next("div", class_="post-post")
  96             assert isinstance(text, Tag)
  97             return text
  98
  99         def the_replies() -> Iterable[Tag]:
 100             rs = self._dom.find_all("div", class_="post-reply")
 101             assert all(isinstance(r, Tag) for r in rs)
 102             return rs
 103
 104         return itertools.chain([text()], the_replies())
 105
 106     def emit(self) -> None:
 107         self._spec.log('Counting chunks...\r')
 108         num_chunks = ilen(self.chunkDOMs())
 109         title = self.title() or "chunk"
 110         for i, r in enumerate(self.chunkDOMs()):
 111             percent = 100.0 * i / num_chunks
 112             self._spec.log(
 113                 f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
 114             self._spec.domfilter(r)
 115             chunk = makeChunk(r, self._spec.images)
 116             self._spec.texout.write(
 117                 self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
 118         self._spec.log('')
 119         next_url = self.next_thread()
 120         if next_url is not None:
 121             Thread(dataclasses.replace(self._spec, url=next_url)).emit()
 122
 123
 124 def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
 125
 126     def getIcon() -> str | None:
 127         icon_div = chunk_dom.findChild('div', class_='post-icon')
 128         if icon_div is None:
 129             return None
 130         assert isinstance(icon_div, Tag)
 131         icon_img = icon_div.findChild('img')
 132         if icon_img is None:
 133             return None
 134         assert isinstance(icon_img, Tag)
 135         return image_store.get_image(icon_img.attrs['src'])
 136
 137     def getByClass(css_class: str) -> Tag | None:
 138         tag = chunk_dom.findChild('div', class_=css_class)
 139         assert tag is None or isinstance(tag, Tag)
 140         return tag
 141
 142     def stripHREF(tag: Tag) -> None:
 143         for c in tag.findChildren("a"):
 144             if "href" in c.attrs:
 145                 del c.attrs["href"]
 146
 147     def getMeta(css_class: str) -> Tag | None:
 148         tag = getByClass(css_class)
 149         if tag is None:
 150             return None
 151         stripHREF(tag)
 152         return tag
 153
 154     content = chunk_dom.findChild('div', class_='post-content')
 155     assert isinstance(content, Tag)
 156
 157     return Chunk(getIcon(),
 158                  getMeta('post-character'),
 159                  getMeta('post-screenname'),
 160                  getMeta('post-author'),
 161                  content)
 162
 163
 164 def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
 165     return b''.join([
 166         br'\glowhead{',
 167         br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
 168         b'}{',
 169         texifier.texify(chunk.character) if chunk.character else b'',
 170         b'}{',
 171         texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
 172         b'}{',
 173         texifier.texify(chunk.author) if chunk.author else b'',
 174         b'}',
 175         texifier.texify(chunk.content)])
 176
 177
# TeX layout that defines \glowhead as a four-argument command with an
# empty body, so the header arguments emitted by renderChunk() (icon,
# character, screen name, author) produce no output and only the chunk
# content appears.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
 181
 182
# TeX layout that defines \glowhead to set all four header arguments
# (#1 icon, #2 character, #3 screen name, #4 author, in the order
# renderChunk() emits them) centred in one framed box inside a wrapstuff
# environment.  Each of the first three arguments is followed by a line
# break (\\*) only when it is non-empty, so the remaining fields stack
# below the icon.
# NOTE(review): the \smash{\parbox...} part appears to draw a thin
# 107mm-wide rule at the top of the header -- confirm against the output.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
 208
 209
# TeX layout that defines \glowhead to put the icon (#1) and a \parbox
# holding the other header arguments (#2 character, #3 screen name,
# #4 author, in the order renderChunk() emits them) together in one
# framed box.  Inside the \parbox the fields are centred, and #2 and #3
# are each followed by a line break (\\*) only when non-empty.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''