git.scottworley.com Git - paperdoorknob/blob - glowfic.py

   1 # paperdoorknob: Print glowfic
   2 #
   3 # This program is free software: you can redistribute it and/or modify it
   4 # under the terms of the GNU General Public License as published by the
   5 # Free Software Foundation, version 3.
   6
   7
   8 from dataclasses import dataclass
   9 import itertools
  10 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
  11
  12 from typing import Iterable
  13
  14 from bs4 import BeautifulSoup
  15 from bs4.element import Tag
  16
  17 from images import ImageStore
  18 from spec import Spec
  19 from texify import Texifier
  20
  21
  22 def _removeViewFromURL(url: str) -> str:
  23     u = urlparse(url)
  24     old_qs = parse_qsl(u.query)
  25     new_qs = [(k, v) for k, v in old_qs if k != 'view']
  26     return urlunparse(u._replace(query=urlencode(new_qs)))
  27
  28
  29 def nonFlatURL(url: str) -> str:
  30     return _removeViewFromURL(url)
  31
  32
  33 def flatURL(url: str) -> str:
  34     u = urlparse(_removeViewFromURL(url))
  35     qs = parse_qsl(u.query) + [('view', 'flat')]
  36     return urlunparse(u._replace(query=urlencode(qs)))
  37
  38
@dataclass(frozen=True)
class Chunk:
    """The pieces of one chunk of a thread, as consumed by renderChunk.

    As built by makeChunk, every field except ``content`` is None when the
    corresponding element is missing from the chunk's HTML.
    """
    # Value returned by the image store for the chunk's icon <img> src;
    # renderChunk places it inside \glowicon{...}.
    icon: str | None
    # The chunk's <div class="post-character"> element.
    character: Tag | None
    # The chunk's <div class="post-screenname"> element.
    screen_name: Tag | None
    # The chunk's <div class="post-author"> element.
    author: Tag | None
    # The chunk's <div class="post-content"> element; always present.
    content: Tag
  46
  47 # We avoid the name "post" because the Glowfic community uses the term
  48 # inconsistently:
  49 #  * The Glowfic software sometimes uses "post" to refer to a whole thread
  50 #    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
  51 #    but mostly uses "post" to refer to just the first chunk in a thread
  52 #    (in the HTML and UI).  The non-first chunks are "replies".
  53 #  * Readers and this software don't need to distinguish first-chunks and
  54 #    non-first-chunks.
  55 #  * Humans in the community tend to use "posts" to mean chunks.
  56
  57
  58 class Thread:
  59
  60     def __init__(self, spec: Spec) -> None:
  61         def find_next_thread(dom: BeautifulSoup) -> str | None:
  62             for c in dom.findChildren('div', class_='post-navheader'):
  63                 for a in c.findChildren('a'):
  64                     if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
  65                             a.attrs['href'], str):
  66                         return urljoin(spec.url, a.attrs['href'])
  67             return None
  68
  69         spec.log('Fetching HTML...\r')
  70         html = spec.fetcher.fetch(spec.url)
  71         flat_html = spec.fetcher.fetch(flatURL(spec.url))
  72         spec.log('Parsing HTML...\r')
  73         self._next_thread = find_next_thread(
  74             BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
  75         self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
  76
  77     def title(self) -> str | None:
  78         span = self._dom.findChild("span", id="post-title")
  79         if not isinstance(span, Tag):
  80             return None
  81         return span.text.strip()
  82
  83     def next_thread(self) -> str | None:
  84         return self._next_thread
  85
  86     def chunkDOMs(self) -> Iterable[Tag]:
  87         def text() -> Tag:
  88             body = self._dom.body
  89             assert body
  90             text = body.find_next("div", class_="post-post")
  91             assert isinstance(text, Tag)
  92             return text
  93
  94         def the_replies() -> Iterable[Tag]:
  95             rs = self._dom.find_all("div", class_="post-reply")
  96             assert all(isinstance(r, Tag) for r in rs)
  97             return rs
  98
  99         return itertools.chain([text()], the_replies())
 100
 101
 102 def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
 103
 104     def getIcon() -> str | None:
 105         icon_div = chunk_dom.findChild('div', class_='post-icon')
 106         if icon_div is None:
 107             return None
 108         assert isinstance(icon_div, Tag)
 109         icon_img = icon_div.findChild('img')
 110         if icon_img is None:
 111             return None
 112         assert isinstance(icon_img, Tag)
 113         return image_store.get_image(icon_img.attrs['src'])
 114
 115     def getByClass(css_class: str) -> Tag | None:
 116         tag = chunk_dom.findChild('div', class_=css_class)
 117         assert tag is None or isinstance(tag, Tag)
 118         return tag
 119
 120     def stripHREF(tag: Tag) -> None:
 121         for c in tag.findChildren("a"):
 122             if "href" in c.attrs:
 123                 del c.attrs["href"]
 124
 125     def getMeta(css_class: str) -> Tag | None:
 126         tag = getByClass(css_class)
 127         if tag is None:
 128             return None
 129         stripHREF(tag)
 130         return tag
 131
 132     content = chunk_dom.findChild('div', class_='post-content')
 133     assert isinstance(content, Tag)
 134
 135     return Chunk(getIcon(),
 136                  getMeta('post-character'),
 137                  getMeta('post-screenname'),
 138                  getMeta('post-author'),
 139                  content)
 140
 141
 142 def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
 143     return b''.join([
 144         br'\glowhead{',
 145         br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
 146         b'}{',
 147         texifier.texify(chunk.character) if chunk.character else b'',
 148         b'}{',
 149         texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
 150         b'}{',
 151         texifier.texify(chunk.author) if chunk.author else b'',
 152         b'}',
 153         texifier.texify(chunk.content)])
 154
 155
# LaTeX layout that discards the chunk header: \glowhead accepts its four
# arguments (icon, character, screen name, author, in the order renderChunk
# emits them) and expands to nothing, leaving only the chunk content.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
 159
 160
# LaTeX layout that stacks the header parts vertically: icon (#1), character
# (#2), screen name (#3) and author (#4) are centered inside one framed box
# placed in a left-side wrapstuff environment.  Each of #1-#3 is followed by
# \ifnotempty{...}{\\*}, presumably adding a line break only when that part
# is non-empty.
# NOTE(review): \wrapstuffclear, wrapstuff, varwidth and \ifnotempty are not
# defined in this file; presumably the document that includes this layout
# provides them -- confirm.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
 186
 187
# LaTeX layout that puts the header parts side by side in one framed box:
# the icon (#1) first, then a bottom-aligned parbox holding character (#2),
# screen name (#3) and author (#4) centered.  #2 and #3 are each followed by
# \ifnotempty{...}{\\*}, presumably adding a line break only when that part
# is non-empty.
# NOTE(review): \ifnotempty is not defined in this file; presumably the
# document that includes this layout provides it -- confirm.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
 205 \vspace{-0.75em}\\*
 206 \noindent}'''