]> git.scottworley.com Git - paperdoorknob/blame - glowfic.py
Move per-thread processing stuff into Thread
[paperdoorknob] / glowfic.py
CommitLineData
e6adf6ce
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
aa060d9b 8from dataclasses import dataclass
e6adf6ce 9import itertools
fc2aaa75 10from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
e6adf6ce 11
24b87bad 12from typing import Any, Iterable
e6adf6ce
SW
13
14from bs4 import BeautifulSoup
15from bs4.element import Tag
16
aa060d9b 17from images import ImageStore
70adfbff 18from spec import Spec
d2a41ff4 19from texify import Texifier
aa060d9b
SW
20
21
24b87bad
SW
def ilen(it: Iterable[Any]) -> int:
    """Return the number of items produced by *it*.

    The iterable is consumed in the process, so a one-shot iterator is
    exhausted afterwards.  Works for iterables that do not support
    ``len()``.
    """
    count = 0
    for count, _ in enumerate(it, start=1):
        pass
    return count
24
25
1452f8d3
SW
26def _removeViewFromURL(url: str) -> str:
27 u = urlparse(url)
28 old_qs = parse_qsl(u.query)
29 new_qs = [(k, v) for k, v in old_qs if k != 'view']
30 return urlunparse(u._replace(query=urlencode(new_qs)))
31
32
33def nonFlatURL(url: str) -> str:
34 return _removeViewFromURL(url)
35
36
37def flatURL(url: str) -> str:
38 u = urlparse(_removeViewFromURL(url))
39 qs = parse_qsl(u.query) + [('view', 'flat')]
40 return urlunparse(u._replace(query=urlencode(qs)))
41
42
aa060d9b
SW
@dataclass(frozen=True)
class Chunk:
    """The pieces of one chunk of a thread, as assembled by makeChunk.

    Instances are immutable.  Every field except ``content`` may be None.
    """
    # Value returned by the image store for the chunk's icon image, if any.
    icon: str | None
    # The chunk's 'post-character' div, if any.
    character: Tag | None
    # The chunk's 'post-screenname' div, if any.
    screen_name: Tag | None
    # The chunk's 'post-author' div, if any.
    author: Tag | None
    # The chunk's 'post-content' div; always present.
    content: Tag
50
e6adf6ce
SW
51# We avoid the name "post" because the Glowfic community uses the term
52# inconsistently:
53# * The Glowfic software sometimes uses "post" to refer to a whole thread
aa060d9b
SW
54# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
55# but mostly uses "post" to refer to just the first chunk in a thread
56# (in the HTML and UI). The non-first chunks are "replies".
e6adf6ce
SW
57# * Readers and this software don't need to distinguish first-chunks and
58# non-first-chunks.
aa060d9b 59# * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce
SW
60
61
94027099
SW
class Thread:
    """One thread, fetched and parsed when the object is constructed."""

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse the thread at ``spec.url``.

        Two pages are fetched through ``spec.fetcher``: the page at
        ``spec.url`` itself, which is only searched for the "Next Post"
        link, and its ``view=flat`` variant, which supplies the title and
        the chunks.  Both are passed through ``spec.htmlfilter`` before
        being parsed.
        """
        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # First <a> inside a 'post-navheader' div whose text contains
            # 'Next Post' and whose href is a plain string, made absolute
            # relative to spec.url.
            for c in dom.findChildren('div', class_='post-navheader'):
                for a in c.findChildren('a'):
                    if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
                            a.attrs['href'], str):
                        return urljoin(spec.url, a.attrs['href'])
            return None

        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._next_thread = find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
        self._spec = spec

    def title(self) -> str | None:
        """Return the stripped text of the ``post-title`` span, or None."""
        span = self._dom.findChild("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the "Next Post" URL found at construction, or None."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the chunk elements of the flat page, in order.

        The first item is the first ``post-post`` div after ``<body>``;
        it is followed by every ``post-reply`` div in the document.  Both
        lookups run when this method is called, not lazily.

        Raises AssertionError if the page has no body or no ``post-post``
        div.
        """
        def text() -> Tag:
            body = self._dom.body
            assert body
            text = body.find_next("div", class_="post-post")
            assert isinstance(text, Tag)
            return text

        def the_replies() -> Iterable[Tag]:
            rs = self._dom.find_all("div", class_="post-reply")
            assert all(isinstance(r, Tag) for r in rs)
            return rs

        return itertools.chain([text()], the_replies())

    def emit(self) -> None:
        """Render every chunk of the thread and write it to ``spec.texout``.

        For each chunk, in order: log a progress message, call
        ``spec.domfilter`` on the chunk's DOM, build a Chunk with
        makeChunk, render it with renderChunk, pass the result through
        ``spec.texfilter`` and write it out.
        """
        self._spec.log('Counting chunks...\r')
        # Collect the chunks once; the same list serves for both the count
        # and the iteration, so the DOM is searched only one time.
        chunks = list(self.chunkDOMs())
        num_chunks = len(chunks)
        title = self.title() or "chunk"
        for i, r in enumerate(chunks):
            percent = 100.0 * i / num_chunks
            self._spec.log(
                f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
            self._spec.domfilter(r)
            chunk = makeChunk(r, self._spec.images)
            self._spec.texout.write(
                self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
        # Final empty message once all chunks are written (presumably ends
        # the '\r'-style progress output -- confirm against spec.log).
        self._spec.log('')
119
aa060d9b
SW
120
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a Chunk from the DOM subtree of a single chunk.

    The ``post-content`` div is required (AssertionError if missing).  The
    icon and the ``post-character`` / ``post-screenname`` / ``post-author``
    divs are optional and become None when absent.

    Side effect: ``href`` attributes are deleted from every ``<a>`` inside
    the character, screen-name and author divs of *chunk_dom*.
    """

    def find_div(css_class: str) -> Tag | None:
        found = chunk_dom.findChild('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        return found

    def icon_ref() -> str | None:
        # Result of image_store.get_image for the <img> inside the
        # 'post-icon' div; None if either element is missing.
        holder = find_div('post-icon')
        if holder is None:
            return None
        img = holder.findChild('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta_div(css_class: str) -> Tag | None:
        # The div with the given class, with links inside it de-linked.
        div = find_div(css_class)
        if div is not None:
            for link in div.findChildren("a"):
                link.attrs.pop("href", None)
        return div

    # Checked before anything else is looked up or any image is requested.
    body = chunk_dom.findChild('div', class_='post-content')
    assert isinstance(body, Tag)

    return Chunk(icon=icon_ref(),
                 character=meta_div('post-character'),
                 screen_name=meta_div('post-screenname'),
                 author=meta_div('post-author'),
                 content=body)
d2a41ff4
SW
159
160
1fac41bf
SW
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Return the TeX for *chunk*: a ``\glowhead`` call, then the content.

    The output has the form ``\glowhead{ICON}{CHARACTER}{SCREEN_NAME}{AUTHOR}``
    immediately followed by the texified content.  ICON is
    ``\glowicon{...}`` wrapping the UTF-8 encoded icon string; the other
    three arguments are produced by ``texifier.texify``.  Any header field
    that is missing (falsy) yields an empty argument.
    """
    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''

    pieces = [br'\glowhead{', icon]
    for field in (chunk.character, chunk.screen_name, chunk.author):
        pieces.append(b'}{')
        pieces.append(texifier.texify(field) if field else b'')
    pieces.append(b'}')
    pieces.append(texifier.texify(chunk.content))
    return b''.join(pieces)
d2a41ff4 173
d2a41ff4 174
1fac41bf
SW
# LaTeX definition of \glowhead for a content-only layout: the macro accepts
# the four header arguments that renderChunk emits (icon, character, screen
# name, author) and expands to nothing, so only each chunk's content remains.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
d2a41ff4 178
d2a41ff4 179
1fac41bf
SW
180BelowIconLayout = br'''
181\newcommand{\glowhead}[4]{\wrapstuffclear
67612898
SW
182\begin{wrapstuff}[l]
183\fbox{
184\begin{varwidth}{0.5\textwidth}
185 \smash{\parbox[t][0pt]{0pt}{
186 \setlength{\fboxrule}{0.2pt}
187 \setlength{\fboxsep}{0pt}
188 \vspace{-3.4pt}
189 \fbox{\hspace{107mm}}
190 }\\*}
191 \vspace{-1em}
192\begin{center}
1fac41bf
SW
193#1\ifnotempty
194{#1}{\\*}#2\ifnotempty
195{#2}{\\*}#3\ifnotempty
196{#3}{\\*}#4
67612898
SW
197\end{center}
198\end{varwidth}
23dabdf5 199}
67612898 200\end{wrapstuff}
23dabdf5 201
67612898 202\strut
16385131 203
1fac41bf 204\noindent}'''
f75c1629
SW
205
206
1fac41bf
SW
# LaTeX definition of \glowhead that puts the icon (#1) inside a frame next
# to a \parbox holding the character (#2), screen name (#3) and author (#4),
# centered, with \\* line breaks between them guarded by \ifnotempty.
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''