]> git.scottworley.com Git - paperdoorknob/blame - glowfic.py
Handle Unicode characters ≈ and ◁
[paperdoorknob] / glowfic.py
CommitLineData
e6adf6ce
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
6f430a74 8import dataclasses
e6adf6ce 9import itertools
fc2aaa75 10from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
e6adf6ce 11
24b87bad 12from typing import Any, Iterable
e6adf6ce
SW
13
14from bs4 import BeautifulSoup
15from bs4.element import Tag
16
aa060d9b 17from images import ImageStore
70adfbff 18from spec import Spec
d2a41ff4 19from texify import Texifier
aa060d9b
SW
20
21
24b87bad
SW
def ilen(it: Iterable[Any]) -> int:
    """Return the number of items produced by *it*.

    The iterable is consumed in the process, so a one-shot iterator is
    exhausted afterwards.
    """
    count = 0
    for _ in it:
        count += 1
    return count
24
25
1452f8d3
SW
def _removeViewFromURL(url: str) -> str:
    """Return *url* with every ``view`` query parameter removed.

    All other query parameters are kept in their original order and
    re-encoded with ``urlencode``; the rest of the URL is left as parsed.
    """
    parts = urlparse(url)
    # keep_blank_values=True: by default parse_qsl silently drops
    # parameters whose value is empty ("?a=&b=1" -> [("b", "1")]), which
    # would strip more than just "view" from the URL.
    kept = [(key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != 'view']
    return urlunparse(parts._replace(query=urlencode(kept)))
31
32
def nonFlatURL(url: str) -> str:
    """Return *url* with any ``view`` query parameter stripped."""
    without_view = _removeViewFromURL(url)
    return without_view
35
36
def flatURL(url: str) -> str:
    """Return *url* with its query set to request ``view=flat``.

    Any existing ``view`` parameter is removed first, then a single
    ``view=flat`` pair is appended after the remaining parameters.
    """
    parts = urlparse(_removeViewFromURL(url))
    # keep_blank_values=True so this step never discards a parameter just
    # because its value is empty (parse_qsl drops those by default).
    pairs = parse_qsl(parts.query, keep_blank_values=True)
    pairs.append(('view', 'flat'))
    return urlunparse(parts._replace(query=urlencode(pairs)))
41
42
@dataclasses.dataclass(frozen=True)
class Chunk:
    """The pieces of one chunk (first post or reply) of a thread.

    Instances are built by ``makeChunk`` from a chunk's DOM element and
    turned into TeX by ``renderChunk``.  The dataclass is frozen, so the
    fields cannot be rebound after construction.
    """
    # Value returned by the image store for the chunk's icon image, or
    # None when the chunk has no icon.
    icon: str | None
    # The 'post-character' div, or None when the chunk has none.
    character: Tag | None
    # The 'post-screenname' div, or None when the chunk has none.
    screen_name: Tag | None
    # The 'post-author' div, or None when the chunk has none.
    author: Tag | None
    # The 'post-content' div; always present.
    content: Tag
50
e6adf6ce
SW
51# We avoid the name "post" because the Glowfic community uses the term
52# inconsistently:
53# * The Glowfic software sometimes uses "post" to refer to a whole thread
aa060d9b
SW
54# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
55# but mostly uses "post" to refer to just the first chunk in a thread
56# (in the HTML and UI). The non-first chunks are "replies".
e6adf6ce
SW
57# * Readers and this software don't need to distinguish first-chunks and
58# non-first-chunks.
aa060d9b 59# * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce
SW
60
61
94027099
SW
62class Thread:
63
19ed28f3 64 def __init__(self, spec: Spec) -> None:
53f396b4
SW
65 def find_next_thread(dom: BeautifulSoup) -> str | None:
66 for c in dom.findChildren('div', class_='post-navheader'):
67 for a in c.findChildren('a'):
68 if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
69 a.attrs['href'], str):
fc2aaa75 70 return urljoin(spec.url, a.attrs['href'])
53f396b4
SW
71 return None
72
19ed28f3 73 spec.log('Fetching HTML...\r')
53f396b4
SW
74 html = spec.fetcher.fetch(spec.url)
75 flat_html = spec.fetcher.fetch(flatURL(spec.url))
19ed28f3 76 spec.log('Parsing HTML...\r')
53f396b4
SW
77 self._next_thread = find_next_thread(
78 BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
79 self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
24b87bad 80 self._spec = spec
94027099 81
21e82200
SW
82 def title(self) -> str | None:
83 span = self._dom.findChild("span", id="post-title")
84 if not isinstance(span, Tag):
85 return None
86 return span.text.strip()
87
53f396b4
SW
88 def next_thread(self) -> str | None:
89 return self._next_thread
90
94027099
SW
91 def chunkDOMs(self) -> Iterable[Tag]:
92 def text() -> Tag:
a18519bf 93 body = self._dom.body
94027099
SW
94 assert body
95 text = body.find_next("div", class_="post-post")
96 assert isinstance(text, Tag)
97 return text
98
99 def the_replies() -> Iterable[Tag]:
a18519bf 100 rs = self._dom.find_all("div", class_="post-reply")
94027099
SW
101 assert all(isinstance(r, Tag) for r in rs)
102 return rs
103
104 return itertools.chain([text()], the_replies())
aa060d9b 105
24b87bad
SW
106 def emit(self) -> None:
107 self._spec.log('Counting chunks...\r')
108 num_chunks = ilen(self.chunkDOMs())
109 title = self.title() or "chunk"
110 for i, r in enumerate(self.chunkDOMs()):
f74e5779 111 percent = 100.0 * (i + 1) / num_chunks
24b87bad 112 self._spec.log(
f74e5779 113 f'Processing {title} {i+1} of {num_chunks} ({percent:.1f}%)\r')
24b87bad
SW
114 self._spec.domfilter(r)
115 chunk = makeChunk(r, self._spec.images)
116 self._spec.texout.write(
117 self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
118 self._spec.log('')
6f430a74
SW
119 next_url = self.next_thread()
120 if next_url is not None:
121 Thread(dataclasses.replace(self._spec, url=next_url)).emit()
24b87bad 122
aa060d9b
SW
123
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a ``Chunk`` from one chunk's DOM element.

    ``chunk_dom`` must contain a ``div.post-content`` (asserted).  The
    icon and the character / screen-name / author divs are optional and
    become None when absent.  Links inside those three metadata divs have
    their ``href`` attribute removed in place.
    """

    def icon_path() -> str | None:
        # div.post-icon > img; either level may be missing.
        holder = chunk_dom.find('div', class_='post-icon')
        if holder is None:
            return None
        assert isinstance(holder, Tag)
        img = holder.find('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        found = chunk_dom.find('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        if found is not None:
            # Drop link targets from every <a> inside the metadata div.
            for link in found.find_all('a'):
                link.attrs.pop('href', None)
        return found

    body = chunk_dom.find('div', class_='post-content')
    assert isinstance(body, Tag)

    icon = icon_path()
    character = meta('post-character')
    screen_name = meta('post-screenname')
    author = meta('post-author')
    return Chunk(icon, character, screen_name, author, body)
d2a41ff4
SW
162
163
1fac41bf
SW
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Return the TeX for one chunk.

    The output is ``\glowhead{icon}{character}{screen_name}{author}``
    followed by the texified content.  A missing (falsy) header field
    leaves its argument empty; a present icon is wrapped in ``\glowicon``.
    """
    icon = b''
    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')

    header_args = [icon]
    for tag in (chunk.character, chunk.screen_name, chunk.author):
        header_args.append(texifier.texify(tag) if tag else b'')

    header = br'\glowhead{' + b'}{'.join(header_args) + b'}'
    return header + texifier.texify(chunk.content)
d2a41ff4 176
d2a41ff4 177
1fac41bf
SW
# LaTeX preamble fragment: defines \glowhead as a four-argument command
# that expands to nothing, so the header that renderChunk emits before each
# chunk's content produces no output.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
d2a41ff4 181
d2a41ff4 182
1fac41bf
SW
# LaTeX preamble fragment: defines the four-argument \glowhead (the order
# renderChunk emits is icon, character, screen name, author) as a framed box
# in a left-placed wrapstuff environment.  The four arguments are stacked in
# one centred column; each of #1-#3 is followed by a line break only when it
# is non-empty.
# NOTE(review): the \smash'd zero-size \parbox holding a 107mm-wide \fbox
# looks like a fixed-width rule or spacer -- confirm intent.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
f75c1629
SW
208
209
1fac41bf
SW
# LaTeX preamble fragment: defines the four-argument \glowhead (the order
# renderChunk emits is icon, character, screen name, author) as one framed
# box holding #1 followed by a bottom-aligned \parbox.  Inside the \parbox,
# #2-#4 are stacked in a centred column; each of #2 and #3 is followed by a
# line break only when it is non-empty.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''