]> git.scottworley.com Git - paperdoorknob/blame_incremental - glowfic.py
Handle Unicode characters ≈ and ◁
[paperdoorknob] / glowfic.py
... / ...
CommitLineData
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
8import dataclasses
9import itertools
10from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
11
12from typing import Any, Iterable
13
14from bs4 import BeautifulSoup
15from bs4.element import Tag
16
17from images import ImageStore
18from spec import Spec
19from texify import Texifier
20
21
def ilen(it: Iterable[Any]) -> int:
    """Return the number of items produced by *it*.

    The iterable is consumed while counting, so a one-shot iterator is
    exhausted afterwards.
    """
    count = 0
    for _ in it:
        count += 1
    return count
24
25
26def _removeViewFromURL(url: str) -> str:
27 u = urlparse(url)
28 old_qs = parse_qsl(u.query)
29 new_qs = [(k, v) for k, v in old_qs if k != 'view']
30 return urlunparse(u._replace(query=urlencode(new_qs)))
31
32
def nonFlatURL(url: str) -> str:
    """Return *url* with any ``view`` query parameter dropped.

    This is the counterpart of ``flatURL``: the result carries no ``view``
    parameter at all.
    """
    without_view = _removeViewFromURL(url)
    return without_view
35
36
def flatURL(url: str) -> str:
    """Return *url* with its query forced to ``view=flat``.

    Any existing ``view`` parameter is removed and a single ``view=flat``
    is appended after the remaining parameters, so the result is the same
    whether or not *url* already selected a view.

    The query is parsed once with ``keep_blank_values=True`` so that
    unrelated parameters with empty values (``?page=``) survive; by default
    ``parse_qsl`` silently discards them.
    """
    parts = urlparse(url)
    params = [(key, value)
              for key, value in parse_qsl(parts.query, keep_blank_values=True)
              if key != 'view']
    params.append(('view', 'flat'))
    return urlunparse(parts._replace(query=urlencode(params)))
41
42
@dataclasses.dataclass(frozen=True)
class Chunk:
    """The renderable pieces of one chunk of a thread.

    Instances are immutable.  Every field except ``content`` is ``None``
    when the corresponding element is absent from the chunk's HTML.
    """
    # Value returned by the image store for the chunk's icon image.
    icon: str | None
    # The chunk's "post-character" element.
    character: Tag | None
    # The chunk's "post-screenname" element.
    screen_name: Tag | None
    # The chunk's "post-author" element.
    author: Tag | None
    # The chunk's "post-content" element; always present.
    content: Tag
50
51# We avoid the name "post" because the Glowfic community uses the term
52# inconsistently:
53# * The Glowfic software sometimes uses "post" to refer to a whole thread
54# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
55# but mostly uses "post" to refer to just the first chunk in a thread
56# (in the HTML and UI). The non-first chunks are "replies".
57# * Readers and this software don't need to distinguish first-chunks and
58# non-first-chunks.
59# * Humans in the community tend to use "posts" to mean chunks.
60
61
class Thread:
    """One Glowfic thread, fetched and parsed, ready to be emitted as TeX.

    Construction fetches two renderings of ``spec.url``: the URL as given,
    which is only searched for the "Next Post" link, and its ``view=flat``
    rendering, from which the title and the chunks are read.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse the thread at ``spec.url``."""

        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # The link to the following thread is looked up in the
            # navigation header.  It is resolved against this thread's URL
            # in case the href is relative.
            for header in dom.find_all('div', class_='post-navheader'):
                for a in header.find_all('a'):
                    href = a.attrs.get('href')
                    if 'Next Post' in a.text and isinstance(href, str):
                        return urljoin(spec.url, href)
            return None

        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._next_thread = find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
        self._spec = spec

    def title(self) -> str | None:
        """Return the stripped text of ``span#post-title``, or ``None``."""
        span = self._dom.find("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the absolute URL of the following thread, if one was found."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the chunk elements of the flat view, in document order.

        The first item is the ``div.post-post`` element; it is followed by
        every ``div.post-reply`` element.
        """
        def text() -> Tag:
            body = self._dom.body
            assert body
            first = body.find_next("div", class_="post-post")
            assert isinstance(first, Tag)
            return first

        def the_replies() -> Iterable[Tag]:
            rs = self._dom.find_all("div", class_="post-reply")
            assert all(isinstance(r, Tag) for r in rs)
            return rs

        return itertools.chain([text()], the_replies())

    def _emit_chunks(self) -> None:
        """Render every chunk of this thread to ``spec.texout``."""
        self._spec.log('Counting chunks...\r')
        # Materialize once: the same list serves for the count and for the
        # loop, so the DOM is searched a single time.
        chunks = list(self.chunkDOMs())
        num_chunks = len(chunks)
        title = self.title() or "chunk"
        for i, r in enumerate(chunks, start=1):
            percent = 100.0 * i / num_chunks
            self._spec.log(
                f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
            self._spec.domfilter(r)
            chunk = makeChunk(r, self._spec.images)
            self._spec.texout.write(
                self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
        self._spec.log('')

    def emit(self) -> None:
        """Emit this thread, then every thread reachable via "Next Post".

        Following threads are fetched one at a time in a loop rather than
        by recursion, so a long series neither deepens the call stack nor
        keeps the parsed DOM of every earlier thread alive.
        """
        thread: Thread | None = self
        while thread is not None:
            thread._emit_chunks()
            next_url = thread.next_thread()
            if next_url is None:
                thread = None
            else:
                thread = Thread(
                    dataclasses.replace(thread._spec, url=next_url))
122
123
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a ``Chunk`` from the DOM subtree of one chunk.

    ``chunk_dom`` is modified in place: the ``href`` attribute is removed
    from every link inside the character, screen-name and author elements.

    Raises ``AssertionError`` if the chunk has no ``div.post-content``.
    """

    def getIcon() -> str | None:
        # No icon container, no <img> inside it, or an <img> without a
        # usable src all mean "this chunk has no icon".
        icon_div = chunk_dom.find('div', class_='post-icon')
        if icon_div is None:
            return None
        assert isinstance(icon_div, Tag)
        icon_img = icon_div.find('img')
        if icon_img is None:
            return None
        assert isinstance(icon_img, Tag)
        src = icon_img.attrs.get('src')
        if not isinstance(src, str):
            return None
        return image_store.get_image(src)

    def getByClass(css_class: str) -> Tag | None:
        tag = chunk_dom.find('div', class_=css_class)
        assert tag is None or isinstance(tag, Tag)
        return tag

    def stripHREF(tag: Tag) -> None:
        for link in tag.find_all("a"):
            if "href" in link.attrs:
                del link.attrs["href"]

    def getMeta(css_class: str) -> Tag | None:
        tag = getByClass(css_class)
        if tag is None:
            return None
        stripHREF(tag)
        return tag

    content = chunk_dom.find('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(getIcon(),
                 getMeta('post-character'),
                 getMeta('post-screenname'),
                 getMeta('post-author'),
                 content)
162
163
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    """Return the TeX for one chunk.

    The output is a ``\\glowhead`` invocation with four brace groups (icon,
    character, screen name, author, each empty when the chunk lacks that
    part) followed immediately by the texified content.
    """

    def texify_or_empty(tag: Tag | None) -> bytes:
        return texifier.texify(tag) if tag else b''

    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''
    # Texify in the same order the pieces appear in the output.
    character = texify_or_empty(chunk.character)
    screen_name = texify_or_empty(chunk.screen_name)
    author = texify_or_empty(chunk.author)
    body = texifier.texify(chunk.content)

    pieces = (
        br'\glowhead{', icon,
        b'}{', character,
        b'}{', screen_name,
        b'}{', author,
        b'}', body,
    )
    return b''.join(pieces)
176
177
# TeX preamble fragment: defines \glowhead to accept its four arguments
# and expand to nothing, so the header that renderChunk emits before each
# chunk's content produces no output.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
181
182
# TeX preamble fragment: defines \glowhead so that its four arguments
# (as emitted by renderChunk: icon, character, screen name, author) are
# stacked, centered, in one framed box placed in a left-hand wrapstuff
# environment.  Each \ifnotempty{#n}{\\*} presumably adds the line break
# only when that argument is non-empty.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
 \smash{\parbox[t][0pt]{0pt}{
 \setlength{\fboxrule}{0.2pt}
 \setlength{\fboxsep}{0pt}
 \vspace{-3.4pt}
 \fbox{\hspace{107mm}}
 }\\*}
 \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
208
209
# TeX preamble fragment: defines \glowhead so that its first argument (the
# icon, as emitted by renderChunk) sits inside one framed box next to a
# \parbox holding the centered character, screen name and author.  Each
# \ifnotempty{#n}{\\*} presumably adds the line break only when that
# argument is non-empty.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''