]> git.scottworley.com Git - paperdoorknob/blob - glowfic.py
Handle Unicode characters ≈ and ◁
[paperdoorknob] / glowfic.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 import dataclasses
9 import itertools
10 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
11
12 from typing import Any, Iterable
13
14 from bs4 import BeautifulSoup
15 from bs4.element import Tag
16
17 from images import ImageStore
18 from spec import Spec
19 from texify import Texifier
20
21
def ilen(it: Iterable[Any]) -> int:
    """Return how many items `it` yields.

    The iterable is consumed in the process, so a one-shot iterator is
    exhausted afterwards.
    """
    count = 0
    for _ in it:
        count += 1
    return count
24
25
26 def _removeViewFromURL(url: str) -> str:
27 u = urlparse(url)
28 old_qs = parse_qsl(u.query)
29 new_qs = [(k, v) for k, v in old_qs if k != 'view']
30 return urlunparse(u._replace(query=urlencode(new_qs)))
31
32
def nonFlatURL(url: str) -> str:
    """Return the non-flat form of `url`: the same URL minus its `view`
    query parameter."""
    without_view = _removeViewFromURL(url)
    return without_view
35
36
def flatURL(url: str) -> str:
    """Return `url` with its `view` query parameter forced to `flat`.

    Any `view` parameters already present are dropped and a single
    `view=flat` is appended after the remaining parameters, which keep their
    original order.  Parameters with an empty value (e.g. `?a=`) are
    preserved.  The query string is rebuilt with `urlencode`, so escaping may
    come out normalized.
    """
    parts = urlparse(url)
    # keep_blank_values=True: by default parse_qsl discards parameters with an
    # empty value, silently altering the rest of the query.
    others = [(key, value)
              for key, value in parse_qsl(parts.query, keep_blank_values=True)
              if key != 'view']
    query = urlencode(others + [('view', 'flat')])
    return urlunparse(parts._replace(query=query))
41
42
@dataclasses.dataclass(frozen=True)
class Chunk:
    """The renderable pieces of one chunk (a thread's first post or a reply).

    Instances are immutable (frozen dataclass) and are built by makeChunk
    from a chunk's DOM.  Every field except `content` is None when the
    corresponding element is absent from the chunk.
    """
    # Whatever the image store returned for the icon <img>'s `src`.
    icon: str | None
    # The 'post-character' div, with `href` removed from its links.
    character: Tag | None
    # The 'post-screenname' div, with `href` removed from its links.
    screen_name: Tag | None
    # The 'post-author' div, with `href` removed from its links.
    author: Tag | None
    # The 'post-content' div; always present.
    content: Tag
50
51 # We avoid the name "post" because the Glowfic community uses the term
52 # inconsistently:
53 # * The Glowfic software sometimes uses "post" to refer to a whole thread
54 # (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
55 # but mostly uses "post" to refer to just the first chunk in a thread
56 # (in the HTML and UI). The non-first chunks are "replies".
57 # * Readers and this software don't need to distinguish first-chunks and
58 # non-first-chunks.
59 # * Humans in the community tend to use "posts" to mean chunks.
60
61
class Thread:
    """One Glowfic thread, fetched and parsed at construction time.

    Two renderings of `spec.url` are fetched: the page exactly as given,
    which is only searched for the "Next Post" link, and the flat view
    (`flatURL(spec.url)`), which is the DOM that the title and chunks are
    read from.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse the thread at `spec.url`.

        Both fetched documents are passed through `spec.htmlfilter` before
        being parsed with the 'html.parser' backend.
        """
        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._next_thread = self._find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'), spec.url)
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
        self._spec = spec

    @staticmethod
    def _find_next_thread(dom: BeautifulSoup, base_url: str) -> str | None:
        """Return the absolute URL of the "Next Post" link in `dom`, if any.

        Only <a> elements inside a 'post-navheader' div are considered; the
        first one whose text contains 'Next Post' and whose `href` is a
        string wins.  The href is resolved relative to `base_url`.
        """
        for header in dom.find_all('div', class_='post-navheader'):
            for link in header.find_all('a'):
                href = link.attrs.get('href')
                if 'Next Post' in link.text and isinstance(href, str):
                    return urljoin(base_url, href)
        return None

    def title(self) -> str | None:
        """Return the stripped text of the `post-title` span, or None when
        the flat page has no such element."""
        span = self._dom.find("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the URL of the following thread, or None if there is no
        "Next Post" link."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk in document order.

        The first item is the 'post-post' div (the thread's opening chunk);
        it is followed by every 'post-reply' div.  A page without a <body>
        or without a 'post-post' div fails an assertion.
        """
        body = self._dom.body
        assert body
        first = body.find_next("div", class_="post-post")
        assert isinstance(first, Tag)
        replies = self._dom.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in replies)
        return itertools.chain([first], replies)

    def emit(self) -> None:
        """Write this thread, and then every thread reachable through
        successive "Next Post" links, to `spec.texout`."""
        # Walk the chain with a loop rather than by recursion, so a long
        # series neither deepens the call stack nor keeps the DOM of every
        # earlier thread alive until the last one has been written.
        thread: Thread | None = self
        while thread is not None:
            thread._emit_chunks()
            next_url = thread.next_thread()
            if next_url is None:
                thread = None
            else:
                thread = Thread(
                    dataclasses.replace(thread._spec, url=next_url))

    def _emit_chunks(self) -> None:
        """Render every chunk of this one thread to `spec.texout`, logging
        progress along the way."""
        spec = self._spec
        spec.log('Counting chunks...\r')
        # Locate the chunks once; the same list provides both the total for
        # the progress message and the items to process.
        chunks = list(self.chunkDOMs())
        num_chunks = len(chunks)
        title = self.title() or "chunk"
        for n, chunk_dom in enumerate(chunks, start=1):
            percent = 100.0 * n / num_chunks
            spec.log(
                f'Processing {title} {n} of {num_chunks} ({percent:.1f}%)\r')
            spec.domfilter(chunk_dom)
            chunk = makeChunk(chunk_dom, spec.images)
            spec.texout.write(
                spec.texfilter(renderChunk(spec.texifier, chunk)))
        spec.log('')
122
123
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Split one chunk's DOM into the pieces needed to render it.

    The icon is looked up through `image_store.get_image` using the `src` of
    the <img> inside the 'post-icon' div.  The character, screen name and
    author are the 'post-character', 'post-screenname' and 'post-author'
    divs.  Each of these is None when the element is missing.

    Side effect: every <a> inside the character, screen-name and author divs
    has its `href` attribute deleted in place, so `chunk_dom` is modified.

    Fails an assertion if the chunk has no 'post-content' div, and raises
    KeyError if the icon <img> has no `src` attribute.
    """

    def find_div(css_class: str) -> Tag | None:
        div = chunk_dom.find('div', class_=css_class)
        assert div is None or isinstance(div, Tag)
        return div

    def icon() -> str | None:
        icon_div = find_div('post-icon')
        if icon_div is None:
            return None
        img = icon_div.find('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        div = find_div(css_class)
        if div is not None:
            for link in div.find_all("a"):
                # Drop the link target but keep the <a> and its text.
                link.attrs.pop("href", None)
        return div

    # Validate the mandatory part first, before any DOM mutation happens.
    content = find_div('post-content')
    assert isinstance(content, Tag)

    return Chunk(icon(),
                 meta('post-character'),
                 meta('post-screenname'),
                 meta('post-author'),
                 content)
162
163
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    """Return the LaTeX for one chunk.

    The output is a four-argument `glowhead` macro call (icon, character,
    screen name, author, in that order) immediately followed by the texified
    content.  A missing or empty header field becomes an empty argument.
    """

    def texified(tag: Tag | None) -> bytes:
        return texifier.texify(tag) if tag else b''

    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''
    # Texify in the same order the pieces appear in the output.
    character = texified(chunk.character)
    screen_name = texified(chunk.screen_name)
    author = texified(chunk.author)
    content = texifier.texify(chunk.content)

    pieces = (
        br'\glowhead{', icon,
        b'}{', character,
        b'}{', screen_name,
        b'}{', author,
        b'}', content,
    )
    return b''.join(pieces)
176
177
# LaTeX definition of \glowhead, the four-argument macro that renderChunk
# emits ahead of each chunk's content (#1 icon, #2 character, #3 screen name,
# #4 author).  This variant expands to nothing, so only the content remains.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
181
182
# LaTeX definition of \glowhead (#1 icon, #2 character, #3 screen name,
# #4 author, as emitted by renderChunk).  This variant sets the header in a
# framed box inside a `wrapstuff` environment (option `l`): the icon comes
# first, then the character, screen name and author centered beneath it.
# \ifnotempty adds the line break after an argument only when that argument
# is non-empty.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
\smash{\parbox[t][0pt]{0pt}{
\setlength{\fboxrule}{0.2pt}
\setlength{\fboxsep}{0pt}
\vspace{-3.4pt}
\fbox{\hspace{107mm}}
}\\*}
\vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
208
209
# LaTeX definition of \glowhead (#1 icon, #2 character, #3 screen name,
# #4 author, as emitted by renderChunk).  This variant puts the icon and a
# \parbox holding the centered character, screen name and author inside one
# framed box.  \ifnotempty adds the line break after an argument only when
# that argument is non-empty.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''