]> git.scottworley.com Git - paperdoorknob/blob - glowfic.py
Move per-thread processing stuff into Thread
[paperdoorknob] / glowfic.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 from dataclasses import dataclass
9 import itertools
10 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
11
12 from typing import Any, Iterable
13
14 from bs4 import BeautifulSoup
15 from bs4.element import Tag
16
17 from images import ImageStore
18 from spec import Spec
19 from texify import Texifier
20
21
def ilen(it: Iterable[Any]) -> int:
    """Return the number of items produced by *it*.

    The iterable is walked to the end in order to count it, so a one-shot
    iterator is exhausted afterwards.
    """
    count = 0
    for _ in it:
        count += 1
    return count
24
25
26 def _removeViewFromURL(url: str) -> str:
27 u = urlparse(url)
28 old_qs = parse_qsl(u.query)
29 new_qs = [(k, v) for k, v in old_qs if k != 'view']
30 return urlunparse(u._replace(query=urlencode(new_qs)))
31
32
def nonFlatURL(url: str) -> str:
    """Return the non-flat form of *url*.

    This is *url* with any ``view`` query parameter taken out.
    """
    without_view = _removeViewFromURL(url)
    return without_view
35
36
def flatURL(url: str) -> str:
    """Return *url* rewritten to request the flat view.

    Any ``view`` parameter already present is discarded and a single
    ``view=flat`` is appended after the remaining query parameters, so the
    result is the same whether or not *url* was already a flat-view URL.
    """
    u = urlparse(url)
    # Parse the query once.  keep_blank_values stops parse_qsl from silently
    # discarding unrelated parameters that have an empty value ("?page=").
    kept = [(k, v)
            for k, v in parse_qsl(u.query, keep_blank_values=True)
            if k != 'view']
    qs = kept + [('view', 'flat')]
    return urlunparse(u._replace(query=urlencode(qs)))
41
42
@dataclass(frozen=True)
class Chunk:
    """Immutable record of one chunk of a thread, as built by makeChunk.

    The three header fields and the icon are optional; the content is always
    present.
    """

    # Whatever the image store returned for the chunk's icon image, or None
    # when the chunk has no icon.
    icon: str | None

    # The 'post-character' div, or None when the chunk has none.
    character: Tag | None

    # The 'post-screenname' div, or None when the chunk has none.
    screen_name: Tag | None

    # The 'post-author' div, or None when the chunk has none.
    author: Tag | None

    # The 'post-content' div holding the body of the chunk.
    content: Tag
50
51 # We avoid the name "post" because the Glowfic community uses the term
52 # inconsistently:
53 # * The Glowfic software sometimes uses "post" to refer to a whole thread
54 # (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
55 # but mostly uses "post" to refer to just the first chunk in a thread
56 # (in the HTML and UI). The non-first chunks are "replies".
57 # * Readers and this software don't need to distinguish first-chunks and
58 # non-first-chunks.
59 # * Humans in the community tend to use "posts" to mean chunks.
60
61
class Thread:
    """A Glowfic thread, fetched and parsed at construction time.

    Construction fetches two pages through ``spec.fetcher``: the page at
    ``spec.url``, which is searched only for the "Next Post" link, and the
    flat view of the same URL, which supplies the title and the chunks.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch both views of the thread at ``spec.url`` and parse them."""
        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # The first "Next Post" link found inside a post-navheader div,
            # resolved against spec.url; None when there is no such link.
            for c in dom.findChildren('div', class_='post-navheader'):
                for a in c.findChildren('a'):
                    if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
                            a.attrs['href'], str):
                        return urljoin(spec.url, a.attrs['href'])
            return None

        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        # Only the next-thread link is kept from the page at spec.url; its
        # DOM is discarded once the link has been extracted.
        self._next_thread = find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
        self._spec = spec

    def title(self) -> str | None:
        """Return the stripped text of the ``span#post-title`` element.

        Returns None when the flat-view page has no such element.
        """
        span = self._dom.findChild("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the absolute URL of the "Next Post" link, or None."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk: the opening post, then the replies.

        Both queries run when this method is called, not lazily during
        iteration.  An AssertionError is raised when the page has no body or
        no ``div.post-post``.
        """
        def text() -> Tag:
            body = self._dom.body
            assert body
            text = body.find_next("div", class_="post-post")
            assert isinstance(text, Tag)
            return text

        def the_replies() -> Iterable[Tag]:
            rs = self._dom.find_all("div", class_="post-reply")
            assert all(isinstance(r, Tag) for r in rs)
            return rs

        return itertools.chain([text()], the_replies())

    def emit(self) -> None:
        """Render every chunk and write the result to ``spec.texout``.

        Each chunk DOM is passed through ``spec.domfilter``, converted with
        makeChunk and renderChunk, passed through ``spec.texfilter`` and
        written out.  Progress is reported through ``spec.log``.
        """
        self._spec.log('Counting chunks...\r')
        # Scan the DOM once and reuse the result for both the total and the
        # loop, instead of running the same whole-document queries twice.
        chunks = list(self.chunkDOMs())
        num_chunks = len(chunks)
        title = self.title() or "chunk"
        for i, r in enumerate(chunks):
            percent = 100.0 * i / num_chunks
            self._spec.log(
                f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
            self._spec.domfilter(r)
            chunk = makeChunk(r, self._spec.images)
            self._spec.texout.write(
                self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
        self._spec.log('')
119
120
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a Chunk from the DOM of a single post or reply.

    The icon, if any, is handed to *image_store*.  ``href`` attributes are
    removed in place from every link inside the character, screen-name and
    author divs, so *chunk_dom* is modified.  An AssertionError is raised
    when the chunk has no ``div.post-content``.
    """

    def icon_path() -> str | None:
        # No icon div, or an icon div without an <img>, both mean "no icon".
        wrapper = chunk_dom.findChild('div', class_='post-icon')
        if wrapper is None:
            return None
        assert isinstance(wrapper, Tag)
        img = wrapper.findChild('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        # Look up one header div and drop the href from each link inside it.
        found = chunk_dom.findChild('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        if found is not None:
            for link in found.findChildren("a"):
                link.attrs.pop("href", None)
        return found

    body = chunk_dom.findChild('div', class_='post-content')
    assert isinstance(body, Tag)

    return Chunk(
        icon=icon_path(),
        character=meta('post-character'),
        screen_name=meta('post-screenname'),
        author=meta('post-author'),
        content=body,
    )
159
160
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Render *chunk* as a ``\glowhead`` call followed by its content.

    ``\glowhead`` receives four brace-delimited arguments in this order:
    icon (wrapped in ``\glowicon{...}``), character, screen name, author.
    A missing field yields an empty argument.  The texified content follows
    immediately after the closing brace of the fourth argument.
    """
    def tex_or_empty(tag: Tag | None) -> bytes:
        return texifier.texify(tag) if tag else b''

    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''

    # Texify in header order, then the content, one call per field.
    character = tex_or_empty(chunk.character)
    screen_name = tex_or_empty(chunk.screen_name)
    author = tex_or_empty(chunk.author)
    content = texifier.texify(chunk.content)

    header_args = b'}{'.join([icon, character, screen_name, author])
    return b''.join([br'\glowhead{', header_args, b'}', content])
173
174
# TeX layout that prints chunk content only: \glowhead takes its four
# arguments (icon, character, screen name, author) and expands to nothing.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
178
179
# TeX layout that sets the header in a framed, centered box placed at the
# left with wrapstuff.  The box stacks the icon (#1), character (#2), screen
# name (#3) and author (#4); a line break follows each of the first three
# only when that argument is non-empty.  The smashed zero-size \parbox
# appears to draw a thin 107mm horizontal rule -- confirm against the
# rendered output before changing it.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
\smash{\parbox[t][0pt]{0pt}{
\setlength{\fboxrule}{0.2pt}
\setlength{\fboxsep}{0pt}
\vspace{-3.4pt}
\fbox{\hspace{107mm}}
}\\*}
\vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
205
206
# TeX layout that sets the header in one framed row: the icon (#1) followed
# by a bottom-aligned \parbox holding the character (#2), screen name (#3)
# and author (#4), centered.  A line break follows #2 and #3 only when that
# argument is non-empty.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
223 }\\*
224 \vspace{-0.75em}\\*
225 \noindent}'''