]> git.scottworley.com Git - paperdoorknob/blame - glowfic.py
next_thread should be an absolute URL
[paperdoorknob] / glowfic.py
CommitLineData
e6adf6ce
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
aa060d9b 8from dataclasses import dataclass
e6adf6ce 9import itertools
fc2aaa75 10from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
e6adf6ce
SW
11
12from typing import Iterable
13
14from bs4 import BeautifulSoup
15from bs4.element import Tag
16
aa060d9b 17from images import ImageStore
70adfbff 18from spec import Spec
d2a41ff4 19from texify import Texifier
aa060d9b
SW
20
21
1452f8d3
SW
def _removeViewFromURL(url: str) -> str:
    """Return *url* with every ``view`` query parameter removed.

    All other query parameters are kept in their original order,
    including parameters whose value is empty (``?a=&b=1``).  The query
    string is re-encoded with ``urlencode``, so its escaping may be
    normalised even when nothing is removed.
    """
    parts = urlparse(url)
    # keep_blank_values=True: by default parse_qsl silently discards
    # parameters with an empty value, which would make this function
    # drop more than just 'view'.
    kept = [(key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != 'view']
    return urlunparse(parts._replace(query=urlencode(kept)))


def nonFlatURL(url: str) -> str:
    """Return the variant of *url* that carries no ``view`` parameter."""
    return _removeViewFromURL(url)


def flatURL(url: str) -> str:
    """Return the variant of *url* whose query ends in ``view=flat``.

    Any ``view`` parameter already present is removed first, so the
    result carries exactly one ``view`` parameter, placed last.
    """
    parts = urlparse(_removeViewFromURL(url))
    # Same reasoning as in _removeViewFromURL: blank-valued parameters
    # must survive this second parse as well.
    params = parse_qsl(parts.query, keep_blank_values=True)
    params.append(('view', 'flat'))
    return urlunparse(parts._replace(query=urlencode(params)))
37
38
aa060d9b
SW
@dataclass(frozen=True)
class Chunk:
    """Immutable record of the parts of one chunk, as built by makeChunk."""

    # Value returned by the image store for the chunk's icon image, or
    # None when the chunk has no icon.
    icon: str | None
    # The 'post-character', 'post-screenname' and 'post-author' divs of
    # the chunk; each is None when the chunk has no such div.
    character: Tag | None
    screen_name: Tag | None
    author: Tag | None
    # The 'post-content' div; always present.
    content: Tag
46
e6adf6ce
SW
# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#  * The Glowfic software sometimes uses "post" to refer to a whole thread
#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#    but mostly uses "post" to refer to just the first chunk in a thread
#    (in the HTML and UI).  The non-first chunks are "replies".
#  * Readers and this software don't need to distinguish first-chunks and
#    non-first-chunks.
#  * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce
SW
56
57
94027099
SW
class Thread:
    """A single Glowfic thread, fetched and parsed at construction time.

    Two renderings of the thread are fetched through ``spec.fetcher``:
    the page at ``spec.url`` itself, which is only searched for the
    "Next Post" navigation link, and its ``view=flat`` variant (see
    ``flatURL``), whose parsed DOM is kept and used for the title and
    the chunks.  Both pages are passed through ``spec.htmlfilter``
    before parsing.
    """

    def __init__(self, spec: Spec) -> None:
        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # Return the target of the first link inside a
            # 'post-navheader' div whose text contains "Next Post" and
            # whose href is a plain string; None if there is no such link.
            for header in dom.findChildren('div', class_='post-navheader'):
                for link in header.findChildren('a'):
                    if 'Next Post' not in link.text:
                        continue
                    href = link.attrs.get('href')
                    if isinstance(href, str):
                        # Resolve against the thread's own URL so that
                        # callers always receive an absolute URL.
                        return urljoin(spec.url, href)
            return None

        spec.log('Fetching HTML...\r')
        page_html = spec.fetcher.fetch(spec.url)
        flat_page_html = spec.fetcher.fetch(flatURL(spec.url))

        spec.log('Parsing HTML...\r')
        page_dom = BeautifulSoup(spec.htmlfilter(page_html), 'html.parser')
        self._next_thread = find_next_thread(page_dom)
        self._dom = BeautifulSoup(
            spec.htmlfilter(flat_page_html), 'html.parser')

    def title(self) -> str | None:
        """Return the stripped text of the 'post-title' span, if present."""
        title_span = self._dom.findChild("span", id="post-title")
        return title_span.text.strip() if isinstance(title_span, Tag) else None

    def next_thread(self) -> str | None:
        """Return the "Next Post" URL found at construction, or None."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk: first chunk, then the replies.

        The first chunk is the first 'post-post' div after the document
        body; the replies are all 'post-reply' divs in document order.
        A missing body or first chunk trips an assertion.
        """
        body = self._dom.body
        assert body
        first_chunk = body.find_next("div", class_="post-post")
        assert isinstance(first_chunk, Tag)

        replies = self._dom.find_all("div", class_="post-reply")
        assert all(isinstance(reply, Tag) for reply in replies)

        return itertools.chain([first_chunk], replies)
aa060d9b
SW
100
101
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a Chunk from the DOM of one chunk.

    The chunk must contain a 'post-content' div (asserted).  The icon
    and the character / screen name / author divs are optional and
    become None when absent.

    Side effects: the icon's ``src`` is handed to
    ``image_store.get_image``, and every link inside the three metadata
    divs loses its ``href`` attribute in place, i.e. *chunk_dom* is
    mutated.
    """

    def icon() -> str | None:
        # The icon is the first <img> inside the 'post-icon' div; either
        # level may be missing.
        holder = chunk_dom.findChild('div', class_='post-icon')
        if holder is None:
            return None
        assert isinstance(holder, Tag)
        img = holder.findChild('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        # Look up one metadata div and strip the hrefs of the links it
        # contains (presumably so they are not rendered as hyperlinks).
        found = chunk_dom.findChild('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        if found is not None:
            for link in found.findChildren("a"):
                link.attrs.pop("href", None)
        return found

    # Checked before anything else so that a chunk without content fails
    # before any image is requested or any href is removed.
    content = chunk_dom.findChild('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(
        icon=icon(),
        character=meta('post-character'),
        screen_name=meta('post-screenname'),
        author=meta('post-author'),
        content=content,
    )
d2a41ff4
SW
140
141
1fac41bf
SW
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Render *chunk* as LaTeX source.

    The output is one ``\glowhead{icon}{character}{screen name}{author}``
    call immediately followed by the texified content.  A missing icon,
    character, screen name or author yields an empty argument; a present
    icon is wrapped in ``\glowicon{...}`` using its UTF-8 encoding.
    """
    # Evaluated in the same order in which the pieces appear in the output.
    icon = (br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
            if chunk.icon else b'')
    character = texifier.texify(chunk.character) if chunk.character else b''
    screen_name = (texifier.texify(chunk.screen_name)
                   if chunk.screen_name else b'')
    author = texifier.texify(chunk.author) if chunk.author else b''
    content = texifier.texify(chunk.content)

    return b''.join((
        br'\glowhead{', icon,
        b'}{', character,
        b'}{', screen_name,
        b'}{', author,
        b'}', content,
    ))
d2a41ff4 154
d2a41ff4 155
1fac41bf
SW
# LaTeX definition of \glowhead for a layout that shows chunk content
# only: the macro takes the four header arguments that renderChunk emits
# (icon, character, screen name, author) and expands to nothing.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
d2a41ff4 159
d2a41ff4 160
1fac41bf
SW
# LaTeX definition of \glowhead for a layout in which the header is a
# framed box set inside a left-aligned wrapstuff environment.  Inside the
# box the four arguments emitted by renderChunk (#1 icon, #2 character,
# #3 screen name, #4 author) are centred one after another; each of the
# first three is followed by \ifnotempty{#n}{\\*}, which presumably adds a
# line break only when that argument is non-empty (\ifnotempty is defined
# elsewhere).
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
 \smash{\parbox[t][0pt]{0pt}{
 \setlength{\fboxrule}{0.2pt}
 \setlength{\fboxsep}{0pt}
 \vspace{-3.4pt}
 \fbox{\hspace{107mm}}
 }\\*}
 \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
f75c1629
SW
186
187
1fac41bf
SW
# LaTeX definition of \glowhead for a layout in which the icon (#1) and a
# \parbox holding the character, screen name and author (#2-#4, centred)
# share one framed box.  #2 and #3 are each followed by
# \ifnotempty{#n}{\\*}, which presumably adds a line break only when that
# argument is non-empty (\ifnotempty is defined elsewhere).
#
# The .765 factor on the \parbox width was found empirically; the open
# questions behind it:
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''