]> git.scottworley.com Git - paperdoorknob/blob - glowfic.py
Fetch the non-flat view to get the next-thread link
[paperdoorknob] / glowfic.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 from dataclasses import dataclass
9 import itertools
10 from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
11
12 from typing import Iterable
13
14 from bs4 import BeautifulSoup
15 from bs4.element import Tag
16
17 from images import ImageStore
18 from spec import Spec
19 from texify import Texifier
20
21
def _removeViewFromURL(url: str) -> str:
    """Return *url* with every ``view`` query parameter removed.

    All other query parameters are kept, in their original order.  Blank
    values are preserved (``?a=`` stays ``?a=``); ``parse_qsl`` would
    otherwise drop such parameters silently when the query is rebuilt.
    """
    u = urlparse(url)
    old_qs = parse_qsl(u.query, keep_blank_values=True)
    new_qs = [(k, v) for k, v in old_qs if k != 'view']
    return urlunparse(u._replace(query=urlencode(new_qs)))


def nonFlatURL(url: str) -> str:
    """Return *url* without any ``view`` query parameter."""
    return _removeViewFromURL(url)


def flatURL(url: str) -> str:
    """Return *url* with its ``view`` query parameter forced to ``flat``.

    Any existing ``view`` parameter is removed first, and ``view=flat`` is
    appended after the remaining parameters.
    """
    u = urlparse(_removeViewFromURL(url))
    # Keep blank-valued parameters here too, or they would be lost on this
    # second round trip through parse_qsl/urlencode.
    qs = parse_qsl(u.query, keep_blank_values=True) + [('view', 'flat')]
    return urlunparse(u._replace(query=urlencode(qs)))
37
38
@dataclass(frozen=True)
class Chunk:
    """One immutable chunk of a thread: optional header parts plus content.

    Every header field may be None when the chunk has no such element;
    ``content`` is always present.
    """

    # Icon reference as a string, or None when the chunk has no icon.
    icon: str | None
    # Header elements; None when the chunk has no such element.
    character: Tag | None
    screen_name: Tag | None
    author: Tag | None
    # The body of the chunk.
    content: Tag
46
47 # We avoid the name "post" because the Glowfic community uses the term
48 # inconsistently:
49 # * The Glowfic software sometimes uses "post" to refer to a whole thread
50 # (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
51 # but mostly uses "post" to refer to just the first chunk in a thread
52 # (in the HTML and UI). The non-first chunks are "replies".
53 # * Readers and this software don't need to distinguish first-chunks and
54 # non-first-chunks.
55 # * Humans in the community tend to use "posts" to mean chunks.
56
57
class Thread:
    """A thread fetched and parsed from the pages behind ``spec.url``.

    Two pages are fetched on construction: the page at ``spec.url`` itself,
    which is used only to find the link to the next thread, and the flat
    view of the same URL, which supplies the title and the chunks.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse both views of the thread described by *spec*.

        Both pages are passed through ``spec.htmlfilter`` before parsing.
        """

        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # The first link inside a 'post-navheader' div whose text
            # contains 'Next Post' and whose href is a plain string.
            for nav in dom.find_all('div', class_='post-navheader'):
                for a in nav.find_all('a'):
                    href = a.attrs.get('href')
                    if 'Next Post' in a.text and isinstance(href, str):
                        return href
            return None

        spec.log('Fetching HTML...\r')
        # NOTE(review): spec.url is fetched exactly as given.  If it can
        # already carry view=flat, nonFlatURL(spec.url) may be what is
        # intended here -- confirm against callers.
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._next_thread = find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')

    def title(self) -> str | None:
        """Return the stripped text of the ``span#post-title``, or None."""
        span = self._dom.find("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the href of the 'Next Post' link, or None if none was found."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk: the first chunk, then the replies.

        Raises AssertionError if the flat view has no body or no
        ``div.post-post`` element.
        """

        def text() -> Tag:
            body = self._dom.body
            assert body
            text = body.find_next("div", class_="post-post")
            assert isinstance(text, Tag)
            return text

        def the_replies() -> Iterable[Tag]:
            rs = self._dom.find_all("div", class_="post-reply")
            assert all(isinstance(r, Tag) for r in rs)
            return rs

        return itertools.chain([text()], the_replies())
100
101
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a Chunk from the DOM of a single chunk.

    The icon is resolved through ``image_store.get_image``.  The character,
    screen-name and author elements are returned with the ``href`` of every
    link inside them removed; this modifies *chunk_dom* in place.

    Raises AssertionError if *chunk_dom* has no ``div.post-content``.
    """

    def getIcon() -> str | None:
        # No icon div, or an icon div without an <img>, means no icon.
        icon_div = chunk_dom.find('div', class_='post-icon')
        if icon_div is None:
            return None
        assert isinstance(icon_div, Tag)
        icon_img = icon_div.find('img')
        if icon_img is None:
            return None
        assert isinstance(icon_img, Tag)
        return image_store.get_image(icon_img.attrs['src'])

    def getByClass(css_class: str) -> Tag | None:
        tag = chunk_dom.find('div', class_=css_class)
        assert tag is None or isinstance(tag, Tag)
        return tag

    def stripHREF(tag: Tag) -> None:
        # Drop the link targets but keep the <a> elements and their text.
        for a in tag.find_all("a"):
            a.attrs.pop("href", None)

    def getMeta(css_class: str) -> Tag | None:
        tag = getByClass(css_class)
        if tag is None:
            return None
        stripHREF(tag)
        return tag

    content = chunk_dom.find('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(getIcon(),
                 getMeta('post-character'),
                 getMeta('post-screenname'),
                 getMeta('post-author'),
                 content)
140
141
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    """Render *chunk* as LaTeX: a ``\\glowhead`` call followed by the content.

    ``\\glowhead`` receives four brace-delimited arguments in this order:
    icon, character, screen name, author.  An argument is left empty when
    the chunk lacks that part.  The icon is wrapped in ``\\glowicon{...}``;
    the other parts and the content are converted with ``texifier.texify``.
    """
    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''
    character = texifier.texify(chunk.character) if chunk.character else b''
    screen_name = (
        texifier.texify(chunk.screen_name) if chunk.screen_name else b'')
    author = texifier.texify(chunk.author) if chunk.author else b''
    body = texifier.texify(chunk.content)

    pieces = (
        br'\glowhead{', icon,
        b'}{', character,
        b'}{', screen_name,
        b'}{', author,
        b'}', body,
    )
    return b''.join(pieces)
154
155
# LaTeX definition of \glowhead for a content-only layout: the macro accepts
# the four header arguments that renderChunk emits (icon, character, screen
# name, author) and expands to nothing.
156 ContentOnlyLayout = br'''
157 \newcommand{\glowhead}[4]{}
158 '''
159
160
# LaTeX definition of \glowhead that puts the header in a framed box inside a
# left-placed wrapstuff environment.  Inside the box the four arguments
# (#1 icon, #2 character, #3 screen name, #4 author, in the order renderChunk
# emits them) are stacked and centered; a line break follows an argument only
# when that argument is not empty.
161 BelowIconLayout = br'''
162 \newcommand{\glowhead}[4]{\wrapstuffclear
163 \begin{wrapstuff}[l]
164 \fbox{
165 \begin{varwidth}{0.5\textwidth}
166 \smash{\parbox[t][0pt]{0pt}{
167 \setlength{\fboxrule}{0.2pt}
168 \setlength{\fboxsep}{0pt}
169 \vspace{-3.4pt}
170 \fbox{\hspace{107mm}}
171 }\\*}
172 \vspace{-1em}
173 \begin{center}
174 #1\ifnotempty
175 {#1}{\\*}#2\ifnotempty
176 {#2}{\\*}#3\ifnotempty
177 {#3}{\\*}#4
178 \end{center}
179 \end{varwidth}
180 }
181 \end{wrapstuff}
182
183 \strut
184
185 \noindent}'''
186
187
# LaTeX definition of \glowhead that frames the whole header in one \fbox:
# argument #1 (the icon, as emitted by renderChunk) comes first, followed by
# a \parbox holding #2-#4 (character, screen name, author) centered, with a
# line break after an argument only when that argument is not empty.
188 # Why is \textwidth not the width of the text?
189 # Why is the width of the text .765\textwidth?
190 BesideIconLayout = br'''
191 \newcommand{\glowhead}[4]{
192
193 \strut
194
195 \noindent\fbox{
196 #1
197 \parbox[b]{.765\textwidth}{
198 \begin{center}
199 #2\ifnotempty
200 {#2}{\\*}#3\ifnotempty
201 {#3}{\\*}#4
202 \end{center}
203 }
204 }\\*
205 \vspace{-0.75em}\\*
206 \noindent}'''