]> git.scottworley.com Git - paperdoorknob/blob - glowfic.py
Always have Thread.__init__ fetch the HTML
[paperdoorknob] / glowfic.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 from dataclasses import dataclass
9 import itertools
10 from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
11
12 from typing import Iterable
13
14 from bs4 import BeautifulSoup
15 from bs4.element import Tag
16
17 from images import ImageStore
18 from spec import Spec
19 from texify import Texifier
20
21
22 def _removeViewFromURL(url: str) -> str:
23 u = urlparse(url)
24 old_qs = parse_qsl(u.query)
25 new_qs = [(k, v) for k, v in old_qs if k != 'view']
26 return urlunparse(u._replace(query=urlencode(new_qs)))
27
28
29 def nonFlatURL(url: str) -> str:
30 return _removeViewFromURL(url)
31
32
33 def flatURL(url: str) -> str:
34 u = urlparse(_removeViewFromURL(url))
35 qs = parse_qsl(u.query) + [('view', 'flat')]
36 return urlunparse(u._replace(query=urlencode(qs)))
37
38
@dataclass(frozen=True)
class Chunk:
    """One chunk of a thread: its optional header parts plus its content."""

    # Icon reference as a string, or None when the chunk has no icon.
    icon: str | None
    # Header elements; each is None when the chunk lacks that element.
    character: Tag | None
    screen_name: Tag | None
    author: Tag | None
    # The chunk's body; always present.
    content: Tag

    # We avoid the name "post" because the Glowfic community uses the term
    # inconsistently:
    # * The Glowfic software sometimes uses "post" to refer to a whole thread
    # (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
    # but mostly uses "post" to refer to just the first chunk in a thread
    # (in the HTML and UI). The non-first chunks are "replies".
    # * Readers and this software don't need to distinguish first-chunks and
    # non-first-chunks.
    # * Humans in the community tend to use "posts" to mean chunks.
56
57
class Thread:
    """A Glowfic thread, downloaded in flat view and parsed into a DOM."""

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse the thread that spec.url points to.

        The flat-view form of spec.url is requested through spec.fetcher,
        the response is passed through spec.htmlfilter, and the result is
        parsed with html.parser.  Progress is reported through spec.log.
        """
        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._dom = BeautifulSoup(spec.htmlfilter(html), 'html.parser')

    def title(self) -> str | None:
        """Return the stripped text of span#post-title, or None if absent."""
        # find() is the current name for the deprecated findChild() alias.
        span = self._dom.find("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk: the first chunk, then the replies.

        The first chunk is the first div.post-post found after <body>; the
        replies are all div.post-reply elements in document order.

        Raises:
            AssertionError: if the document has no body or no div.post-post.
        """
        body = self._dom.body
        assert body
        first_chunk = body.find_next("div", class_="post-post")
        assert isinstance(first_chunk, Tag)

        replies = self._dom.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in replies)

        return itertools.chain([first_chunk], replies)
86
87
def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a Chunk from the DOM of a single chunk.

    The icon is looked up through image_store; character, screen name and
    author come from the div.post-character, div.post-screenname and
    div.post-author elements, and the content from div.post-content.

    Note that the href attribute of every link inside the character, screen
    name and author elements is deleted, which modifies chunk_dom in place.

    Raises:
        AssertionError: if chunk_dom has no div.post-content.
        KeyError: if the icon <img> has no src attribute.
    """

    def getIcon() -> str | None:
        """Return the stored image for the chunk's icon, or None if no icon."""
        icon_div = chunk_dom.find('div', class_='post-icon')
        if icon_div is None:
            return None
        assert isinstance(icon_div, Tag)
        icon_img = icon_div.find('img')
        if icon_img is None:
            return None
        assert isinstance(icon_img, Tag)
        return image_store.get_image(icon_img.attrs['src'])

    def getByClass(css_class: str) -> Tag | None:
        """Return the first div with the given CSS class, or None."""
        tag = chunk_dom.find('div', class_=css_class)
        assert tag is None or isinstance(tag, Tag)
        return tag

    def stripHREF(tag: Tag) -> None:
        """Delete the href attribute from every <a> inside tag, in place."""
        for c in tag.find_all("a"):
            if "href" in c.attrs:
                del c.attrs["href"]

    def getMeta(css_class: str) -> Tag | None:
        """Return the div with the given class with link targets removed."""
        tag = getByClass(css_class)
        if tag is None:
            return None
        stripHREF(tag)
        return tag

    # find()/find_all() are the current names for the deprecated
    # findChild()/findChildren() aliases.
    content = chunk_dom.find('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(getIcon(),
                 getMeta('post-character'),
                 getMeta('post-screenname'),
                 getMeta('post-author'),
                 content)
126
127
def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    """Render chunk as a \\glowhead{icon}{character}{screen name}{author}
    call followed by the texified content.

    Header parts that are missing (falsy) are rendered as empty arguments.
    """

    def texifyIfPresent(tag: Tag | None) -> bytes:
        if tag:
            return texifier.texify(tag)
        return b''

    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''

    # Texify in the same order the pieces appear in the output.
    character = texifyIfPresent(chunk.character)
    screen_name = texifyIfPresent(chunk.screen_name)
    author = texifyIfPresent(chunk.author)
    body = texifier.texify(chunk.content)

    pieces = (
        br'\glowhead{', icon,
        b'}{', character,
        b'}{', screen_name,
        b'}{', author,
        b'}', body,
    )
    return b''.join(pieces)
140
141
# LaTeX layout that defines \glowhead as a four-argument command with an
# empty body, so the header emitted by renderChunk produces no output and
# only the chunk content is typeset.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
145
146
# LaTeX layout that defines \glowhead{#1}{#2}{#3}{#4} (renderChunk passes
# icon, character, screen name and author in that order) as a framed,
# centered varwidth box inside a left-placed wrapstuff environment.  The
# four arguments are stacked in order, each of the first three followed by
# an \ifnotempty-guarded line break.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
\smash{\parbox[t][0pt]{0pt}{
\setlength{\fboxrule}{0.2pt}
\setlength{\fboxsep}{0pt}
\vspace{-3.4pt}
\fbox{\hspace{107mm}}
}\\*}
\vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
172
173
# LaTeX layout that defines \glowhead{#1}{#2}{#3}{#4} (renderChunk passes
# icon, character, screen name and author in that order) as a single framed
# box: #1 comes first, followed by a \parbox in which #2, #3 and #4 are
# centered, with \ifnotempty-guarded line breaks after #2 and #3.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''