]>
git.scottworley.com Git - paperdoorknob/blob - glowfic.py
94fbd34bcfa00c4041d4fe74c5796775c162ca60
1 # paperdoorknob: Print glowfic
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
10 from urllib
.parse
import parse_qsl
, urlencode
, urljoin
, urlparse
, urlunparse
12 from typing
import Any
, Iterable
14 from bs4
import BeautifulSoup
15 from bs4
.element
import Tag
17 from images
import ImageStore
19 from texify
import Texifier
def ilen(it: Iterable[Any]) -> int:
    """Return the number of items produced by ``it``.

    The iterable is walked to the end in order to count it, so a one-shot
    iterator is exhausted once this returns.
    """
    count = 0
    for _ in it:
        count += 1
    return count
26 def _removeViewFromURL(url
: str) -> str:
28 old_qs
= parse_qsl(u
.query
)
29 new_qs
= [(k
, v
) for k
, v
in old_qs
if k
!= 'view']
30 return urlunparse(u
._replace
(query
=urlencode(new_qs
)))
def nonFlatURL(url: str) -> str:
    """Return the non-flat form of ``url``: ``url`` without any ``view`` query parameter."""
    without_view = _removeViewFromURL(url)
    return without_view
def flatURL(url: str) -> str:
    """Return the flat-view form of ``url``.

    Any ``view`` parameter already present is dropped first, then a single
    ``view=flat`` pair is appended after the remaining query parameters.
    """
    parsed = urlparse(_removeViewFromURL(url))
    params = parse_qsl(parsed.query)
    params.append(('view', 'flat'))
    flat_query = urlencode(params)
    return urlunparse(parsed._replace(query=flat_query))
@dataclasses.dataclass(frozen=True)
class Chunk:
    """The extracted pieces of one chunk of a thread.

    Instances are immutable.  Fields are declared in the order in which
    makeChunk passes them positionally.
    """
    # Value returned by the image store for the chunk's icon, or None when
    # the chunk has no icon.
    icon: str | None
    # Element with CSS class 'post-character', or None when absent.
    character: Tag | None
    # Element with CSS class 'post-screenname', or None when absent.
    screen_name: Tag | None
    # Element with CSS class 'post-author', or None when absent.
    author: Tag | None
    # Element with CSS class 'post-content'; always present.
    content: Tag
# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#   * The Glowfic software sometimes uses "post" to refer to a whole thread
#     (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#     but mostly uses "post" to refer to just the first chunk in a thread
#     (in the HTML and UI).  The non-first chunks are "replies".
#   * Readers and this software don't need to distinguish first-chunks and
#     non-first chunks.
#   * Humans in the community tend to use "posts" to mean chunks.
def __init__(self, spec: Spec) -> None:
    """Fetch and parse the thread located at ``spec.url``.

    Two pages are fetched through ``spec.fetcher``: the URL exactly as
    given, which is only searched for the link to the next thread, and the
    flat-view URL, whose parsed DOM is kept for the other methods.  Both
    pages pass through ``spec.htmlfilter`` before being parsed.
    """
    def find_next_thread(dom: BeautifulSoup) -> str | None:
        # Look in the navigation header(s) for a 'Next Post' link and
        # resolve its href against the URL of the current thread.
        for c in dom.findChildren('div', class_='post-navheader'):
            for a in c.findChildren('a'):
                if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
                        a.attrs['href'], str):
                    return urljoin(spec.url, a.attrs['href'])
        return None

    # Kept because emit() reads its log, filter, image and output members.
    self._spec = spec
    spec.log('Fetching HTML...\r')
    html = spec.fetcher.fetch(spec.url)
    flat_html = spec.fetcher.fetch(flatURL(spec.url))
    spec.log('Parsing HTML...\r')
    self._next_thread = find_next_thread(
        BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
    self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
def title(self) -> str | None:
    """Return the thread title with surrounding whitespace stripped.

    The title is the text of the ``span`` element whose id is
    ``post-title``; ``None`` is returned when no such element is found.
    """
    span = self._dom.findChild("span", id="post-title")
    if not isinstance(span, Tag):
        # No title element in this document.
        return None
    return span.text.strip()
88 def next_thread(self
) -> str |
None:
89 return self
._next
_thread
91 def chunkDOMs(self
) -> Iterable
[Tag
]:
95 text
= body
.find_next("div", class_
="post-post")
96 assert isinstance(text
, Tag
)
99 def the_replies() -> Iterable
[Tag
]:
100 rs
= self
._dom
.find_all("div", class_
="post-reply")
101 assert all(isinstance(r
, Tag
) for r
in rs
)
104 return itertools
.chain([text()], the_replies())
def emit(self) -> None:
    """Render every chunk of this thread to ``texout``, then continue with the next thread.

    The chunks are enumerated twice: once to count them, so progress can
    be reported as a percentage, and once to process them.  Each chunk DOM
    is passed to ``domfilter``, turned into a Chunk, rendered, passed
    through ``texfilter`` and written out.  If the thread has a next
    thread, a new Thread is built for that URL (same spec otherwise) and
    emitted recursively.
    """
    self._spec.log('Counting chunks...\r')
    num_chunks = ilen(self.chunkDOMs())
    title = self.title() or "chunk"
    for i, r in enumerate(self.chunkDOMs()):
        # The loop body only runs when there is at least one chunk, so
        # num_chunks is non-zero here.
        percent = 100.0 * i / num_chunks
        self._spec.log(
            f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
        self._spec.domfilter(r)
        chunk = makeChunk(r, self._spec.images)
        self._spec.texout.write(
            self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))

    next_url = self.next_thread()
    if next_url is not None:
        Thread(dataclasses.replace(self._spec, url=next_url)).emit()
124 def makeChunk(chunk_dom
: Tag
, image_store
: ImageStore
) -> Chunk
:
126 def getIcon() -> str |
None:
127 icon_div
= chunk_dom
.findChild('div', class_
='post-icon')
130 assert isinstance(icon_div
, Tag
)
131 icon_img
= icon_div
.findChild('img')
134 assert isinstance(icon_img
, Tag
)
135 return image_store
.get_image(icon_img
.attrs
['src'])
137 def getByClass(css_class
: str) -> Tag |
None:
138 tag
= chunk_dom
.findChild('div', class_
=css_class
)
139 assert tag
is None or isinstance(tag
, Tag
)
142 def stripHREF(tag
: Tag
) -> None:
143 for c
in tag
.findChildren("a"):
144 if "href" in c
.attrs
:
147 def getMeta(css_class
: str) -> Tag |
None:
148 tag
= getByClass(css_class
)
154 content
= chunk_dom
.findChild('div', class_
='post-content')
155 assert isinstance(content
, Tag
)
157 return Chunk(getIcon(),
158 getMeta('post-character'),
159 getMeta('post-screenname'),
160 getMeta('post-author'),
164 def renderChunk(texifier
: Texifier
, chunk
: Chunk
) -> bytes:
167 br
'\glowicon{%s}' % chunk
.icon
.encode('UTF-8') if chunk
.icon
else b
'',
169 texifier
.texify(chunk
.character
) if chunk
.character
else b
'',
171 texifier
.texify(chunk
.screen_name
) if chunk
.screen_name
else b
'',
173 texifier
.texify(chunk
.author
) if chunk
.author
else b
'',
175 texifier
.texify(chunk
.content
)])
178 ContentOnlyLayout
= br
'''
179 \newcommand{\glowhead}[4]{}
183 BelowIconLayout
= br
'''
184 \newcommand{\glowhead}[4]{\wrapstuffclear
187 \begin{varwidth}{0.5\textwidth}
188 \smash{\parbox[t][0pt]{0pt}{
189 \setlength{\fboxrule}{0.2pt}
190 \setlength{\fboxsep}{0pt}
192 \fbox{\hspace{107mm}}
197 {#1}{\\*}#2\ifnotempty
198 {#2}{\\*}#3\ifnotempty
210 # Why is \textwidth not the width of the text?
211 # Why is the width of the text .765\textwidth?
212 BesideIconLayout
= br
'''
213 \newcommand{\glowhead}[4]{
219 \parbox[b]{.765\textwidth}{
222 {#2}{\\*}#3\ifnotempty