git.scottworley.com Git - paperdoorknob/blame_incremental

... / ...

Commit	Line	Data
	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
	8	import dataclasses
	9	import itertools
	10	from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
	11
	12	from typing import Any, Iterable
	13
	14	from bs4 import BeautifulSoup
	15	from bs4.element import Tag
	16
	17	from images import ImageStore
	18	from spec import Spec
	19	from texify import Texifier
	20
	21
	22	def ilen(it: Iterable[Any]) -> int:
	23	return sum(1 for _ in it)
	24
	25
	26	def _removeViewFromURL(url: str) -> str:
	27	u = urlparse(url)
	28	old_qs = parse_qsl(u.query)
	29	new_qs = [(k, v) for k, v in old_qs if k != 'view']
	30	return urlunparse(u._replace(query=urlencode(new_qs)))
	31
	32
	33	def nonFlatURL(url: str) -> str:
	34	return _removeViewFromURL(url)
	35
	36
	37	def flatURL(url: str) -> str:
	38	u = urlparse(_removeViewFromURL(url))
	39	qs = parse_qsl(u.query) + [('view', 'flat')]
	40	return urlunparse(u._replace(query=urlencode(qs)))
	41
	42
@dataclasses.dataclass(frozen=True)
class Chunk:
    """One chunk of a thread: optional header metadata plus its content."""

    # Value returned by the image store for the chunk's icon, or None when
    # the chunk has no icon image.
    icon: str | None
    # The chunk's 'post-character' element, or None when absent.
    character: Tag | None
    # The chunk's 'post-screenname' element, or None when absent.
    screen_name: Tag | None
    # The chunk's 'post-author' element, or None when absent.
    author: Tag | None
    # The chunk's 'post-content' element; always present.
    content: Tag

    # We avoid the name "post" because the Glowfic community uses the term
    # inconsistently:
    #  * The Glowfic software sometimes uses "post" to refer to a whole thread
    #    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
    #    but mostly uses "post" to refer to just the first chunk in a thread
    #    (in the HTML and UI).  The non-first chunks are "replies".
    #  * Readers and this software don't need to distinguish first-chunks and
    #    non-first-chunks.
    #  * Humans in the community tend to use "posts" to mean chunks.
	60
	61
	62	class Thread:
	63
	64	def __init__(self, spec: Spec) -> None:
	65	def find_next_thread(dom: BeautifulSoup) -> str \| None:
	66	for c in dom.findChildren('div', class_='post-navheader'):
	67	for a in c.findChildren('a'):
	68	if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
	69	a.attrs['href'], str):
	70	return urljoin(spec.url, a.attrs['href'])
	71	return None
	72
	73	spec.log('Fetching HTML...\r')
	74	html = spec.fetcher.fetch(spec.url)
	75	flat_html = spec.fetcher.fetch(flatURL(spec.url))
	76	spec.log('Parsing HTML...\r')
	77	self._next_thread = find_next_thread(
	78	BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
	79	self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
	80	self._spec = spec
	81
	82	def title(self) -> str \| None:
	83	span = self._dom.findChild("span", id="post-title")
	84	if not isinstance(span, Tag):
	85	return None
	86	return span.text.strip()
	87
	88	def next_thread(self) -> str \| None:
	89	return self._next_thread
	90
	91	def chunkDOMs(self) -> Iterable[Tag]:
	92	def text() -> Tag:
	93	body = self._dom.body
	94	assert body
	95	text = body.find_next("div", class_="post-post")
	96	assert isinstance(text, Tag)
	97	return text
	98
	99	def the_replies() -> Iterable[Tag]:
	100	rs = self._dom.find_all("div", class_="post-reply")
	101	assert all(isinstance(r, Tag) for r in rs)
	102	return rs
	103
	104	return itertools.chain([text()], the_replies())
	105
	106	def emit(self) -> None:
	107	self._spec.log('Counting chunks...\r')
	108	num_chunks = ilen(self.chunkDOMs())
	109	title = self.title() or "chunk"
	110	for i, r in enumerate(self.chunkDOMs()):
	111	percent = 100.0 * i / num_chunks
	112	self._spec.log(
	113	f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
	114	self._spec.domfilter(r)
	115	chunk = makeChunk(r, self._spec.images)
	116	self._spec.texout.write(
	117	self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
	118	self._spec.log('')
	119	next_url = self.next_thread()
	120	if next_url is not None:
	121	Thread(dataclasses.replace(self._spec, url=next_url)).emit()
	122
	123
	124	def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
	125
	126	def getIcon() -> str \| None:
	127	icon_div = chunk_dom.findChild('div', class_='post-icon')
	128	if icon_div is None:
	129	return None
	130	assert isinstance(icon_div, Tag)
	131	icon_img = icon_div.findChild('img')
	132	if icon_img is None:
	133	return None
	134	assert isinstance(icon_img, Tag)
	135	return image_store.get_image(icon_img.attrs['src'])
	136
	137	def getByClass(css_class: str) -> Tag \| None:
	138	tag = chunk_dom.findChild('div', class_=css_class)
	139	assert tag is None or isinstance(tag, Tag)
	140	return tag
	141
	142	def stripHREF(tag: Tag) -> None:
	143	for c in tag.findChildren("a"):
	144	if "href" in c.attrs:
	145	del c.attrs["href"]
	146
	147	def getMeta(css_class: str) -> Tag \| None:
	148	tag = getByClass(css_class)
	149	if tag is None:
	150	return None
	151	stripHREF(tag)
	152	return tag
	153
	154	content = chunk_dom.findChild('div', class_='post-content')
	155	assert isinstance(content, Tag)
	156
	157	return Chunk(getIcon(),
	158	getMeta('post-character'),
	159	getMeta('post-screenname'),
	160	getMeta('post-author'),
	161	content)
	162
	163
	164	def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
	165	return b''.join([
	166	br'\glowhead{',
	167	br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
	168	b'}{',
	169	texifier.texify(chunk.character) if chunk.character else b'',
	170	b'}{',
	171	texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
	172	b'}{',
	173	texifier.texify(chunk.author) if chunk.author else b'',
	174	b'}',
	175	texifier.texify(chunk.content)])
	176
	177
# TeX definition of \glowhead for a content-only layout.  The command takes
# the four arguments renderChunk passes (icon, character, screen name,
# author) and expands to nothing, so only the chunk content that follows
# the \glowhead call is typeset.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
	181
	182
# TeX definition of \glowhead that places all four arguments (#1 icon,
# #2 character, #3 screen name, #4 author, in the order renderChunk passes
# them) in one centered column inside a framed box, set in a left-hand
# `wrapstuff` environment.  Each of #1-#3 is followed by a line break only
# when it is non-empty (\ifnotempty).
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
\smash{\parbox[t][0pt]{0pt}{
\setlength{\fboxrule}{0.2pt}
\setlength{\fboxsep}{0pt}
\vspace{-3.4pt}
\fbox{\hspace{107mm}}
}\\*}
\vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
	208
	209
# TeX definition of \glowhead that sets #1 (the icon, as passed by
# renderChunk) next to a bottom-aligned \parbox holding #2-#4 (character,
# screen name, author) centered, all inside one framed box.  #2 and #3 are
# followed by a line break only when non-empty (\ifnotempty).
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
	224	\end{center}
	225	}
	226	}\\*
	227	\vspace{-0.75em}\\*
	228	\noindent}'''