git.scottworley.com Git - paperdoorknob/blame_incremental

... / ...

Commit	Line	Data
	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
	8	from dataclasses import dataclass
	9	import itertools
	10	from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
	11
	12	from typing import Any, Iterable
	13
	14	from bs4 import BeautifulSoup
	15	from bs4.element import Tag
	16
	17	from images import ImageStore
	18	from spec import Spec
	19	from texify import Texifier
	20
	21
	22	def ilen(it: Iterable[Any]) -> int:
	23	return sum(1 for _ in it)
	24
	25
	26	def _removeViewFromURL(url: str) -> str:
	27	u = urlparse(url)
	28	old_qs = parse_qsl(u.query)
	29	new_qs = [(k, v) for k, v in old_qs if k != 'view']
	30	return urlunparse(u._replace(query=urlencode(new_qs)))
	31
	32
	33	def nonFlatURL(url: str) -> str:
	34	return _removeViewFromURL(url)
	35
	36
	37	def flatURL(url: str) -> str:
	38	u = urlparse(_removeViewFromURL(url))
	39	qs = parse_qsl(u.query) + [('view', 'flat')]
	40	return urlunparse(u._replace(query=urlencode(qs)))
	41
	42
@dataclass(frozen=True)
class Chunk:
    """One unit of thread content: header metadata plus the content DOM.

    Instances are immutable (frozen dataclass).  In this file they are
    built by makeChunk() and consumed by renderChunk().
    """
    # Value returned by ImageStore.get_image() for the chunk's icon, or
    # None when the chunk has no icon <img>.  (Presumably a path or name
    # usable from LaTeX -- confirm against ImageStore.)
    icon: str | None
    # The 'post-character' <div>, or None when the chunk has none.
    character: Tag | None
    # The 'post-screenname' <div>, or None when the chunk has none.
    screen_name: Tag | None
    # The 'post-author' <div>, or None when the chunk has none.
    author: Tag | None
    # The 'post-content' <div>; always present.
    content: Tag

    # We avoid the name "post" because the Glowfic community uses the term
    # inconsistently:
    #  * The Glowfic software sometimes uses "post" to refer to a whole thread
    #    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
    #    but mostly uses "post" to refer to just the first chunk in a thread
    #    (in the HTML and UI).  The non-first chunks are "replies".
    #  * Readers and this software don't need to distinguish first-chunks and
    #    non-first-chunks.
    #  * Humans in the community tend to use "posts" to mean chunks.
	60
	61
	62	class Thread:
	63
	64	def __init__(self, spec: Spec) -> None:
	65	def find_next_thread(dom: BeautifulSoup) -> str \| None:
	66	for c in dom.findChildren('div', class_='post-navheader'):
	67	for a in c.findChildren('a'):
	68	if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
	69	a.attrs['href'], str):
	70	return urljoin(spec.url, a.attrs['href'])
	71	return None
	72
	73	spec.log('Fetching HTML...\r')
	74	html = spec.fetcher.fetch(spec.url)
	75	flat_html = spec.fetcher.fetch(flatURL(spec.url))
	76	spec.log('Parsing HTML...\r')
	77	self._next_thread = find_next_thread(
	78	BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
	79	self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
	80	self._spec = spec
	81
	82	def title(self) -> str \| None:
	83	span = self._dom.findChild("span", id="post-title")
	84	if not isinstance(span, Tag):
	85	return None
	86	return span.text.strip()
	87
	88	def next_thread(self) -> str \| None:
	89	return self._next_thread
	90
	91	def chunkDOMs(self) -> Iterable[Tag]:
	92	def text() -> Tag:
	93	body = self._dom.body
	94	assert body
	95	text = body.find_next("div", class_="post-post")
	96	assert isinstance(text, Tag)
	97	return text
	98
	99	def the_replies() -> Iterable[Tag]:
	100	rs = self._dom.find_all("div", class_="post-reply")
	101	assert all(isinstance(r, Tag) for r in rs)
	102	return rs
	103
	104	return itertools.chain([text()], the_replies())
	105
	106	def emit(self) -> None:
	107	self._spec.log('Counting chunks...\r')
	108	num_chunks = ilen(self.chunkDOMs())
	109	title = self.title() or "chunk"
	110	for i, r in enumerate(self.chunkDOMs()):
	111	percent = 100.0 * i / num_chunks
	112	self._spec.log(
	113	f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
	114	self._spec.domfilter(r)
	115	chunk = makeChunk(r, self._spec.images)
	116	self._spec.texout.write(
	117	self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
	118	self._spec.log('')
	119
	120
	121	def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
	122
	123	def getIcon() -> str \| None:
	124	icon_div = chunk_dom.findChild('div', class_='post-icon')
	125	if icon_div is None:
	126	return None
	127	assert isinstance(icon_div, Tag)
	128	icon_img = icon_div.findChild('img')
	129	if icon_img is None:
	130	return None
	131	assert isinstance(icon_img, Tag)
	132	return image_store.get_image(icon_img.attrs['src'])
	133
	134	def getByClass(css_class: str) -> Tag \| None:
	135	tag = chunk_dom.findChild('div', class_=css_class)
	136	assert tag is None or isinstance(tag, Tag)
	137	return tag
	138
	139	def stripHREF(tag: Tag) -> None:
	140	for c in tag.findChildren("a"):
	141	if "href" in c.attrs:
	142	del c.attrs["href"]
	143
	144	def getMeta(css_class: str) -> Tag \| None:
	145	tag = getByClass(css_class)
	146	if tag is None:
	147	return None
	148	stripHREF(tag)
	149	return tag
	150
	151	content = chunk_dom.findChild('div', class_='post-content')
	152	assert isinstance(content, Tag)
	153
	154	return Chunk(getIcon(),
	155	getMeta('post-character'),
	156	getMeta('post-screenname'),
	157	getMeta('post-author'),
	158	content)
	159
	160
	161	def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
	162	return b''.join([
	163	br'\glowhead{',
	164	br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
	165	b'}{',
	166	texifier.texify(chunk.character) if chunk.character else b'',
	167	b'}{',
	168	texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
	169	b'}{',
	170	texifier.texify(chunk.author) if chunk.author else b'',
	171	b'}',
	172	texifier.texify(chunk.content)])
	173
	174
# LaTeX definition of \glowhead for a layout that shows chunk content only.
# renderChunk() emits \glowhead{icon}{character}{screen name}{author}
# before each chunk's content; this definition takes those four arguments
# and expands to nothing, so no header is typeset.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''
	178
	179
# LaTeX definition of \glowhead that puts the header in a framed box set
# in a left-aligned `wrapstuff` environment.  Inside the box the four
# arguments (as emitted by renderChunk(): #1 icon, #2 character,
# #3 screen name, #4 author) are centered one after another; each of the
# first three is followed by a line break (\\*) only when it is non-empty.
# Assumes \wrapstuffclear, wrapstuff, varwidth and \ifnotempty are provided
# elsewhere in the generated document -- not visible in this file.
# NOTE(review): the purpose of the leading \smash{\parbox...} group is not
# evident from this file -- confirm before changing it.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
\smash{\parbox[t][0pt]{0pt}{
\setlength{\fboxrule}{0.2pt}
\setlength{\fboxsep}{0pt}
\vspace{-3.4pt}
\fbox{\hspace{107mm}}
}\\*}
\vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''
	205
	206
# LaTeX definition of \glowhead that puts the header in a single framed
# box: argument #1 (the icon, as emitted by renderChunk()) comes first,
# followed by a \parbox holding #2 character, #3 screen name and
# #4 author, centered; #2 and #3 are each followed by a line break (\\*)
# only when non-empty.
# Assumes \ifnotempty is provided elsewhere in the generated document --
# not visible in this file.
#
# The parbox width below is an empirically chosen fraction of \textwidth:
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
	221	\end{center}
	222	}
	223	}\\*
	224	\vspace{-0.75em}\\*
	225	\noindent}'''