[paperdoorknob] / glowfic.py

# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


import dataclasses
import itertools
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse

from typing import Any, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag

from images import ImageStore
from spec import Spec
from texify import Texifier


def ilen(it: Iterable[Any]) -> int:
    """Return the number of items in *it*.

    The iterable is consumed in the process, so a one-shot iterator is
    exhausted afterwards.
    """
    count = 0
    for count, _ in enumerate(it, start=1):
        pass
    return count


def _removeViewFromURL(url: str) -> str:
    """Return *url* with every ``view`` query parameter removed.

    All other parts of the URL, and all other query parameters (in their
    original order), are kept.
    """
    u = urlparse(url)
    # keep_blank_values=True: by default parse_qsl silently discards
    # parameters whose value is empty (e.g. "?a="), which would make this
    # function drop more than just "view".
    kept = [(k, v)
            for k, v in parse_qsl(u.query, keep_blank_values=True)
            if k != 'view']
    return urlunparse(u._replace(query=urlencode(kept)))


def nonFlatURL(url: str) -> str:
    """Return *url* without any ``view`` query parameter."""
    return _removeViewFromURL(url)


def flatURL(url: str) -> str:
    """Return *url* with its ``view`` query parameter set to ``flat``.

    Any existing ``view`` parameter is removed first, and ``view=flat`` is
    appended after the remaining parameters.
    """
    u = urlparse(_removeViewFromURL(url))
    # Same reasoning as above: do not lose empty-valued parameters while
    # round-tripping the query string.
    qs = parse_qsl(u.query, keep_blank_values=True) + [('view', 'flat')]
    return urlunparse(u._replace(query=urlencode(qs)))


@dataclasses.dataclass(frozen=True)
class Chunk:
    """The pieces of one chunk of a thread, as extracted by makeChunk."""
    # Result of ImageStore.get_image() for the chunk's icon <img>, or None
    # when the chunk has no icon.  renderChunk passes it to \glowicon.
    icon: str | None
    # The 'post-character' div, or None if the chunk has none.
    character: Tag | None
    # The 'post-screenname' div, or None if the chunk has none.
    screen_name: Tag | None
    # The 'post-author' div, or None if the chunk has none.
    author: Tag | None
    # The 'post-content' div; always present.
    content: Tag

# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#  * The Glowfic software sometimes uses "post" to refer to a whole thread
#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#    but mostly uses "post" to refer to just the first chunk in a thread
#    (in the HTML and UI).  The non-first chunks are "replies".
#  * Readers and this software don't need to distinguish first-chunks and
#    non-first-chunks.
#  * Humans in the community tend to use "posts" to mean chunks.


class Thread:
    """One Glowfic thread, fetched and parsed on construction.

    emit() writes the thread's chunks as TeX and then continues with any
    thread reachable through the page's "Next Post" link.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse the thread at spec.url.

        Two pages are fetched through spec.fetcher: spec.url as given,
        which is only searched for the "Next Post" link, and
        flatURL(spec.url), which supplies the title and the chunks.  Both
        are passed through spec.htmlfilter before parsing.
        """
        def find_next_thread(dom: BeautifulSoup) -> str | None:
            # First <a> inside a 'post-navheader' div whose text mentions
            # 'Next Post' and which carries a string href, made absolute
            # relative to spec.url.
            for c in dom.find_all('div', class_='post-navheader'):
                for a in c.find_all('a'):
                    if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
                            a.attrs['href'], str):
                        return urljoin(spec.url, a.attrs['href'])
            return None

        spec.log('Fetching HTML...\r')
        html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))
        spec.log('Parsing HTML...\r')
        self._next_thread = find_next_thread(
            BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
        self._spec = spec

    def title(self) -> str | None:
        """Return the stripped text of span#post-title, or None if absent."""
        span = self._dom.find("span", id="post-title")
        if not isinstance(span, Tag):
            return None
        return span.text.strip()

    def next_thread(self) -> str | None:
        """Return the absolute URL of the next thread, or None."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the thread's chunks in order.

        The first item is the first 'post-post' div after <body>; it is
        followed by every 'post-reply' div of the flat page.  Asserts that
        the page has a body and a 'post-post' div.
        """
        def text() -> Tag:
            body = self._dom.body
            assert body
            text = body.find_next("div", class_="post-post")
            assert isinstance(text, Tag)
            return text

        def the_replies() -> Iterable[Tag]:
            rs = self._dom.find_all("div", class_="post-reply")
            assert all(isinstance(r, Tag) for r in rs)
            return rs

        return itertools.chain([text()], the_replies())

    def _emit_chunks(self) -> None:
        """Write the TeX for every chunk of this one thread to spec.texout."""
        self._spec.log('Counting chunks...\r')
        # Collect the chunks once; the same list serves for the count and
        # for the processing loop, so the DOM is only searched one time.
        chunks = list(self.chunkDOMs())
        num_chunks = len(chunks)
        title = self.title() or "chunk"
        for i, r in enumerate(chunks, start=1):
            percent = 100.0 * i / num_chunks
            self._spec.log(
                f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r')
            # domfilter works on the chunk's DOM in place; its return value
            # is not used.
            self._spec.domfilter(r)
            chunk = makeChunk(r, self._spec.images)
            self._spec.texout.write(
                self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
        self._spec.log('')

    def emit(self) -> None:
        """Write this thread, then every following thread, to spec.texout.

        Following threads are found through next_thread() and are built
        from a copy of the spec whose url is replaced by the next URL.
        """
        # A loop rather than recursion: a long series then neither nests
        # one stack frame per thread nor keeps the parsed DOM of every
        # already-emitted thread referenced until the last one finishes.
        thread = self
        while True:
            thread._emit_chunks()
            next_url = thread.next_thread()
            if next_url is None:
                return
            thread = Thread(dataclasses.replace(thread._spec, url=next_url))


def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Extract a Chunk from the DOM of one post or reply.

    The icon is looked up through image_store; character, screen name and
    author are the 'post-character', 'post-screenname' and 'post-author'
    divs (None when missing); content is the 'post-content' div.

    Side effect: the href attribute is removed from every <a> inside the
    character, screen-name and author divs, so chunk_dom is modified.

    Asserts that a 'post-content' div is present.
    """

    def getIcon() -> str | None:
        icon_div = chunk_dom.find('div', class_='post-icon')
        if icon_div is None:
            return None
        assert isinstance(icon_div, Tag)
        icon_img = icon_div.find('img')
        if icon_img is None:
            return None
        assert isinstance(icon_img, Tag)
        # An <img> without a usable src gives nothing to fetch; treat it
        # like a missing icon rather than failing with KeyError.
        src = icon_img.attrs.get('src')
        if not isinstance(src, str):
            return None
        return image_store.get_image(src)

    def getByClass(css_class: str) -> Tag | None:
        tag = chunk_dom.find('div', class_=css_class)
        assert tag is None or isinstance(tag, Tag)
        return tag

    def stripHREF(tag: Tag) -> None:
        # Drop link targets but keep the <a> elements and their text.
        for c in tag.find_all("a"):
            c.attrs.pop("href", None)

    def getMeta(css_class: str) -> Tag | None:
        tag = getByClass(css_class)
        if tag is None:
            return None
        stripHREF(tag)
        return tag

    content = chunk_dom.find('div', class_='post-content')
    assert isinstance(content, Tag)

    return Chunk(getIcon(),
                 getMeta('post-character'),
                 getMeta('post-screenname'),
                 getMeta('post-author'),
                 content)


def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Render one chunk as TeX.

    The output is ``\glowhead{icon}{character}{screen name}{author}``
    immediately followed by the texified content.  The icon argument is
    ``\glowicon{...}`` around the UTF-8 encoded icon string; any header
    part the chunk lacks becomes an empty argument.
    """
    if chunk.icon:
        icon_tex = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon_tex = b''

    # Texify the header parts in the order they appear in the output.
    meta_tex = [
        texifier.texify(tag) if tag else b''
        for tag in (chunk.character, chunk.screen_name, chunk.author)
    ]
    body_tex = texifier.texify(chunk.content)

    pieces = [br'\glowhead{', icon_tex]
    for arg in meta_tex:
        pieces += [b'}{', arg]
    pieces += [b'}', body_tex]
    return b''.join(pieces)


# LaTeX preamble fragment: defines \glowhead as a four-argument command with
# an empty body, so the header that renderChunk emits before each chunk's
# content (icon, character, screen name, author) produces no output.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''


# LaTeX preamble fragment: defines \glowhead so that its four arguments
# (as emitted by renderChunk: icon, character, screen name, author) are
# stacked, centered, in a framed varwidth box of at most 0.5\textwidth that
# sits in a left-side wrapstuff environment.  A \\* line break follows each
# of the first three arguments under \ifnotempty -- presumably only when
# that argument is non-empty; \ifnotempty, wrapstuff and varwidth are not
# defined in this file.
# NOTE(review): the \smash{\parbox...} at the top draws a thin-ruled,
# 107mm-wide zero-height box; its purpose is not stated here -- confirm.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''


# LaTeX preamble fragment: defines \glowhead so that its first argument
# (the icon, as emitted by renderChunk) is placed in a frame next to a
# bottom-aligned \parbox that holds arguments two to four (character,
# screen name, author) centered, with a \\* line break after the second and
# third under \ifnotempty -- presumably only when that argument is
# non-empty; \ifnotempty is not defined in this file.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
Commit	Line	Data
e6adf6ce SW	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
6f430a74	8	import dataclasses
e6adf6ce	9	import itertools
fc2aaa75	10	from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
e6adf6ce	11
24b87bad	12	from typing import Any, Iterable
e6adf6ce SW	13
	14	from bs4 import BeautifulSoup
	15	from bs4.element import Tag
	16
aa060d9b	17	from images import ImageStore
70adfbff	18	from spec import Spec
d2a41ff4	19	from texify import Texifier
aa060d9b SW	20
aa060d9b SW	21
24b87bad SW	22	def ilen(it: Iterable[Any]) -> int:
	23	return sum(1 for _ in it)
	24
	25
1452f8d3 SW	26	def _removeViewFromURL(url: str) -> str:
	27	u = urlparse(url)
	28	old_qs = parse_qsl(u.query)
	29	new_qs = [(k, v) for k, v in old_qs if k != 'view']
	30	return urlunparse(u._replace(query=urlencode(new_qs)))
	31
	32
	33	def nonFlatURL(url: str) -> str:
	34	return _removeViewFromURL(url)
	35
	36
	37	def flatURL(url: str) -> str:
	38	u = urlparse(_removeViewFromURL(url))
	39	qs = parse_qsl(u.query) + [('view', 'flat')]
	40	return urlunparse(u._replace(query=urlencode(qs)))
	41
	42
6f430a74	43	@dataclasses.dataclass(frozen=True)
aa060d9b SW	44	class Chunk:
aa060d9b SW	45	icon: str \| None
37c47bc2 SW	46	character: Tag \| None
	47	screen_name: Tag \| None
	48	author: Tag \| None
aa060d9b SW	49	content: Tag
aa060d9b SW	50
e6adf6ce SW	51	# We avoid the name "post" because the Glowfic community uses the term
	52	# inconsistently:
	53	# * The Glowfic software sometimes uses "post" to refer to a whole thread
aa060d9b SW	54	# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
	55	# but mostly uses "post" to refer to just the first chunk in a thread
	56	# (in the HTML and UI). The non-first chunks are "replies".
e6adf6ce SW	57	# * Readers and this software don't need to distinguish first-chunks and
e6adf6ce SW	58	# non-first-chunks.
aa060d9b	59	# * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce SW	60
e6adf6ce SW	61
94027099 SW	62	class Thread:
94027099 SW	63
19ed28f3	64	def __init__(self, spec: Spec) -> None:
53f396b4 SW	65	def find_next_thread(dom: BeautifulSoup) -> str \| None:
	66	for c in dom.findChildren('div', class_='post-navheader'):
	67	for a in c.findChildren('a'):
	68	if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
	69	a.attrs['href'], str):
fc2aaa75	70	return urljoin(spec.url, a.attrs['href'])
53f396b4 SW	71	return None
53f396b4 SW	72
19ed28f3	73	spec.log('Fetching HTML...\r')
53f396b4 SW	74	html = spec.fetcher.fetch(spec.url)
53f396b4 SW	75	flat_html = spec.fetcher.fetch(flatURL(spec.url))
19ed28f3	76	spec.log('Parsing HTML...\r')
53f396b4 SW	77	self._next_thread = find_next_thread(
	78	BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
	79	self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
24b87bad	80	self._spec = spec
94027099	81
21e82200 SW	82	def title(self) -> str \| None:
	83	span = self._dom.findChild("span", id="post-title")
	84	if not isinstance(span, Tag):
	85	return None
	86	return span.text.strip()
	87
53f396b4 SW	88	def next_thread(self) -> str \| None:
	89	return self._next_thread
	90
94027099 SW	91	def chunkDOMs(self) -> Iterable[Tag]:
94027099 SW	92	def text() -> Tag:
a18519bf	93	body = self._dom.body
94027099 SW	94	assert body
	95	text = body.find_next("div", class_="post-post")
	96	assert isinstance(text, Tag)
	97	return text
	98
	99	def the_replies() -> Iterable[Tag]:
a18519bf	100	rs = self._dom.find_all("div", class_="post-reply")
94027099 SW	101	assert all(isinstance(r, Tag) for r in rs)
	102	return rs
	103
	104	return itertools.chain([text()], the_replies())
aa060d9b	105
24b87bad SW	106	def emit(self) -> None:
	107	self._spec.log('Counting chunks...\r')
	108	num_chunks = ilen(self.chunkDOMs())
	109	title = self.title() or "chunk"
	110	for i, r in enumerate(self.chunkDOMs()):
f74e5779	111	percent = 100.0 * (i + 1) / num_chunks
24b87bad	112	self._spec.log(
f74e5779	113	f'Processing {title} {i+1} of {num_chunks} ({percent:.1f}%)\r')
24b87bad SW	114	self._spec.domfilter(r)
	115	chunk = makeChunk(r, self._spec.images)
	116	self._spec.texout.write(
	117	self._spec.texfilter(renderChunk(self._spec.texifier, chunk)))
	118	self._spec.log('')
6f430a74 SW	119	next_url = self.next_thread()
	120	if next_url is not None:
	121	Thread(dataclasses.replace(self._spec, url=next_url)).emit()
24b87bad	122
aa060d9b SW	123
	124	def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
	125
	126	def getIcon() -> str \| None:
551bb1c9	127	icon_div = chunk_dom.findChild('div', class_='post-icon')
aa060d9b SW	128	if icon_div is None:
aa060d9b SW	129	return None
551bb1c9 SW	130	assert isinstance(icon_div, Tag)
551bb1c9 SW	131	icon_img = icon_div.findChild('img')
aa060d9b SW	132	if icon_img is None:
	133	return None
	134	assert isinstance(icon_img, Tag)
	135	return image_store.get_image(icon_img.attrs['src'])
	136
37c47bc2	137	def getByClass(css_class: str) -> Tag \| None:
551bb1c9	138	tag = chunk_dom.findChild('div', class_=css_class)
37c47bc2 SW	139	assert tag is None or isinstance(tag, Tag)
37c47bc2 SW	140	return tag
aa060d9b	141
62043b2b SW	142	def stripHREF(tag: Tag) -> None:
	143	for c in tag.findChildren("a"):
	144	if "href" in c.attrs:
	145	del c.attrs["href"]
	146
	147	def getMeta(css_class: str) -> Tag \| None:
	148	tag = getByClass(css_class)
	149	if tag is None:
	150	return None
	151	stripHREF(tag)
	152	return tag
	153
551bb1c9	154	content = chunk_dom.findChild('div', class_='post-content')
aa060d9b SW	155	assert isinstance(content, Tag)
	156
	157	return Chunk(getIcon(),
62043b2b SW	158	getMeta('post-character'),
	159	getMeta('post-screenname'),
	160	getMeta('post-author'),
aa060d9b	161	content)
d2a41ff4 SW	162
d2a41ff4 SW	163
1fac41bf SW	164	def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
	165	return b''.join([
	166	br'\glowhead{',
	167	br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
	168	b'}{',
	169	texifier.texify(chunk.character) if chunk.character else b'',
	170	b'}{',
	171	texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
	172	b'}{',
	173	texifier.texify(chunk.author) if chunk.author else b'',
	174	b'}',
	175	texifier.texify(chunk.content)])
d2a41ff4	176
d2a41ff4	177
1fac41bf SW	178	ContentOnlyLayout = br'''
	179	\newcommand{\glowhead}[4]{}
	180	'''
d2a41ff4	181
d2a41ff4	182
1fac41bf SW	183	BelowIconLayout = br'''
1fac41bf SW	184	\newcommand{\glowhead}[4]{\wrapstuffclear
67612898 SW	185	\begin{wrapstuff}[l]
	186	\fbox{
	187	\begin{varwidth}{0.5\textwidth}
	188	\smash{\parbox[t][0pt]{0pt}{
	189	\setlength{\fboxrule}{0.2pt}
	190	\setlength{\fboxsep}{0pt}
	191	\vspace{-3.4pt}
	192	\fbox{\hspace{107mm}}
	193	}\\*}
	194	\vspace{-1em}
	195	\begin{center}
1fac41bf SW	196	#1\ifnotempty
	197	{#1}{\\*}#2\ifnotempty
	198	{#2}{\\*}#3\ifnotempty
	199	{#3}{\\*}#4
67612898 SW	200	\end{center}
67612898 SW	201	\end{varwidth}
23dabdf5	202	}
67612898	203	\end{wrapstuff}
23dabdf5	204
67612898	205	\strut
16385131	206
1fac41bf	207	\noindent}'''
f75c1629 SW	208
f75c1629 SW	209
1fac41bf SW	210	# Why is \textwidth not the width of the text?
	211	# Why is the width of the text .765\textwidth?
	212	BesideIconLayout = br'''
	213	\newcommand{\glowhead}[4]{
f75c1629	214
1fac41bf	215	\strut
f75c1629	216
1fac41bf SW	217	\noindent\fbox{
1fac41bf SW	218	#1
67612898 SW	219	\parbox[b]{.765\textwidth}{
67612898 SW	220	\begin{center}
1fac41bf SW	221	#2\ifnotempty
	222	{#2}{\\*}#3\ifnotempty
	223	{#3}{\\*}#4
67612898	224	\end{center}
f75c1629	225	}
67612898 SW	226	}\\*
67612898 SW	227	\vspace{-0.75em}\\*
1fac41bf	228	\noindent}'''