[paperdoorknob] / glowfic.py

# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from dataclasses import dataclass
import itertools
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse

from typing import Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag

from images import ImageStore
from spec import Spec
from texify import Texifier


def _removeViewFromURL(url: str) -> str:
    """Return `url` with every `view` query parameter removed.

    All other query parameters are kept in their original order,
    including parameters whose value is empty (`?a=`).  Scheme, host,
    path and fragment are left untouched.
    """
    parts = urlparse(url)
    # keep_blank_values=True: parse_qsl drops empty-valued parameters by
    # default, which would silently discard parameters other than `view`.
    kept = [(key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != 'view']
    return urlunparse(parts._replace(query=urlencode(kept)))


def nonFlatURL(url: str) -> str:
    """Return `url` without any `view` query parameter."""
    return _removeViewFromURL(url)


def flatURL(url: str) -> str:
    """Return `url` with its `view` query parameter forced to `flat`.

    Any existing `view` parameter is removed first, then `view=flat` is
    appended after the remaining query parameters.
    """
    parts = urlparse(_removeViewFromURL(url))
    # Same blank-value handling as _removeViewFromURL, so that re-parsing
    # its output does not lose empty-valued parameters either.
    query = parse_qsl(parts.query, keep_blank_values=True)
    query.append(('view', 'flat'))
    return urlunparse(parts._replace(query=urlencode(query)))


@dataclass(frozen=True)
class Chunk:
    """The extracted parts of one chunk of a thread.

    Instances are immutable (frozen dataclass).  makeChunk builds them
    from a chunk's DOM and renderChunk turns them into TeX.  Every field
    except `content` is None when the corresponding element was not
    found in the chunk.
    """
    # Value returned by the image store for the icon's `src`; renderChunk
    # places it inside \glowicon{...}.
    icon: str | None
    # The chunk's `post-character` div (links' href attributes removed).
    character: Tag | None
    # The chunk's `post-screenname` div (links' href attributes removed).
    screen_name: Tag | None
    # The chunk's `post-author` div (links' href attributes removed).
    author: Tag | None
    # The chunk's `post-content` div; always present.
    content: Tag

# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#  * The Glowfic software sometimes uses "post" to refer to a whole thread
#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#    but mostly uses "post" to refer to just the first chunk in a thread
#    (in the HTML and UI).  The non-first chunks are "replies".
#  * Readers and this software don't need to distinguish first-chunks and
#    non-first-chunks.
#  * Humans in the community tend to use "posts" to mean chunks.


class Thread:
    """A thread fetched from `spec.url` and parsed at construction time.

    Two pages are downloaded: the page at `spec.url` itself, which is
    only searched for a "Next Post" link, and the flat view of the same
    URL, whose DOM supplies the title and the chunks.
    """

    def __init__(self, spec: Spec) -> None:
        """Fetch and parse both views of the thread described by `spec`.

        Uses `spec.fetcher` to download, `spec.htmlfilter` to preprocess
        the HTML before parsing, and `spec.log` for progress messages.
        """

        def locate_next(page: BeautifulSoup) -> str | None:
            # The first link inside a nav header whose text mentions
            # "Next Post" and whose href is a plain string wins; the href
            # is resolved against the thread's own URL.
            for header in page.findChildren('div', class_='post-navheader'):
                for link in header.findChildren('a'):
                    if 'Next Post' not in link.text:
                        continue
                    target = link.attrs.get('href')
                    if isinstance(target, str):
                        return urljoin(spec.url, target)
            return None

        spec.log('Fetching HTML...\r')
        paged_html = spec.fetcher.fetch(spec.url)
        flat_html = spec.fetcher.fetch(flatURL(spec.url))

        spec.log('Parsing HTML...\r')
        paged_dom = BeautifulSoup(spec.htmlfilter(paged_html), 'html.parser')
        self._next_thread = locate_next(paged_dom)
        self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')

    def title(self) -> str | None:
        """Return the stripped text of the `post-title` span, or None."""
        span = self._dom.findChild("span", id="post-title")
        return span.text.strip() if isinstance(span, Tag) else None

    def next_thread(self) -> str | None:
        """Return the absolute URL of the "Next Post" link, or None."""
        return self._next_thread

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the DOM of every chunk: the first chunk, then the replies.

        The first chunk is the first `post-post` div after `<body>`; the
        replies are all `post-reply` divs in document order.  Asserts that
        the body and the first chunk exist and that every result is a Tag.
        """
        body = self._dom.body
        assert body
        first_chunk = body.find_next("div", class_="post-post")
        assert isinstance(first_chunk, Tag)

        replies = self._dom.find_all("div", class_="post-reply")
        assert all(isinstance(reply, Tag) for reply in replies)

        return itertools.chain([first_chunk], replies)


def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Extract icon, metadata and content from one chunk's DOM.

    The icon's `src` is handed to `image_store.get_image`.  The
    character, screen-name and author divs have the `href` attribute
    removed from every link inside them; this modifies `chunk_dom` in
    place.  Missing icon or metadata elements become None, while a
    missing `post-content` div fails an assertion.
    """

    def find_div(css_class: str) -> Tag | None:
        found = chunk_dom.findChild('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        return found

    def icon() -> str | None:
        holder = find_div('post-icon')
        if holder is None:
            return None
        img = holder.findChild('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        div = find_div(css_class)
        if div is not None:
            # Drop link targets from the metadata.
            for anchor in div.findChildren("a"):
                anchor.attrs.pop("href", None)
        return div

    # The content div is looked up first, so a chunk without one fails
    # before the image store is consulted.
    body = chunk_dom.findChild('div', class_='post-content')
    assert isinstance(body, Tag)

    return Chunk(
        icon=icon(),
        character=meta('post-character'),
        screen_name=meta('post-screenname'),
        author=meta('post-author'),
        content=body,
    )


def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    """Render `chunk` as TeX: a four-argument glowhead call, then the content.

    The four arguments are, in order, the icon (wrapped in a glowicon
    call), the character, the screen name and the author.  A missing
    field yields an empty argument.
    """

    def tex_or_empty(tag: Tag | None) -> bytes:
        return texifier.texify(tag) if tag else b''

    if chunk.icon:
        icon = br'\glowicon{%s}' % chunk.icon.encode('UTF-8')
    else:
        icon = b''

    # Fields are texified in the same order as they appear in the output.
    arguments = [
        icon,
        tex_or_empty(chunk.character),
        tex_or_empty(chunk.screen_name),
        tex_or_empty(chunk.author),
    ]
    header = br'\glowhead{' + b'}{'.join(arguments) + b'}'
    return header + texifier.texify(chunk.content)


# LaTeX definition of \glowhead for a layout without chunk headers: the
# command takes the four arguments emitted by renderChunk (icon, character,
# screen name, author) and expands to nothing.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''


# LaTeX definition of \glowhead that stacks the header vertically: the four
# arguments (icon, character, screen name, author, as emitted by
# renderChunk) are centered inside a framed varwidth box of at most
# 0.5\textwidth, each non-empty argument but the last followed by a line
# break (\ifnotempty ... \\*).  The box is placed in a left-hand `wrapstuff`
# environment, and the definition ends with \noindent ahead of the chunk
# content.
# NOTE(review): the \smash{\parbox...} group with a zero-\fboxsep \fbox
# around \hspace{107mm} looks like it draws a thin 107mm horizontal rule --
# confirm against the rendered output.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''


# LaTeX definition of \glowhead that puts the header in one framed box
# above the chunk content: argument #1 (the icon) is followed by a
# bottom-aligned \parbox holding the remaining three arguments (character,
# screen name, author), centered, with a line break after each non-empty
# one but the last.  The definition ends with \noindent ahead of the chunk
# content.
#
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
Commit	Line	Data
e6adf6ce SW	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
aa060d9b	8	from dataclasses import dataclass
e6adf6ce	9	import itertools
fc2aaa75	10	from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
e6adf6ce SW	11
	12	from typing import Iterable
	13
	14	from bs4 import BeautifulSoup
	15	from bs4.element import Tag
	16
aa060d9b	17	from images import ImageStore
70adfbff	18	from spec import Spec
d2a41ff4	19	from texify import Texifier
aa060d9b SW	20
aa060d9b SW	21
1452f8d3 SW	22	def _removeViewFromURL(url: str) -> str:
	23	u = urlparse(url)
	24	old_qs = parse_qsl(u.query)
	25	new_qs = [(k, v) for k, v in old_qs if k != 'view']
	26	return urlunparse(u._replace(query=urlencode(new_qs)))
	27
	28
	29	def nonFlatURL(url: str) -> str:
	30	return _removeViewFromURL(url)
	31
	32
	33	def flatURL(url: str) -> str:
	34	u = urlparse(_removeViewFromURL(url))
	35	qs = parse_qsl(u.query) + [('view', 'flat')]
	36	return urlunparse(u._replace(query=urlencode(qs)))
	37
	38
aa060d9b SW	39	@dataclass(frozen=True)
	40	class Chunk:
	41	icon: str \| None
37c47bc2 SW	42	character: Tag \| None
	43	screen_name: Tag \| None
	44	author: Tag \| None
aa060d9b SW	45	content: Tag
aa060d9b SW	46
e6adf6ce SW	47	# We avoid the name "post" because the Glowfic community uses the term
	48	# inconsistently:
	49	# * The Glowfic software sometimes uses "post" to refer to a whole thread
aa060d9b SW	50	# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
	51	# but mostly uses "post" to refer to just the first chunk in a thread
	52	# (in the HTML and UI). The non-first chunks are "replies".
e6adf6ce SW	53	# * Readers and this software don't need to distinguish first-chunks and
e6adf6ce SW	54	# non-first-chunks.
aa060d9b	55	# * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce SW	56
e6adf6ce SW	57
94027099 SW	58	class Thread:
94027099 SW	59
19ed28f3	60	def __init__(self, spec: Spec) -> None:
53f396b4 SW	61	def find_next_thread(dom: BeautifulSoup) -> str \| None:
	62	for c in dom.findChildren('div', class_='post-navheader'):
	63	for a in c.findChildren('a'):
	64	if 'Next Post' in a.text and 'href' in a.attrs and isinstance(
	65	a.attrs['href'], str):
fc2aaa75	66	return urljoin(spec.url, a.attrs['href'])
53f396b4 SW	67	return None
53f396b4 SW	68
19ed28f3	69	spec.log('Fetching HTML...\r')
53f396b4 SW	70	html = spec.fetcher.fetch(spec.url)
53f396b4 SW	71	flat_html = spec.fetcher.fetch(flatURL(spec.url))
19ed28f3	72	spec.log('Parsing HTML...\r')
53f396b4 SW	73	self._next_thread = find_next_thread(
	74	BeautifulSoup(spec.htmlfilter(html), 'html.parser'))
	75	self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser')
94027099	76
21e82200 SW	77	def title(self) -> str \| None:
	78	span = self._dom.findChild("span", id="post-title")
	79	if not isinstance(span, Tag):
	80	return None
	81	return span.text.strip()
	82
53f396b4 SW	83	def next_thread(self) -> str \| None:
	84	return self._next_thread
	85
94027099 SW	86	def chunkDOMs(self) -> Iterable[Tag]:
94027099 SW	87	def text() -> Tag:
a18519bf	88	body = self._dom.body
94027099 SW	89	assert body
	90	text = body.find_next("div", class_="post-post")
	91	assert isinstance(text, Tag)
	92	return text
	93
	94	def the_replies() -> Iterable[Tag]:
a18519bf	95	rs = self._dom.find_all("div", class_="post-reply")
94027099 SW	96	assert all(isinstance(r, Tag) for r in rs)
	97	return rs
	98
	99	return itertools.chain([text()], the_replies())
aa060d9b SW	100
	101
	102	def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
	103
	104	def getIcon() -> str \| None:
551bb1c9	105	icon_div = chunk_dom.findChild('div', class_='post-icon')
aa060d9b SW	106	if icon_div is None:
aa060d9b SW	107	return None
551bb1c9 SW	108	assert isinstance(icon_div, Tag)
551bb1c9 SW	109	icon_img = icon_div.findChild('img')
aa060d9b SW	110	if icon_img is None:
	111	return None
	112	assert isinstance(icon_img, Tag)
	113	return image_store.get_image(icon_img.attrs['src'])
	114
37c47bc2	115	def getByClass(css_class: str) -> Tag \| None:
551bb1c9	116	tag = chunk_dom.findChild('div', class_=css_class)
37c47bc2 SW	117	assert tag is None or isinstance(tag, Tag)
37c47bc2 SW	118	return tag
aa060d9b	119
62043b2b SW	120	def stripHREF(tag: Tag) -> None:
	121	for c in tag.findChildren("a"):
	122	if "href" in c.attrs:
	123	del c.attrs["href"]
	124
	125	def getMeta(css_class: str) -> Tag \| None:
	126	tag = getByClass(css_class)
	127	if tag is None:
	128	return None
	129	stripHREF(tag)
	130	return tag
	131
551bb1c9	132	content = chunk_dom.findChild('div', class_='post-content')
aa060d9b SW	133	assert isinstance(content, Tag)
	134
	135	return Chunk(getIcon(),
62043b2b SW	136	getMeta('post-character'),
	137	getMeta('post-screenname'),
	138	getMeta('post-author'),
aa060d9b	139	content)
d2a41ff4 SW	140
d2a41ff4 SW	141
1fac41bf SW	142	def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
	143	return b''.join([
	144	br'\glowhead{',
	145	br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
	146	b'}{',
	147	texifier.texify(chunk.character) if chunk.character else b'',
	148	b'}{',
	149	texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
	150	b'}{',
	151	texifier.texify(chunk.author) if chunk.author else b'',
	152	b'}',
	153	texifier.texify(chunk.content)])
d2a41ff4	154
d2a41ff4	155
1fac41bf SW	156	ContentOnlyLayout = br'''
	157	\newcommand{\glowhead}[4]{}
	158	'''
d2a41ff4	159
d2a41ff4	160
1fac41bf SW	161	BelowIconLayout = br'''
1fac41bf SW	162	\newcommand{\glowhead}[4]{\wrapstuffclear
67612898 SW	163	\begin{wrapstuff}[l]
	164	\fbox{
	165	\begin{varwidth}{0.5\textwidth}
	166	\smash{\parbox[t][0pt]{0pt}{
	167	\setlength{\fboxrule}{0.2pt}
	168	\setlength{\fboxsep}{0pt}
	169	\vspace{-3.4pt}
	170	\fbox{\hspace{107mm}}
	171	}\\*}
	172	\vspace{-1em}
	173	\begin{center}
1fac41bf SW	174	#1\ifnotempty
	175	{#1}{\\*}#2\ifnotempty
	176	{#2}{\\*}#3\ifnotempty
	177	{#3}{\\*}#4
67612898 SW	178	\end{center}
67612898 SW	179	\end{varwidth}
23dabdf5	180	}
67612898	181	\end{wrapstuff}
23dabdf5	182
67612898	183	\strut
16385131	184
1fac41bf	185	\noindent}'''
f75c1629 SW	186
f75c1629 SW	187
1fac41bf SW	188	# Why is \textwidth not the width of the text?
	189	# Why is the width of the text .765\textwidth?
	190	BesideIconLayout = br'''
	191	\newcommand{\glowhead}[4]{
f75c1629	192
1fac41bf	193	\strut
f75c1629	194
1fac41bf SW	195	\noindent\fbox{
1fac41bf SW	196	#1
67612898 SW	197	\parbox[b]{.765\textwidth}{
67612898 SW	198	\begin{center}
1fac41bf SW	199	#2\ifnotempty
	200	{#2}{\\*}#3\ifnotempty
	201	{#3}{\\*}#4
67612898	202	\end{center}
f75c1629	203	}
67612898 SW	204	}\\*
67612898 SW	205	\vspace{-0.75em}\\*
1fac41bf	206	\noindent}'''