[paperdoorknob] / glowfic.py

# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from dataclasses import dataclass
import itertools
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

from typing import Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag

from images import ImageStore
from spec import Spec
from texify import Texifier


def _removeViewFromURL(url: str) -> str:
    u = urlparse(url)
    old_qs = parse_qsl(u.query)
    new_qs = [(k, v) for k, v in old_qs if k != 'view']
    return urlunparse(u._replace(query=urlencode(new_qs)))


def nonFlatURL(url: str) -> str:
    return _removeViewFromURL(url)


def flatURL(url: str) -> str:
    u = urlparse(_removeViewFromURL(url))
    qs = parse_qsl(u.query) + [('view', 'flat')]
    return urlunparse(u._replace(query=urlencode(qs)))


@dataclass(frozen=True)
class Chunk:
    """The pieces of one thread entry, as extracted by ``makeChunk``.

    The dataclass is frozen, so fields cannot be reassigned after
    construction.  Field order is the positional constructor order.
    """
    # Value returned by ImageStore.get_image for the icon image's ``src``,
    # or None when the entry has no 'post-icon' div or no img inside it.
    icon: str | None
    # The entry's 'post-character' div (link hrefs removed), or None.
    character: Tag | None
    # The entry's 'post-screenname' div (link hrefs removed), or None.
    screen_name: Tag | None
    # The entry's 'post-author' div (link hrefs removed), or None.
    author: Tag | None
    # The entry's 'post-content' div; always present.
    content: Tag

# We avoid the name "post" because the Glowfic community uses the term
# inconsistently:
#  * The Glowfic software sometimes uses "post" to refer to a whole thread
#    (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
#    but mostly uses "post" to refer to just the first chunk in a thread
#    (in the HTML and UI).  The non-first chunks are "replies".
#  * Readers and this software don't need to distinguish first-chunks and
#    non-first-chunks.
#  * Humans in the community tend to use "posts" to mean chunks.


class Thread:
    """A thread, held as a parsed HTML document.

    Build it either from a ``Spec`` (the flat-view page is fetched through
    ``spec.fetcher``, run through ``spec.htmlfilter`` and parsed with
    ``html.parser``) or from an existing ``BeautifulSoup`` document, which
    is used as-is.
    """

    def __init__(self, thing: BeautifulSoup | Spec) -> None:
        if not isinstance(thing, Spec):
            # Already a parsed document: nothing to fetch.
            self._dom = thing
            return

        thing.log('Fetching HTML...\r')
        raw_html = thing.fetcher.fetch(flatURL(thing.url))
        thing.log('Parsing HTML...\r')
        self._dom = BeautifulSoup(thing.htmlfilter(raw_html), 'html.parser')

    def title(self) -> str | None:
        """Return the stripped text of the ``span#post-title`` element.

        Returns None when no such element is found.
        """
        found = self._dom.findChild("span", id="post-title")
        return found.text.strip() if isinstance(found, Tag) else None

    def chunkDOMs(self) -> Iterable[Tag]:
        """Return the thread's chunk elements, first chunk first.

        The first item is the first ``div.post-post`` found after the
        document body starts; it is followed by every ``div.post-reply``
        in the document.  Both lookups (and their assertions) happen
        immediately, before the iterable is returned.
        """
        body = self._dom.body
        assert body
        first = body.find_next("div", class_="post-post")
        assert isinstance(first, Tag)

        replies = self._dom.find_all("div", class_="post-reply")
        assert all(isinstance(reply, Tag) for reply in replies)

        return itertools.chain([first], replies)


def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
    """Build a ``Chunk`` from one chunk element of a thread page.

    ``chunk_dom`` must contain a ``div.post-content``; the icon, character,
    screen-name and author parts are optional and become None when absent.
    The ``href`` attribute is deleted (in place) from every link inside the
    character, screen-name and author divs.  The icon's ``src`` is handed
    to ``image_store.get_image`` and its return value is stored.
    """

    def icon_path() -> str | None:
        wrapper = chunk_dom.findChild('div', class_='post-icon')
        if wrapper is None:
            return None
        assert isinstance(wrapper, Tag)
        img = wrapper.findChild('img')
        if img is None:
            return None
        assert isinstance(img, Tag)
        return image_store.get_image(img.attrs['src'])

    def meta(css_class: str) -> Tag | None:
        found = chunk_dom.findChild('div', class_=css_class)
        assert found is None or isinstance(found, Tag)
        if found is not None:
            # Drop link targets from the metadata, keeping the link text.
            for link in found.findChildren("a"):
                link.attrs.pop("href", None)
        return found

    # Checked first so a chunk without content fails before any icon lookup.
    body = chunk_dom.findChild('div', class_='post-content')
    assert isinstance(body, Tag)

    return Chunk(
        icon=icon_path(),
        character=meta('post-character'),
        screen_name=meta('post-screenname'),
        author=meta('post-author'),
        content=body,
    )


def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
    r"""Return the LaTeX bytes for one chunk.

    The output is ``\glowhead{icon}{character}{screen name}{author}``
    immediately followed by the texified content.  A missing (falsy) part
    leaves its braces empty; the icon, when present, is wrapped in
    ``\glowicon{...}`` and UTF-8 encoded.
    """
    pieces = [br'\glowhead{']
    if chunk.icon:
        pieces.append(br'\glowicon{%s}' % chunk.icon.encode('UTF-8'))

    # Each metadata argument closes the previous brace group and opens its own.
    for part in (chunk.character, chunk.screen_name, chunk.author):
        pieces.append(b'}{')
        if part:
            pieces.append(texifier.texify(part))

    pieces.append(b'}')
    pieces.append(texifier.texify(chunk.content))
    return b''.join(pieces)


# LaTeX definition of \glowhead for the content-only layout: the macro takes
# the four arguments emitted by renderChunk (icon, character, screen name,
# author) and expands to nothing, so only the chunk content is typeset.
ContentOnlyLayout = br'''
\newcommand{\glowhead}[4]{}
'''


# LaTeX definition of \glowhead that stacks the four arguments emitted by
# renderChunk (#1 icon, #2 character, #3 screen name, #4 author) vertically:
# they are centered inside a framed varwidth box (at most 0.5\textwidth wide)
# placed in a left-aligned wrapstuff environment.  Each of #1..#3 is followed
# by a line break only when it is non-empty (\ifnotempty).
# The macro relies on \wrapstuffclear, the wrapstuff and varwidth
# environments and \ifnotempty being defined by the surrounding document.
# NOTE(review): the \smash'ed zero-height \fbox around \hspace{107mm} looks
# like it draws a thin horizontal rule above the header -- confirm.
BelowIconLayout = br'''
\newcommand{\glowhead}[4]{\wrapstuffclear
\begin{wrapstuff}[l]
\fbox{
\begin{varwidth}{0.5\textwidth}
  \smash{\parbox[t][0pt]{0pt}{
    \setlength{\fboxrule}{0.2pt}
    \setlength{\fboxsep}{0pt}
    \vspace{-3.4pt}
    \fbox{\hspace{107mm}}
  }\\*}
  \vspace{-1em}
\begin{center}
#1\ifnotempty
{#1}{\\*}#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
\end{varwidth}
}
\end{wrapstuff}

\strut

\noindent}'''


# LaTeX definition of \glowhead that puts the icon (#1) and the remaining
# metadata side by side in one frame: a single \fbox holds #1 followed by a
# bottom-aligned \parbox in which #2 (character), #3 (screen name) and
# #4 (author) are centered and stacked, with a line break after #2 and #3
# only when they are non-empty (\ifnotempty).
#
# Open questions from the original author:
# Why is \textwidth not the width of the text?
# Why is the width of the text .765\textwidth?
# NOTE(review): the \parbox shares its line with the icon and the \fbox
# padding/rule, which may be why it has to be narrower than \textwidth;
# the .765 factor presumably depends on the icon width -- confirm.
BesideIconLayout = br'''
\newcommand{\glowhead}[4]{

\strut

\noindent\fbox{
#1
\parbox[b]{.765\textwidth}{
\begin{center}
#2\ifnotempty
{#2}{\\*}#3\ifnotempty
{#3}{\\*}#4
\end{center}
}
}\\*
\vspace{-0.75em}\\*
\noindent}'''
Commit	Line	Data
e6adf6ce SW	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
aa060d9b	8	from dataclasses import dataclass
e6adf6ce	9	import itertools
1452f8d3	10	from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
e6adf6ce SW	11
	12	from typing import Iterable
	13
	14	from bs4 import BeautifulSoup
	15	from bs4.element import Tag
	16
aa060d9b	17	from images import ImageStore
70adfbff	18	from spec import Spec
d2a41ff4	19	from texify import Texifier
aa060d9b SW	20
aa060d9b SW	21
1452f8d3 SW	22	def _removeViewFromURL(url: str) -> str:
	23	u = urlparse(url)
	24	old_qs = parse_qsl(u.query)
	25	new_qs = [(k, v) for k, v in old_qs if k != 'view']
	26	return urlunparse(u._replace(query=urlencode(new_qs)))
	27
	28
	29	def nonFlatURL(url: str) -> str:
	30	return _removeViewFromURL(url)
	31
	32
	33	def flatURL(url: str) -> str:
	34	u = urlparse(_removeViewFromURL(url))
	35	qs = parse_qsl(u.query) + [('view', 'flat')]
	36	return urlunparse(u._replace(query=urlencode(qs)))
	37
	38
aa060d9b SW	39	@dataclass(frozen=True)
	40	class Chunk:
	41	icon: str \| None
37c47bc2 SW	42	character: Tag \| None
	43	screen_name: Tag \| None
	44	author: Tag \| None
aa060d9b SW	45	content: Tag
aa060d9b SW	46
e6adf6ce SW	47	# We avoid the name "post" because the Glowfic community uses the term
	48	# inconsistently:
	49	# * The Glowfic software sometimes uses "post" to refer to a whole thread
aa060d9b SW	50	# (in the URL), sometimes uses "post" to refer to chunks (in the CSS),
	51	# but mostly uses "post" to refer to just the first chunk in a thread
	52	# (in the HTML and UI). The non-first chunks are "replies".
e6adf6ce SW	53	# * Readers and this software don't need to distinguish first-chunks and
e6adf6ce SW	54	# non-first-chunks.
aa060d9b	55	# * Humans in the community tend to use "posts" to mean chunks.
e6adf6ce SW	56
e6adf6ce SW	57
94027099 SW	58	class Thread:
94027099 SW	59
70adfbff SW	60	def __init__(self, thing: BeautifulSoup \| Spec) -> None:
	61	if isinstance(thing, Spec):
	62	spec = thing
	63	spec.log('Fetching HTML...\r')
	64	html = spec.fetcher.fetch(flatURL(spec.url))
	65	spec.log('Parsing HTML...\r')
	66	self._dom = BeautifulSoup(spec.htmlfilter(html), 'html.parser')
	67	else:
	68	self._dom = thing
94027099	69
21e82200 SW	70	def title(self) -> str \| None:
	71	span = self._dom.findChild("span", id="post-title")
	72	if not isinstance(span, Tag):
	73	return None
	74	return span.text.strip()
	75
94027099 SW	76	def chunkDOMs(self) -> Iterable[Tag]:
94027099 SW	77	def text() -> Tag:
a18519bf	78	body = self._dom.body
94027099 SW	79	assert body
	80	text = body.find_next("div", class_="post-post")
	81	assert isinstance(text, Tag)
	82	return text
	83
	84	def the_replies() -> Iterable[Tag]:
a18519bf	85	rs = self._dom.find_all("div", class_="post-reply")
94027099 SW	86	assert all(isinstance(r, Tag) for r in rs)
	87	return rs
	88
	89	return itertools.chain([text()], the_replies())
aa060d9b SW	90
	91
	92	def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk:
	93
	94	def getIcon() -> str \| None:
551bb1c9	95	icon_div = chunk_dom.findChild('div', class_='post-icon')
aa060d9b SW	96	if icon_div is None:
aa060d9b SW	97	return None
551bb1c9 SW	98	assert isinstance(icon_div, Tag)
551bb1c9 SW	99	icon_img = icon_div.findChild('img')
aa060d9b SW	100	if icon_img is None:
	101	return None
	102	assert isinstance(icon_img, Tag)
	103	return image_store.get_image(icon_img.attrs['src'])
	104
37c47bc2	105	def getByClass(css_class: str) -> Tag \| None:
551bb1c9	106	tag = chunk_dom.findChild('div', class_=css_class)
37c47bc2 SW	107	assert tag is None or isinstance(tag, Tag)
37c47bc2 SW	108	return tag
aa060d9b	109
62043b2b SW	110	def stripHREF(tag: Tag) -> None:
	111	for c in tag.findChildren("a"):
	112	if "href" in c.attrs:
	113	del c.attrs["href"]
	114
	115	def getMeta(css_class: str) -> Tag \| None:
	116	tag = getByClass(css_class)
	117	if tag is None:
	118	return None
	119	stripHREF(tag)
	120	return tag
	121
551bb1c9	122	content = chunk_dom.findChild('div', class_='post-content')
aa060d9b SW	123	assert isinstance(content, Tag)
	124
	125	return Chunk(getIcon(),
62043b2b SW	126	getMeta('post-character'),
	127	getMeta('post-screenname'),
	128	getMeta('post-author'),
aa060d9b	129	content)
d2a41ff4 SW	130
d2a41ff4 SW	131
1fac41bf SW	132	def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes:
	133	return b''.join([
	134	br'\glowhead{',
	135	br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'',
	136	b'}{',
	137	texifier.texify(chunk.character) if chunk.character else b'',
	138	b'}{',
	139	texifier.texify(chunk.screen_name) if chunk.screen_name else b'',
	140	b'}{',
	141	texifier.texify(chunk.author) if chunk.author else b'',
	142	b'}',
	143	texifier.texify(chunk.content)])
d2a41ff4	144
d2a41ff4	145
1fac41bf SW	146	ContentOnlyLayout = br'''
	147	\newcommand{\glowhead}[4]{}
	148	'''
d2a41ff4	149
d2a41ff4	150
1fac41bf SW	151	BelowIconLayout = br'''
1fac41bf SW	152	\newcommand{\glowhead}[4]{\wrapstuffclear
67612898 SW	153	\begin{wrapstuff}[l]
	154	\fbox{
	155	\begin{varwidth}{0.5\textwidth}
	156	\smash{\parbox[t][0pt]{0pt}{
	157	\setlength{\fboxrule}{0.2pt}
	158	\setlength{\fboxsep}{0pt}
	159	\vspace{-3.4pt}
	160	\fbox{\hspace{107mm}}
	161	}\\*}
	162	\vspace{-1em}
	163	\begin{center}
1fac41bf SW	164	#1\ifnotempty
	165	{#1}{\\*}#2\ifnotempty
	166	{#2}{\\*}#3\ifnotempty
	167	{#3}{\\*}#4
67612898 SW	168	\end{center}
67612898 SW	169	\end{varwidth}
23dabdf5	170	}
67612898	171	\end{wrapstuff}
23dabdf5	172
67612898	173	\strut
16385131	174
1fac41bf	175	\noindent}'''
f75c1629 SW	176
f75c1629 SW	177
1fac41bf SW	178	# Why is \textwidth not the width of the text?
	179	# Why is the width of the text .765\textwidth?
	180	BesideIconLayout = br'''
	181	\newcommand{\glowhead}[4]{
f75c1629	182
1fac41bf	183	\strut
f75c1629	184
1fac41bf SW	185	\noindent\fbox{
1fac41bf SW	186	#1
67612898 SW	187	\parbox[b]{.765\textwidth}{
67612898 SW	188	\begin{center}
1fac41bf SW	189	#2\ifnotempty
	190	{#2}{\\*}#3\ifnotempty
	191	{#3}{\\*}#4
67612898	192	\end{center}
f75c1629	193	}
67612898 SW	194	}\\*
67612898 SW	195	\vspace{-0.75em}\\*
1fac41bf	196	\noindent}'''