# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from argparse import ArgumentParser
import itertools
import os.path
import subprocess

from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import requests_cache
from xdg_base_dirs import xdg_cache_home


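# Command-line interface: where to cache HTTP responses, the filename stem
# for output, which pandoc binary to run, the request timeout, and the
# glowfic URL to fetch.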
def command_line_parser() -> ArgumentParser:
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
        '(e.g.: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        type=int,  # argparse would otherwise hand requests a string timeout
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


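# Fetch the thread and parse it.  The session is expected to be the cached
# session built in main(), so repeated runs are served from the on-disk
# cache instead of hitting the site again.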
def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    with session.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')


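# Drop page furniture that should not appear in print.  The class names
# ("post-edit-box", "post-footer") match the per-post edit box and footer
# in glowfic's markup.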
def clean(html: BeautifulSoup) -> BeautifulSoup:
    for eb in html.find_all("div", class_="post-edit-box"):
        eb.decompose()
    for footer in html.find_all("div", class_="post-footer"):
        footer.decompose()
    return html


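# Yield the thread's content in reading order: the top post ("post-post")
# followed by each reply ("post-reply").  The asserts narrow the types for
# the type checker and fail loudly if the page layout changes.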
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    def text() -> Tag:
        body = html.body
        assert body
        text = body.find_next("div", class_="post-post")
        assert isinstance(text, Tag)
        return text

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    return itertools.chain([text()], the_replies())


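# Convert a single post's HTML to LaTeX by piping it through pandoc.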
def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    return subprocess.run([pandoc, '--from=html', '--to=latex'],
                          input=tag.encode(),
                          stdout=subprocess.PIPE,
                          check=True).stdout


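# The whole pipeline for one thread: fetch, clean, split into posts, and
# append each post's LaTeX to texout.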
def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[bytes],
        pandoc: str,
) -> None:
    html = clean(fetch(url, session, timeout))
    for r in replies(html):
        texout.write(html_to_tex(pandoc, r))


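# Wire everything together: a cached HTTP session, a .tex file named after
# --out, and whichever pandoc --pandoc points at (falling back to the one
# on $PATH).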
def main() -> None:
    args = command_line_parser().parse_args()
    with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
        with open(args.out + '.tex', 'wb') as texout:
            process(
                args.url,
                session,
                args.timeout,
                texout,
                args.pandoc or 'pandoc')


if __name__ == '__main__':
    main()
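
# Example invocation (the URL is illustrative, not a real thread):
#
#   python paperdoorknob.py --out book https://glowfic.com/posts/12345
#
# This writes the pandoc-converted posts to book.tex and caches HTTP
# responses under $XDG_CACHE_HOME/paperdoorknob (~/.cache/paperdoorknob by
# default) unless --cache_path says otherwise.  Note that the .tex output is
# a body fragment: html_to_tex emits no LaTeX preamble, so the file is not
# yet compilable on its own.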