[paperdoorknob] / paperdoorknob.py

# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from argparse import ArgumentParser
import itertools
import os.path

from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
from xdg_base_dirs import xdg_cache_home

from fetch import CachingFetcher, Fetcher
from texify import PandocTexifier, Texifier


def command_line_parser() -> ArgumentParser:
    """Build the argument parser for the paperdoorknob command line.

    Returns:
        A parser accepting the optional ``--cache_path``, ``--out``,
        ``--pandoc`` and ``--timeout`` flags plus one positional ``url``.
    """
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
             '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        # Without an explicit type, a value given on the command line would
        # arrive as a str while the default below stays an int.
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


def parse(content: bytes) -> BeautifulSoup:
    """Turn raw HTML bytes into a BeautifulSoup tree.

    The 'html.parser' backend is requested explicitly.
    """
    soup = BeautifulSoup(content, 'html.parser')
    return soup


def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Strip edit boxes and post footers out of a parsed page.

    Every ``<div>`` whose class is ``post-edit-box`` or ``post-footer`` is
    removed from the tree. The tree is modified in place and the same object
    is returned so calls can be chained.
    """
    # Edit boxes are removed first, then footers.
    for unwanted_class in ("post-edit-box", "post-footer"):
        for div in html.find_all("div", class_=unwanted_class):
            div.decompose()
    return html


def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Yield the opening post followed by every reply on the page.

    The opening post is the first ``<div class="post-post">`` found after
    ``<body>``; the replies are all ``<div class="post-reply">`` elements in
    the document.

    Raises:
        AssertionError: if the document has no ``<body>``, if no
            ``post-post`` div is found, or if a reply match is not a Tag.
    """
    # Locate the opening post first; a page without one is rejected.
    page_body = html.body
    assert page_body
    opening_post = page_body.find_next("div", class_="post-post")
    assert isinstance(opening_post, Tag)

    # Then collect the replies, checking each match really is a Tag.
    reply_divs = html.find_all("div", class_="post-reply")
    assert all(isinstance(div, Tag) for div in reply_divs)

    return itertools.chain([opening_post], reply_divs)


def process(
        url: str,
        fetcher: Fetcher,
        texifier: Texifier,
        texout: IO[bytes]) -> None:
    """Fetch one page and write it to ``texout`` as a LaTeX document.

    Args:
        url: Address of the page to retrieve through ``fetcher``.
        fetcher: Source of the page content.
        texifier: Converts each post or reply element into LaTeX bytes.
        texout: Binary stream receiving the document.
    """
    # The preamble goes out before the fetch is attempted.
    texout.write(b'\\documentclass{article}\n\\begin{document}\n')
    page = parse(fetcher.fetch(url))
    for post in replies(clean(page)):
        texout.write(texifier.texify(post))
    texout.write(b'\\end{document}\n')


def main() -> None:
    """Command-line entry point: fetch the given URL and write ``<out>.tex``.

    Parses the command line, builds a texifier (using ``pandoc`` when no
    ``--pandoc`` location was given), and writes the output of ``process``
    to a file named by the ``--out`` stem plus ``.tex``.
    """
    options = command_line_parser().parse_args()
    pandoc_path = options.pandoc or 'pandoc'
    texifier = PandocTexifier(pandoc_path)
    with CachingFetcher(options.cache_path, options.timeout) as fetcher, \
            open(options.out + '.tex', 'wb') as texout:
        process(options.url, fetcher, texifier, texout)


# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Commit	Line	Data
92b11a10 SW	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
	8	from argparse import ArgumentParser
55958ec0	9	import itertools
ba3b7c52	10	import os.path
a0d30541	11
a2d42468	12	from typing import IO, Iterable
a0d30541	13
136277e3	14	from bs4 import BeautifulSoup
6409066b	15	from bs4.element import Tag
ba3b7c52	16	from xdg_base_dirs import xdg_cache_home
92b11a10	17
a64403ac	18	from fetch import CachingFetcher, Fetcher
79631507	19	from texify import PandocTexifier, Texifier
a64403ac	20
92b11a10 SW	21
	22	def command_line_parser() -> ArgumentParser:
	23	parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
ba3b7c52 SW	24	parser.add_argument(
	25	'--cache_path',
	26	metavar='PATH',
	27	help='Where to keep the http cache (instead of %(default)s)',
	28	default=os.path.join(xdg_cache_home(), "paperdoorknob"))
a2d42468 SW	29	parser.add_argument(
	30	'--out',
	31	help='The filename stem at which to write output ' +
	32	'(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
	33	default='book')
1e512629	34	parser.add_argument('--pandoc', help='Location of the pandoc executable')
b25a2f90 SW	35	parser.add_argument(
	36	'--timeout',
	37	help='How long to wait for HTTP requests, in seconds',
	38	default=30)
	39	parser.add_argument('url', help='URL to retrieve')
92b11a10 SW	40	return parser
	41
	42
bf06f467 SW	43	def parse(content: bytes) -> BeautifulSoup:
bf06f467 SW	44	return BeautifulSoup(content, 'html.parser')
b25a2f90 SW	45
b25a2f90 SW	46
47cfa3cd SW	47	def clean(html: BeautifulSoup) -> BeautifulSoup:
	48	for eb in html.find_all("div", class_="post-edit-box"):
	49	eb.decompose()
	50	for footer in html.find_all("div", class_="post-footer"):
	51	footer.decompose()
	52	return html
	53
	54
	55	def replies(html: BeautifulSoup) -> Iterable[Tag]:
	56	def text() -> Tag:
	57	body = html.body
	58	assert body
	59	text = body.find_next("div", class_="post-post")
	60	assert isinstance(text, Tag)
	61	return text
	62
	63	def the_replies() -> Iterable[Tag]:
	64	rs = html.find_all("div", class_="post-reply")
	65	assert all(isinstance(r, Tag) for r in rs)
	66	return rs
	67
	68	return itertools.chain([text()], the_replies())
	69
	70
	71	def process(
	72	url: str,
a64403ac	73	fetcher: Fetcher,
79631507 SW	74	texifier: Texifier,
79631507 SW	75	texout: IO[bytes]) -> None:
07f9b178	76	texout.write(b'\\documentclass{article}\n\\begin{document}\n')
bf06f467	77	html = clean(parse(fetcher.fetch(url)))
36ae1d5f	78	for r in replies(html):
79631507	79	texout.write(texifier.texify(r))
07f9b178	80	texout.write(b'\\end{document}\n')
47cfa3cd SW	81
47cfa3cd SW	82
92b11a10	83	def main() -> None:
b25a2f90	84	args = command_line_parser().parse_args()
79631507	85	texifier = PandocTexifier(args.pandoc or 'pandoc')
a64403ac	86	with CachingFetcher(args.cache_path, args.timeout) as fetcher:
36ae1d5f	87	with open(args.out + '.tex', 'wb') as texout:
79631507	88	process(args.url, fetcher, texifier, texout)
92b11a10 SW	89
	90
	91	if __name__ == '__main__':
	92	main()