]> git.scottworley.com Git - paperdoorknob/blame - paperdoorknob.py
--pandoc flag
[paperdoorknob] / paperdoorknob.py
CommitLineData
92b11a10
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
8from argparse import ArgumentParser
55958ec0 9import itertools
ba3b7c52 10import os.path
a0d30541 11
a2d42468 12from typing import IO, Iterable
a0d30541 13
136277e3 14from bs4 import BeautifulSoup
6409066b 15from bs4.element import Tag
b25a2f90 16import requests
b34a368f 17import requests_cache
ba3b7c52 18from xdg_base_dirs import xdg_cache_home
92b11a10
SW
19
20
def command_line_parser() -> ArgumentParser:
    """Build the argument parser for the paperdoorknob command-line tool.

    Options: --cache_path (HTTP cache dir, defaults under XDG cache home),
    --out (output filename stem), --pandoc (pandoc executable location),
    --timeout (HTTP timeout in seconds), and the positional url to fetch.
    """
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
        '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        # BUG FIX: without type=int, a user-supplied --timeout arrives as a
        # str (only the default 30 was an int), which then reaches
        # session.get(timeout=...) untyped.
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser
40
41
def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    """Retrieve *url* via *session* and return the parsed HTML.

    Raises requests.HTTPError on a non-success status code.
    """
    response = session.get(url, timeout=timeout)
    with response:
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
b25a2f90
SW
46
47
47cfa3cd
SW
def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Remove edit-box and footer divs from the page, mutating it in place.

    Returns the same BeautifulSoup object for call chaining.
    """
    for css_class in ("post-edit-box", "post-footer"):
        for node in html.find_all("div", class_=css_class):
            node.decompose()
    return html
54
55
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Return the top post followed by every reply div, in page order."""
    body = html.body
    assert body
    # The opening post lives in a "post-post" div after the body start.
    top_post = body.find_next("div", class_="post-post")
    assert isinstance(top_post, Tag)
    reply_divs = html.find_all("div", class_="post-reply")
    assert all(isinstance(r, Tag) for r in reply_divs)
    return itertools.chain([top_post], reply_divs)
70
71
def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[str],
) -> None:
    """Fetch and clean the glowfic at *url*, writing output to *texout*.

    Output generation is not implemented yet; a placeholder is written.
    """
    page = fetch(url, session, timeout)
    cleaned = clean(page)
    replies(cleaned)
    print("soon", file=texout)
47cfa3cd
SW
81
82
def main() -> None:
    """Entry point: parse arguments, then fetch and render the glowfic."""
    args = command_line_parser().parse_args()
    session = requests_cache.CachedSession(args.cache_path, cache_control=True)
    with session, open(args.out + '.tex', 'w', encoding='UTF-8') as texout:
        process(args.url, session, args.timeout, texout)
92b11a10
SW
88
89
# Run as a script: delegate everything to main().
if __name__ == '__main__':
    main()