git.scottworley.com Git - paperdoorknob/blame_incremental

... / ...

Commit	Line	Data
	1	# paperdoorknob: Print glowfic
	2	#
	3	# This program is free software: you can redistribute it and/or modify it
	4	# under the terms of the GNU General Public License as published by the
	5	# Free Software Foundation, version 3.
	6
	7
	8	from argparse import ArgumentParser
	9	import itertools
	10	import os.path
	11	import subprocess
	12
	13	from typing import IO, Iterable
	14
	15	from bs4 import BeautifulSoup
	16	from bs4.element import Tag
	17	import requests
	18	import requests_cache
	19	from xdg_base_dirs import xdg_cache_home
	20
	21
	22	def command_line_parser() -> ArgumentParser:
	23	parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
	24	parser.add_argument(
	25	'--cache_path',
	26	metavar='PATH',
	27	help='Where to keep the http cache (instead of %(default)s)',
	28	default=os.path.join(xdg_cache_home(), "paperdoorknob"))
	29	parser.add_argument(
	30	'--out',
	31	help='The filename stem at which to write output ' +
	32	'(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
	33	default='book')
	34	parser.add_argument('--pandoc', help='Location of the pandoc executable')
	35	parser.add_argument(
	36	'--timeout',
	37	help='How long to wait for HTTP requests, in seconds',
	38	default=30)
	39	parser.add_argument('url', help='URL to retrieve')
	40	return parser
	41
	42
	43	def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
	44	with session.get(url, timeout=timeout) as r:
	45	r.raise_for_status()
	46	return BeautifulSoup(r.text, 'html.parser')
	47
	48
	49	def clean(html: BeautifulSoup) -> BeautifulSoup:
	50	for eb in html.find_all("div", class_="post-edit-box"):
	51	eb.decompose()
	52	for footer in html.find_all("div", class_="post-footer"):
	53	footer.decompose()
	54	return html
	55
	56
	57	def replies(html: BeautifulSoup) -> Iterable[Tag]:
	58	def text() -> Tag:
	59	body = html.body
	60	assert body
	61	text = body.find_next("div", class_="post-post")
	62	assert isinstance(text, Tag)
	63	return text
	64
	65	def the_replies() -> Iterable[Tag]:
	66	rs = html.find_all("div", class_="post-reply")
	67	assert all(isinstance(r, Tag) for r in rs)
	68	return rs
	69
	70	return itertools.chain([text()], the_replies())
	71
	72
	73	def html_to_tex(pandoc: str, tag: Tag) -> bytes:
	74	return subprocess.run([pandoc, '--from=html', '--to=latex'],
	75	input=tag.encode(),
	76	stdout=subprocess.PIPE,
	77	check=True).stdout
	78
	79
	80	def process(
	81	url: str,
	82	session: requests.Session,
	83	timeout: int,
	84	texout: IO[bytes],
	85	pandoc: str,
	86	) -> None:
	87	html = clean(fetch(url, session, timeout))
	88	for r in replies(html):
	89	texout.write(html_to_tex(pandoc, r))
	90
	91
	92	def main() -> None:
	93	args = command_line_parser().parse_args()
	94	with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
	95	with open(args.out + '.tex', 'wb') as texout:
	96	process(
	97	args.url,
	98	session,
	99	args.timeout,
	100	texout,
	101	args.pandoc or 'pandoc')
	102
	103
	104	if __name__ == '__main__':
	105	main()