# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.

from argparse import ArgumentParser
import itertools
import os.path
import subprocess
from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import requests_cache
from xdg_base_dirs import xdg_cache_home


def command_line_parser() -> ArgumentParser:
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
             '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    """Fetch the thread at `url` and parse it into a soup."""
    with session.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')


def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Strip interactive page furniture that should not appear in print."""
    for eb in html.find_all("div", class_="post-edit-box"):
        eb.decompose()
    for footer in html.find_all("div", class_="post-footer"):
        footer.decompose()
    return html


def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Yield the opening post followed by every reply, in page order."""
    def text() -> Tag:
        body = html.body
        assert body
        text = body.find_next("div", class_="post-post")
        assert isinstance(text, Tag)
        return text

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    return itertools.chain([text()], the_replies())


def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    """Convert one post's HTML to LaTeX by piping it through pandoc."""
    return subprocess.run([pandoc, '--from=html', '--to=latex'],
                          input=tag.encode(),
                          stdout=subprocess.PIPE,
                          check=True).stdout


def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[bytes],
        pandoc: str,
) -> None:
    """Write a complete LaTeX document for the thread at `url` to `texout`."""
    texout.write(b'\\documentclass{article}\n\\begin{document}\n')
    html = clean(fetch(url, session, timeout))
    for r in replies(html):
        texout.write(html_to_tex(pandoc, r))
    texout.write(b'\\end{document}\n')


def main() -> None:
    args = command_line_parser().parse_args()
    with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
        with open(args.out + '.tex', 'wb') as texout:
            process(
                args.url,
                session,
                args.timeout,
                texout,
                args.pandoc or 'pandoc')


if __name__ == '__main__':
    main()
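
# Example usage (a sketch only: the post URL and output stem below are
# illustrative, and this assumes the script is saved as paperdoorknob.py with
# pandoc and a LaTeX toolchain such as pdflatex available on PATH):
#
#   python paperdoorknob.py --out book https://glowfic.com/posts/12345
#   pdflatex book.tex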