# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from argparse import ArgumentParser
import itertools
import os.path
import subprocess

from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import requests_cache
from xdg_base_dirs import xdg_cache_home


def command_line_parser() -> ArgumentParser:
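    """Build the command-line parser for paperdoorknob."""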
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
        '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
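    """GET `url` via `session` and parse the body as HTML.

    `timeout` is in seconds; a non-2xx response raises requests.HTTPError.
    """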
    with session.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')


def clean(html: BeautifulSoup) -> BeautifulSoup:
    for eb in html.find_all("div", class_="post-edit-box"):
        eb.decompose()
    for footer in html.find_all("div", class_="post-footer"):
        footer.decompose()
    return html


def replies(html: BeautifulSoup) -> Iterable[Tag]:
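    """Yield the original post followed by each reply, in page order."""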
    def text() -> Tag:
        body = html.body
        assert body
        text = body.find_next("div", class_="post-post")
        assert isinstance(text, Tag)
        return text

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    return itertools.chain([text()], the_replies())


def html_to_tex(pandoc: str, tag: Tag) -> bytes:
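    """Render one HTML tag to LaTeX by piping it through pandoc."""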
    return subprocess.run([pandoc, '--from=html', '--to=latex'],
                          input=tag.encode(),
                          stdout=subprocess.PIPE,
                          check=True).stdout


def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[bytes],
        pandoc: str,
) -> None:
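    """Write a complete LaTeX document for the glowfic at `url` to `texout`."""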
    texout.write(b'\\documentclass{article}\n\\begin{document}\n')
    html = clean(fetch(url, session, timeout))
    for r in replies(html):
        texout.write(html_to_tex(pandoc, r))
    texout.write(b'\\end{document}\n')


def main() -> None:
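    """Parse the command line, then fetch and convert the requested glowfic.

    Example invocation (the URL is illustrative):
        python paperdoorknob.py --out book https://example.com/posts/1234
    """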
    args = command_line_parser().parse_args()
    with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
        with open(args.out + '.tex', 'wb') as texout:
            process(
                args.url,
                session,
                args.timeout,
                texout,
                args.pandoc or 'pandoc')


if __name__ == '__main__':
    main()