# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.


from argparse import ArgumentParser
import itertools
import os.path
import subprocess

from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import requests_cache
from xdg_base_dirs import xdg_cache_home


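# Command-line interface: where to cache HTTP responses, the filename stem
# for output, which pandoc binary to run, the request timeout, and the
# glowfic URL to fetch.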
def command_line_parser() -> ArgumentParser:
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
        '(e.g.: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        type=int,  # argparse would otherwise hand requests a string timeout
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


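# Fetch the thread and parse it.  The session is expected to be the cached
# session built in main(), so repeated runs are served from the on-disk
# cache instead of hitting the site again.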
def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    with session.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')


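# Drop page furniture that should not appear in print.  The class names
# ("post-edit-box", "post-footer") match the per-post edit box and footer
# in glowfic's markup.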
def clean(html: BeautifulSoup) -> BeautifulSoup:
    for eb in html.find_all("div", class_="post-edit-box"):
        eb.decompose()
    for footer in html.find_all("div", class_="post-footer"):
        footer.decompose()
    return html


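# Yield the thread's content in reading order: the top post ("post-post")
# followed by each reply ("post-reply").  The asserts narrow the types for
# the type checker and fail loudly if the page layout changes.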
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    def text() -> Tag:
        body = html.body
        assert body
        text = body.find_next("div", class_="post-post")
        assert isinstance(text, Tag)
        return text

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    return itertools.chain([text()], the_replies())


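# Convert a single post's HTML to LaTeX by piping it through pandoc.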
def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    return subprocess.run([pandoc, '--from=html', '--to=latex'],
                          input=tag.encode(),
                          stdout=subprocess.PIPE,
                          check=True).stdout


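# The whole pipeline for one thread: fetch, clean, split into posts, and
# append each post's LaTeX to texout.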
def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[bytes],
        pandoc: str,
) -> None:
    html = clean(fetch(url, session, timeout))
    for r in replies(html):
        texout.write(html_to_tex(pandoc, r))


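# Wire everything together: a cached HTTP session, a .tex file named after
# --out, and whichever pandoc --pandoc points at (falling back to the one
# on $PATH).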
def main() -> None:
    args = command_line_parser().parse_args()
    with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
        with open(args.out + '.tex', 'wb') as texout:
            process(
                args.url,
                session,
                args.timeout,
                texout,
                args.pandoc or 'pandoc')


if __name__ == '__main__':
    main()
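
# Example invocation (the URL is illustrative, not a real thread):
#
#   python paperdoorknob.py --out book https://glowfic.com/posts/12345
#
# This writes the pandoc-converted posts to book.tex and caches HTTP
# responses under $XDG_CACHE_HOME/paperdoorknob (~/.cache/paperdoorknob by
# default) unless --cache_path says otherwise.  Note that the .tex output is
# a body fragment: html_to_tex emits no LaTeX preamble, so the file is not
# yet compilable on its own.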