git.scottworley.com Git - paperdoorknob/blob - paperdoorknob.py
Invoke pandoc to convert HTML to LaTeX
[paperdoorknob] / paperdoorknob.py
1 # paperdoorknob: Print glowfic
2 #
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
6
7
8 from argparse import ArgumentParser
9 import itertools
10 import os.path
11 import subprocess
12
13 from typing import IO, Iterable
14
15 from bs4 import BeautifulSoup
16 from bs4.element import Tag
17 import requests
18 import requests_cache
19 from xdg_base_dirs import xdg_cache_home
20
21
def command_line_parser() -> ArgumentParser:
    """Build the command-line argument parser for paperdoorknob.

    Returns:
        An ArgumentParser accepting the options ``--cache_path``, ``--out``,
        ``--pandoc`` and ``--timeout``, plus the positional ``url``.
    """
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        # main() writes the LaTeX output to <out>.tex, so the example in the
        # help text names that extension.
        help='The filename stem at which to write output ' +
        '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        # argparse leaves command-line values as str unless a type is given;
        # fetch() and process() declare the timeout as an int.
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser
41
42
def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    """Download ``url`` through ``session`` and parse the body as HTML.

    Args:
        url: Address of the page to retrieve.
        session: Session used to issue the GET request.
        timeout: Passed to the request as its ``timeout`` argument.

    Returns:
        The response text parsed with BeautifulSoup's 'html.parser'.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    with session.get(url, timeout=timeout) as response:
        response.raise_for_status()
        markup = response.text
    return BeautifulSoup(markup, 'html.parser')
47
48
def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Strip edit boxes and post footers from ``html`` in place.

    Every ``div`` with class ``post-edit-box`` is removed first, then every
    ``div`` with class ``post-footer``.

    Returns:
        The same ``html`` object that was passed in, after modification.
    """
    # Order matters only in that it mirrors the two separate passes.
    for unwanted_class in ("post-edit-box", "post-footer"):
        for div in html.find_all("div", class_=unwanted_class):
            div.decompose()
    return html
55
56
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Return the opening post followed by every reply in ``html``.

    The opening post is the first ``div`` with class ``post-post`` found
    after the document body; the replies are all ``div`` elements with class
    ``post-reply``.

    Returns:
        An iterable yielding the opening post's Tag, then each reply Tag.
    """
    body = html.body
    assert body
    opening_post = body.find_next("div", class_="post-post")
    assert isinstance(opening_post, Tag)

    reply_divs = html.find_all("div", class_="post-reply")
    assert all(isinstance(div, Tag) for div in reply_divs)

    return itertools.chain([opening_post], reply_divs)
71
72
def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    """Convert one HTML element to LaTeX by piping it through pandoc.

    Args:
        pandoc: The pandoc executable to run.
        tag: The element whose encoded markup is sent to pandoc's stdin.

    Returns:
        The bytes pandoc wrote to its standard output.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    command = [pandoc, '--from=html', '--to=latex']
    completed = subprocess.run(
        command,
        input=tag.encode(),
        stdout=subprocess.PIPE,
        check=True)
    return completed.stdout
78
79
def process(
    url: str,
    session: requests.Session,
    timeout: int,
    texout: IO[bytes],
    pandoc: str,
) -> None:
    """Fetch ``url``, clean it, and write each post to ``texout`` as LaTeX.

    Args:
        url: Address of the page to retrieve.
        session: Session used for the HTTP request.
        timeout: Timeout handed to the HTTP request.
        texout: Binary stream that receives the converted LaTeX.
        pandoc: The pandoc executable used for the conversion.
    """
    page = fetch(url, session, timeout)
    cleaned = clean(page)
    for post in replies(cleaned):
        latex = html_to_tex(pandoc, post)
        texout.write(latex)
90
91
def main() -> None:
    """Parse the command line, then convert the requested URL.

    The output is written to ``<out>.tex``. HTTP traffic goes through a
    cached session stored at the configured cache path. If ``--pandoc`` is
    not given, the executable name 'pandoc' is used.
    """
    args = command_line_parser().parse_args()
    pandoc = args.pandoc or 'pandoc'
    with requests_cache.CachedSession(
            args.cache_path, cache_control=True) as session, \
            open(args.out + '.tex', 'wb') as texout:
        process(args.url, session, args.timeout, texout, pandoc)
102
103
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()