]> git.scottworley.com Git - paperdoorknob/blame - paperdoorknob.py
--pandoc flag
[paperdoorknob] / paperdoorknob.py
CommitLineData
92b11a10
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
8from argparse import ArgumentParser
55958ec0 9import itertools
ba3b7c52 10import os.path
a0d30541 11
a2d42468 12from typing import IO, Iterable
a0d30541 13
136277e3 14from bs4 import BeautifulSoup
6409066b 15from bs4.element import Tag
b25a2f90 16import requests
b34a368f 17import requests_cache
ba3b7c52 18from xdg_base_dirs import xdg_cache_home
92b11a10
SW
19
20
def command_line_parser() -> ArgumentParser:
    """Build the argument parser for the paperdoorknob command-line tool.

    Options: --cache_path (HTTP cache dir, defaults under XDG cache home),
    --out (output filename stem), --pandoc (pandoc executable location),
    --timeout (HTTP timeout in seconds), and the positional url to fetch.
    """
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
        '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        # BUG FIX: without type=int, a user-supplied --timeout arrives as a
        # str (only the default 30 was an int), which then reaches
        # session.get(timeout=...) untyped.
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser
40
41
def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    """Retrieve *url* via *session* and return the parsed HTML.

    Raises requests.HTTPError on a non-success status code.
    """
    response = session.get(url, timeout=timeout)
    with response:
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
b25a2f90
SW
46
47
47cfa3cd
SW
def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Remove edit-box and footer divs from the page, mutating it in place.

    Returns the same BeautifulSoup object for call chaining.
    """
    for css_class in ("post-edit-box", "post-footer"):
        for node in html.find_all("div", class_=css_class):
            node.decompose()
    return html
54
55
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Return the top post followed by every reply div, in page order."""
    body = html.body
    assert body
    # The opening post lives in a "post-post" div after the body start.
    top_post = body.find_next("div", class_="post-post")
    assert isinstance(top_post, Tag)
    reply_divs = html.find_all("div", class_="post-reply")
    assert all(isinstance(r, Tag) for r in reply_divs)
    return itertools.chain([top_post], reply_divs)
70
71
def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[str],
) -> None:
    """Fetch and clean the glowfic at *url*, writing output to *texout*.

    Output generation is not implemented yet; a placeholder is written.
    """
    page = fetch(url, session, timeout)
    cleaned = clean(page)
    replies(cleaned)
    print("soon", file=texout)
47cfa3cd
SW
81
82
def main() -> None:
    """Entry point: parse arguments, then fetch and render the glowfic."""
    args = command_line_parser().parse_args()
    session = requests_cache.CachedSession(args.cache_path, cache_control=True)
    with session, open(args.out + '.tex', 'w', encoding='UTF-8') as texout:
        process(args.url, session, args.timeout, texout)
92b11a10
SW
88
89
# Run as a script: delegate everything to main().
if __name__ == '__main__':
    main()