]> git.scottworley.com Git - paperdoorknob/blame - paperdoorknob.py
Rename: fetch → parse
[paperdoorknob] / paperdoorknob.py
CommitLineData
92b11a10
SW
1# paperdoorknob: Print glowfic
2#
3# This program is free software: you can redistribute it and/or modify it
4# under the terms of the GNU General Public License as published by the
5# Free Software Foundation, version 3.
6
7
8from argparse import ArgumentParser
55958ec0 9import itertools
ba3b7c52 10import os.path
36ae1d5f 11import subprocess
a0d30541 12
a2d42468 13from typing import IO, Iterable
a0d30541 14
136277e3 15from bs4 import BeautifulSoup
6409066b 16from bs4.element import Tag
ba3b7c52 17from xdg_base_dirs import xdg_cache_home
92b11a10 18
a64403ac
SW
19from fetch import CachingFetcher, Fetcher
20
92b11a10
SW
21
def command_line_parser() -> ArgumentParser:
    """Build the command line parser for paperdoorknob.

    Returns:
        An ArgumentParser that accepts the options --cache_path, --out,
        --pandoc and --timeout, plus one positional argument, url.
    """
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        # main() appends '.tex' to this stem, so the example says .tex.
        help='The filename stem at which to write output ' +
        '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        # Without a type, a value given on the command line would stay a
        # str while the default is a number; convert so both are numeric.
        type=float,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser
41
42
bf06f467
SW
def parse(content: bytes) -> BeautifulSoup:
    """Parse raw page bytes into a BeautifulSoup tree.

    Args:
        content: The bytes of an HTML document.

    Returns:
        The document parsed with the 'html.parser' parser.
    """
    soup = BeautifulSoup(content, features='html.parser')
    return soup
b25a2f90
SW
45
46
47cfa3cd
SW
def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Strip edit boxes and post footers out of a parsed page.

    Every div whose class is "post-edit-box" is removed first, then every
    div whose class is "post-footer".  The tree is modified in place.

    Args:
        html: The parsed page to clean.

    Returns:
        The same object that was passed in.
    """
    for unwanted_class in ("post-edit-box", "post-footer"):
        for div in html.find_all("div", class_=unwanted_class):
            div.decompose()
    return html
53
54
def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Return the opening post followed by all replies on the page.

    Args:
        html: The parsed page.

    Returns:
        An iterable whose first item is the "post-post" div located with
        find_next from the document body, followed by every "post-reply"
        div in the document.

    Raises:
        AssertionError: If the page has no body, if no "post-post" div is
            found, or if any result is not a Tag.
    """
    page_body = html.body
    assert page_body
    opening_post = page_body.find_next("div", class_="post-post")
    assert isinstance(opening_post, Tag)

    reply_divs = html.find_all("div", class_="post-reply")
    assert all(isinstance(reply, Tag) for reply in reply_divs)

    return itertools.chain([opening_post], reply_divs)
69
70
36ae1d5f
SW
def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    """Convert one HTML tag to LaTeX by running pandoc on it.

    Args:
        pandoc: The pandoc executable to run.
        tag: The HTML element to convert; its encoded form is fed to
            pandoc on standard input.

    Returns:
        The bytes pandoc wrote to standard output.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero
            status.
    """
    command = [pandoc, '--from=html', '--to=latex']
    completed = subprocess.run(
        command,
        input=tag.encode(),
        stdout=subprocess.PIPE,
        check=True)
    return completed.stdout
76
77
47cfa3cd
SW
def process(
        url: str,
        fetcher: Fetcher,
        texout: IO[bytes],
        pandoc: str) -> None:
    """Fetch a page and write it to texout as a LaTeX document.

    The document preamble is written first, then the page is fetched,
    parsed and cleaned, each post is converted with pandoc and written
    in order, and finally the document is closed.

    Args:
        url: The URL to retrieve.
        fetcher: The object whose fetch method retrieves the page.
        texout: The binary stream that receives the LaTeX output.
        pandoc: The pandoc executable to run for each post.
    """
    texout.write(b'\\documentclass{article}\n\\begin{document}\n')
    page = fetcher.fetch(url)
    cleaned = clean(parse(page))
    for post in replies(cleaned):
        latex = html_to_tex(pandoc, post)
        texout.write(latex)
    texout.write(b'\\end{document}\n')
47cfa3cd
SW
88
89
def main() -> None:
    """Run paperdoorknob using the arguments given on the command line.

    Opens a CachingFetcher and the output file named by the --out stem
    plus '.tex', then hands both to process().  Falls back to the name
    'pandoc' when no --pandoc option was given.
    """
    args = command_line_parser().parse_args()
    tex_path = args.out + '.tex'
    pandoc = args.pandoc or 'pandoc'
    with CachingFetcher(args.cache_path, args.timeout) as fetcher, \
            open(tex_path, 'wb') as texout:
        process(args.url, fetcher, texout, pandoc)
92b11a10
SW
95
96
# Only run the program when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()