]>
git.scottworley.com Git - paperdoorknob/blob - paperdoorknob.py
31ce02c494ea29aadc8794d75f11ee8397729a2c
1 # paperdoorknob: Print glowfic
3 # This program is free software: you can redistribute it and/or modify it
4 # under the terms of the GNU General Public License as published by the
5 # Free Software Foundation, version 3.
8 from argparse
import ArgumentParser
13 from typing
import IO
, Iterable
15 from bs4
import BeautifulSoup
16 from bs4
.element
import Tag
17 from xdg_base_dirs
import xdg_cache_home
19 from fetch
import CachingFetcher
, Fetcher
# Build the command-line parser for the `paperdoorknob` program.
#
# Visible here: an ArgumentParser (prog 'paperdoorknob', description
# 'Print glowfic'), an optional `--pandoc` argument, a positional `url`
# argument, and help/default fragments for three further arguments
# (http cache location, output filename stem, HTTP request timeout).
#
# NOTE(review): this listing is a web-page extraction in which every source
# line carries its original line number and is split across several lines.
# The embedded numbers skip 24-26, 29-30, 33, 35-36, 38 and 40, so the
# add_argument(...) openings for the help-only fragments, any remaining
# defaults, and the return statement are not visible here. Restore them from
# the upstream file rather than from this text.
22 def command_line_parser() -> ArgumentParser
:
23 parser
= ArgumentParser(prog
='paperdoorknob', description
='Print glowfic')
27 help='Where to keep the http cache (instead of %(default)s)',
28 default
=os
.path
.join(xdg_cache_home(), "paperdoorknob"))
31 help='The filename stem at which to write output ' +
32 '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
34 parser
.add_argument('--pandoc', help='Location of the pandoc executable')
37 help='How long to wait for HTTP requests, in seconds',
39 parser
.add_argument('url', help='URL to retrieve')
def parse(content: bytes) -> BeautifulSoup:
    """Parse raw HTML bytes into a BeautifulSoup document tree.

    The document is built with the 'html.parser' backend.
    """
    soup = BeautifulSoup(markup=content, features='html.parser')
    return soup
# Clean a parsed page: visits every <div class="post-edit-box"> and every
# <div class="post-footer"> found in `html`.
#
# NOTE(review): the bodies of both loops and the function's return statement
# are missing from this extraction (embedded line numbers 49, 51 and 52 are
# absent). Presumably the matched elements are removed and `html` is
# returned -- confirm against the upstream file.
47 def clean(html
: BeautifulSoup
) -> BeautifulSoup
:
48 for eb
in html
.find_all("div", class_
="post-edit-box"):
50 for footer
in html
.find_all("div", class_
="post-footer"):
# Yield the content elements of a page: the result of text() first, then
# everything produced by the_replies(), chained with itertools.chain.
#
# Visible pieces:
#   * the first element is the next <div class="post-post"> found after
#     `body`, asserted to be a Tag;
#   * the_replies() collects every <div class="post-reply"> in `html` and
#     asserts each one is a Tag.
#
# NOTE(review): embedded line numbers 56-58, 61-62 and 66-67 are absent from
# this extraction, so the definition of `body`, the header of the nested
# text() helper, and the return statements of both helpers are not visible.
# Confirm against the upstream file.
55 def replies(html
: BeautifulSoup
) -> Iterable
[Tag
]:
59 text
= body
.find_next("div", class_
="post-post")
60 assert isinstance(text
, Tag
)
63 def the_replies() -> Iterable
[Tag
]:
64 rs
= html
.find_all("div", class_
="post-reply")
65 assert all(isinstance(r
, Tag
) for r
in rs
)
68 return itertools
.chain([text()], the_replies())
# Convert one HTML element to LaTeX by running the `pandoc` executable with
# '--from=html' and '--to=latex', capturing its standard output through a
# pipe.
#
# NOTE(review): embedded line numbers 73 and 75 are absent from this
# extraction, so how `tag` is passed to pandoc and how the bytes result is
# taken from the completed process are not visible. Confirm against the
# upstream file.
71 def html_to_tex(pandoc
: str, tag
: Tag
) -> bytes:
72 return subprocess
.run([pandoc
, '--from=html', '--to=latex'],
74 stdout
=subprocess
.PIPE
,
# Body of the routine that writes the LaTeX document: emits the
# \documentclass/\begin{document} preamble, fetches `url` through `fetcher`,
# parses and cleans the page, writes the pandoc conversion of every element
# from replies(), then emits \end{document}. All writes are bytes.
#
# NOTE(review): the enclosing `def` line is missing from this extraction
# (embedded line numbers 75-82 are absent). The names used here (url,
# fetcher, texout, pandoc) line up with the process(...) call made later in
# this file, so this is presumably the body of process() -- confirm against
# the upstream file.
83 texout
.write(b
'\\documentclass{article}\n\\begin{document}\n')
84 html
= clean(parse(fetcher
.fetch(url
)))
85 for r
in replies(html
):
86 texout
.write(html_to_tex(pandoc
, r
))
87 texout
.write(b
'\\end{document}\n')
# Program driver: parses the command line, opens a CachingFetcher configured
# with the cache path and timeout, opens '<out>.tex' for binary writing, and
# hands the URL, fetcher, output file and pandoc location to process().
# When --pandoc is not given (falsy), the literal 'pandoc' is used instead.
#
# NOTE(review): the enclosing `def` line is missing from this extraction
# (embedded line numbers 88-90 are absent); presumably this is the body of
# main() -- confirm against the upstream file.
91 args
= command_line_parser().parse_args()
92 with CachingFetcher(args
.cache_path
, args
.timeout
) as fetcher
:
93 with open(args
.out
+ '.tex', 'wb') as texout
:
94 process(args
.url
, fetcher
, texout
, args
.pandoc
or 'pandoc')
# Script entry guard.
# NOTE(review): the guarded statement (embedded line 98) is missing from this
# extraction.
97 if __name__
== '__main__':