git.scottworley.com Git - paperdoorknob/blob - paperdoorknob.py

   1 # paperdoorknob: Print glowfic
   2 #
   3 # This program is free software: you can redistribute it and/or modify it
   4 # under the terms of the GNU General Public License as published by the
   5 # Free Software Foundation, version 3.
   6
   7
   8 from argparse import ArgumentParser
   9 import itertools
  10 import os.path
  11 import subprocess
  12
  13 from typing import IO, Iterable
  14
  15 from bs4 import BeautifulSoup
  16 from bs4.element import Tag
  17 import requests
  18 import requests_cache
  19 from xdg_base_dirs import xdg_cache_home
  20
  21
  22 def command_line_parser() -> ArgumentParser:
  23     parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
  24     parser.add_argument(
  25         '--cache_path',
  26         metavar='PATH',
  27         help='Where to keep the http cache (instead of %(default)s)',
  28         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
  29     parser.add_argument(
  30         '--out',
  31         help='The filename stem at which to write output ' +
  32              '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)',
  33         default='book')
  34     parser.add_argument('--pandoc', help='Location of the pandoc executable')
  35     parser.add_argument(
  36         '--timeout',
  37         help='How long to wait for HTTP requests, in seconds',
  38         default=30)
  39     parser.add_argument('url', help='URL to retrieve')
  40     return parser
  41
  42
  43 def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
  44     with session.get(url, timeout=timeout) as r:
  45         r.raise_for_status()
  46         return BeautifulSoup(r.text, 'html.parser')
  47
  48
  49 def clean(html: BeautifulSoup) -> BeautifulSoup:
  50     for eb in html.find_all("div", class_="post-edit-box"):
  51         eb.decompose()
  52     for footer in html.find_all("div", class_="post-footer"):
  53         footer.decompose()
  54     return html
  55
  56
  57 def replies(html: BeautifulSoup) -> Iterable[Tag]:
  58     def text() -> Tag:
  59         body = html.body
  60         assert body
  61         text = body.find_next("div", class_="post-post")
  62         assert isinstance(text, Tag)
  63         return text
  64
  65     def the_replies() -> Iterable[Tag]:
  66         rs = html.find_all("div", class_="post-reply")
  67         assert all(isinstance(r, Tag) for r in rs)
  68         return rs
  69
  70     return itertools.chain([text()], the_replies())
  71
  72
  73 def html_to_tex(pandoc: str, tag: Tag) -> bytes:
  74     return subprocess.run([pandoc, '--from=html', '--to=latex'],
  75                           input=tag.encode(),
  76                           stdout=subprocess.PIPE,
  77                           check=True).stdout
  78
  79
  80 def process(
  81         url: str,
  82         session: requests.Session,
  83         timeout: int,
  84         texout: IO[bytes],
  85         pandoc: str,
  86 ) -> None:
  87     html = clean(fetch(url, session, timeout))
  88     for r in replies(html):
  89         texout.write(html_to_tex(pandoc, r))
  90
  91
  92 def main() -> None:
  93     args = command_line_parser().parse_args()
  94     with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
  95         with open(args.out + '.tex', 'wb') as texout:
  96             process(
  97                 args.url,
  98                 session,
  99                 args.timeout,
 100                 texout,
 101                 args.pandoc or 'pandoc')
 102
 103
 104 if __name__ == '__main__':
 105     main()