# paperdoorknob: Print glowfic
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, version 3.

from argparse import ArgumentParser
import itertools
import os.path
import subprocess
from typing import IO, Iterable

from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import requests_cache
from xdg_base_dirs import xdg_cache_home


def command_line_parser() -> ArgumentParser:
    parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic')
    parser.add_argument(
        '--cache_path',
        metavar='PATH',
        help='Where to keep the http cache (instead of %(default)s)',
        default=os.path.join(xdg_cache_home(), "paperdoorknob"))
    parser.add_argument(
        '--out',
        help='The filename stem at which to write output ' +
             '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)',
        default='book')
    parser.add_argument('--pandoc', help='Location of the pandoc executable')
    parser.add_argument(
        '--timeout',
        type=int,
        help='How long to wait for HTTP requests, in seconds',
        default=30)
    parser.add_argument('url', help='URL to retrieve')
    return parser


def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup:
    """Fetch the thread at `url` and parse it into a soup."""
    with session.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')


def clean(html: BeautifulSoup) -> BeautifulSoup:
    """Strip interactive page furniture that should not appear in print."""
    for eb in html.find_all("div", class_="post-edit-box"):
        eb.decompose()
    for footer in html.find_all("div", class_="post-footer"):
        footer.decompose()
    return html


def replies(html: BeautifulSoup) -> Iterable[Tag]:
    """Yield the opening post followed by every reply, in page order."""
    def text() -> Tag:
        body = html.body
        assert body
        text = body.find_next("div", class_="post-post")
        assert isinstance(text, Tag)
        return text

    def the_replies() -> Iterable[Tag]:
        rs = html.find_all("div", class_="post-reply")
        assert all(isinstance(r, Tag) for r in rs)
        return rs

    return itertools.chain([text()], the_replies())


def html_to_tex(pandoc: str, tag: Tag) -> bytes:
    """Convert one post's HTML to LaTeX by piping it through pandoc."""
    return subprocess.run([pandoc, '--from=html', '--to=latex'],
                          input=tag.encode(),
                          stdout=subprocess.PIPE,
                          check=True).stdout


def process(
        url: str,
        session: requests.Session,
        timeout: int,
        texout: IO[bytes],
        pandoc: str,
) -> None:
    """Write a complete LaTeX document for the thread at `url` to `texout`."""
    texout.write(b'\\documentclass{article}\n\\begin{document}\n')
    html = clean(fetch(url, session, timeout))
    for r in replies(html):
        texout.write(html_to_tex(pandoc, r))
    texout.write(b'\\end{document}\n')


def main() -> None:
    args = command_line_parser().parse_args()
    with requests_cache.CachedSession(args.cache_path, cache_control=True) as session:
        with open(args.out + '.tex', 'wb') as texout:
            process(
                args.url,
                session,
                args.timeout,
                texout,
                args.pandoc or 'pandoc')


if __name__ == '__main__':
    main()
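
# Example usage (a sketch only: the post URL and output stem below are
# illustrative, and this assumes the script is saved as paperdoorknob.py with
# pandoc and a LaTeX toolchain such as pdflatex available on PATH):
#
#   python paperdoorknob.py --out book https://glowfic.com/posts/12345
#   pdflatex book.tex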