From b25a2f90428a1e7fda03c77fcee8ed0fa3b22555 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 11:56:31 -0800 Subject: [PATCH 001/100] Fetch URLs --- README.md | 1 + default.nix | 6 ++++-- paperdoorknob.py | 14 +++++++++++++- paperdoorknob_test.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 paperdoorknob_test.py diff --git a/README.md b/README.md index 7746516..9782d63 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ A polite glowfic → printable book scraper. +* Retrieves glowfic ### Alternatives diff --git a/default.nix b/default.nix index f73efb5..affb068 100644 --- a/default.nix +++ b/default.nix @@ -1,11 +1,13 @@ { pkgs ? import { }, lint ? false }: pkgs.python3Packages.callPackage -({ lib, buildPythonPackage, autopep8, mypy, pylint, }: +({ lib, buildPythonPackage, autopep8, mypy, pylint, requests, types-requests }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; - nativeCheckInputs = [ mypy ] ++ lib.optionals lint [ autopep8 pylint ]; + propagatedBuildInputs = [ requests ]; + nativeCheckInputs = [ mypy types-requests ] + ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; meta = { diff --git a/paperdoorknob.py b/paperdoorknob.py index 8f1d840..cd93e4e 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -6,15 +6,27 @@ from argparse import ArgumentParser +import requests def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') + parser.add_argument( + '--timeout', + help='How long to wait for HTTP requests, in seconds', + default=30) + parser.add_argument('url', help='URL to retrieve') return parser +def fetch(url: str, timeout: int) -> None: + r = requests.get(url, timeout=timeout) + r.raise_for_status() + + def main() -> None: - command_line_parser().parse_args() + args = command_line_parser().parse_args() + 
fetch(args.url, args.timeout) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py new file mode 100644 index 0000000..c19e4db --- /dev/null +++ b/paperdoorknob_test.py @@ -0,0 +1,42 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest +import threading +from http.server import BaseHTTPRequestHandler, HTTPServer +import paperdoorknob + +TEST_PORT = 8080 +TIMEOUT = 8 + + +class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): + + def do_GET(self) -> None: + body = b'This is glowfic' + self.send_response(200) + self.send_header("Content-type", "text/html") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class TestFetch(unittest.TestCase): + def setUp(self) -> None: + web_server = HTTPServer(('', TEST_PORT), FakeGlowficHTTPRequestHandler) + threading.Thread(target=web_server.serve_forever).start() + self._stop_server = web_server.shutdown + + def tearDown(self) -> None: + self._stop_server() + + def testFetch(self) -> None: + paperdoorknob.fetch(f"http://localhost:{TEST_PORT}", TIMEOUT) + + +if __name__ == '__main__': + unittest.main() -- 2.44.1 From 680f7d1094d76ad090ce1f2c6ae3e14ca5d15677 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:21:32 -0800 Subject: [PATCH 002/100] fetch: test: Choose port dynamically --- paperdoorknob_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index c19e4db..2bd3544 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -10,7 +10,6 @@ import threading from http.server import BaseHTTPRequestHandler, HTTPServer import paperdoorknob -TEST_PORT = 8080 TIMEOUT = 8 @@ -27,7 +26,8 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): class 
TestFetch(unittest.TestCase): def setUp(self) -> None: - web_server = HTTPServer(('', TEST_PORT), FakeGlowficHTTPRequestHandler) + web_server = HTTPServer(('', 0), FakeGlowficHTTPRequestHandler) + self._port = web_server.socket.getsockname()[1] threading.Thread(target=web_server.serve_forever).start() self._stop_server = web_server.shutdown @@ -35,7 +35,7 @@ class TestFetch(unittest.TestCase): self._stop_server() def testFetch(self) -> None: - paperdoorknob.fetch(f"http://localhost:{TEST_PORT}", TIMEOUT) + paperdoorknob.fetch(f"http://localhost:{self._port}", TIMEOUT) if __name__ == '__main__': -- 2.44.1 From 6fdb8f01252a4fa9e033af60bc79a1c8fe036558 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:09:10 -0800 Subject: [PATCH 003/100] fetch: Verify error handling --- paperdoorknob_test.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 2bd3544..fafd1c3 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -8,6 +8,7 @@ import unittest import threading from http.server import BaseHTTPRequestHandler, HTTPServer +import requests import paperdoorknob TIMEOUT = 8 @@ -15,9 +16,16 @@ TIMEOUT = 8 class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): + def _response_code(self) -> int: + if self.path == "/not_found": + return 404 + if self.path == "/server_error": + return 500 + return 200 + def do_GET(self) -> None: body = b'This is glowfic' - self.send_response(200) + self.send_response(self._response_code()) self.send_header("Content-type", "text/html") self.send_header("Content-Length", str(len(body))) self.end_headers() @@ -37,6 +45,14 @@ class TestFetch(unittest.TestCase): def testFetch(self) -> None: paperdoorknob.fetch(f"http://localhost:{self._port}", TIMEOUT) + def testFetchErrors(self) -> None: + with self.assertRaises(requests.HTTPError): + paperdoorknob.fetch( + f"http://localhost:{self._port}/not_found", TIMEOUT) + with 
self.assertRaises(requests.HTTPError): + paperdoorknob.fetch( + f"http://localhost:{self._port}/server_error", TIMEOUT) + if __name__ == '__main__': unittest.main() -- 2.44.1 From 21723d3dc64cdfd14119315cd5e9e0f55b175cc8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:31:24 -0800 Subject: [PATCH 004/100] fetch: Try closing connections more aggressively --- paperdoorknob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index cd93e4e..70f3af5 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -20,8 +20,8 @@ def command_line_parser() -> ArgumentParser: def fetch(url: str, timeout: int) -> None: - r = requests.get(url, timeout=timeout) - r.raise_for_status() + with requests.get(url, timeout=timeout) as r: + r.raise_for_status() def main() -> None: -- 2.44.1 From e3ba205699db054723578a32d1bf73276b0e58ad Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:39:13 -0800 Subject: [PATCH 005/100] fetch: test: Explicitly join webserver thread --- paperdoorknob_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index fafd1c3..b1d41d4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -36,11 +36,13 @@ class TestFetch(unittest.TestCase): def setUp(self) -> None: web_server = HTTPServer(('', 0), FakeGlowficHTTPRequestHandler) self._port = web_server.socket.getsockname()[1] - threading.Thread(target=web_server.serve_forever).start() + self._thread = threading.Thread(target=web_server.serve_forever) + self._thread.start() self._stop_server = web_server.shutdown def tearDown(self) -> None: self._stop_server() + self._thread.join() def testFetch(self) -> None: paperdoorknob.fetch(f"http://localhost:{self._port}", TIMEOUT) -- 2.44.1 From 681380e2377aba038aaa22e1992dea54d6697a70 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:49:40 -0800 Subject: [PATCH 006/100] fetch: test: 
Hold reference to webserver --- paperdoorknob_test.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index b1d41d4..cf945d6 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -33,27 +33,30 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): class TestFetch(unittest.TestCase): + def _port(self) -> int: + port = self._web_server.socket.getsockname()[1] + assert isinstance(port, int) + return port + def setUp(self) -> None: - web_server = HTTPServer(('', 0), FakeGlowficHTTPRequestHandler) - self._port = web_server.socket.getsockname()[1] - self._thread = threading.Thread(target=web_server.serve_forever) + self._web_server = HTTPServer(('', 0), FakeGlowficHTTPRequestHandler) + self._thread = threading.Thread(target=self._web_server.serve_forever) self._thread.start() - self._stop_server = web_server.shutdown def tearDown(self) -> None: - self._stop_server() + self._web_server.shutdown() self._thread.join() def testFetch(self) -> None: - paperdoorknob.fetch(f"http://localhost:{self._port}", TIMEOUT) + paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) def testFetchErrors(self) -> None: with self.assertRaises(requests.HTTPError): paperdoorknob.fetch( - f"http://localhost:{self._port}/not_found", TIMEOUT) + f"http://localhost:{self._port()}/not_found", TIMEOUT) with self.assertRaises(requests.HTTPError): paperdoorknob.fetch( - f"http://localhost:{self._port}/server_error", TIMEOUT) + f"http://localhost:{self._port()}/server_error", TIMEOUT) if __name__ == '__main__': -- 2.44.1 From 971e665870ae3d1f38e270e716364dd2ff3dd5a2 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 12:54:43 -0800 Subject: [PATCH 007/100] fetch: test: Explicitly close webserver This fixes "unclosed None: self._web_server.shutdown() self._thread.join() + self._web_server.server_close() def testFetch(self) -> None: 
paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) -- 2.44.1 From 17bd16e89158461d82af7cab2a99a36063ff46b1 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 13:09:18 -0800 Subject: [PATCH 008/100] fetch: test: Count requests --- paperdoorknob_test.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index c2fce2b..400570f 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -16,6 +16,9 @@ TIMEOUT = 8 class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): + def _notify_test(self) -> None: + raise NotImplementedError() + def _response_code(self) -> int: if self.path == "/not_found": return 404 @@ -30,6 +33,7 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): self.send_header("Content-Length", str(len(body))) self.end_headers() self.wfile.write(body) + self._notify_test() class TestFetch(unittest.TestCase): @@ -38,8 +42,14 @@ class TestFetch(unittest.TestCase): assert isinstance(port, int) return port + def _count_request(self) -> None: + self._request_counter += 1 + def setUp(self) -> None: - self._web_server = HTTPServer(('', 0), FakeGlowficHTTPRequestHandler) + self._request_counter = 0 + handler = type("Handler", (FakeGlowficHTTPRequestHandler,), { + '_notify_test': lambda _: self._count_request()}) + self._web_server = HTTPServer(('', 0), handler) self._thread = threading.Thread(target=self._web_server.serve_forever) self._thread.start() @@ -50,6 +60,9 @@ class TestFetch(unittest.TestCase): def testFetch(self) -> None: paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) + self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) + self.assertEqual(self._request_counter, 2) def testFetchErrors(self) -> None: with self.assertRaises(requests.HTTPError): -- 2.44.1 From b03ad5dc018cc85fd5905011d74de1cc4c234cd9 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 
23 Nov 2023 13:11:59 -0800 Subject: [PATCH 009/100] fetch: Use a session --- paperdoorknob.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 70f3af5..c684739 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -20,8 +20,9 @@ def command_line_parser() -> ArgumentParser: def fetch(url: str, timeout: int) -> None: - with requests.get(url, timeout=timeout) as r: - r.raise_for_status() + with requests.session() as s: + with s.get(url, timeout=timeout) as r: + r.raise_for_status() def main() -> None: -- 2.44.1 From e138a9b49da14f16c1de2c02dd928fd5d16aed52 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 13:18:27 -0800 Subject: [PATCH 010/100] fetch: Multiple fetches per session --- paperdoorknob.py | 10 +++++----- paperdoorknob_test.py | 22 ++++++++++++---------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index c684739..b4f5e52 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -19,15 +19,15 @@ def command_line_parser() -> ArgumentParser: return parser -def fetch(url: str, timeout: int) -> None: - with requests.session() as s: - with s.get(url, timeout=timeout) as r: - r.raise_for_status() +def fetch(url: str, session: requests.Session, timeout: int) -> None: + with session.get(url, timeout=timeout) as r: + r.raise_for_status() def main() -> None: args = command_line_parser().parse_args() - fetch(args.url, args.timeout) + with requests.session() as session: + fetch(args.url, session, args.timeout) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 400570f..88897f4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -59,18 +59,20 @@ class TestFetch(unittest.TestCase): self._web_server.server_close() def testFetch(self) -> None: - paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) - self.assertEqual(self._request_counter, 1) - 
paperdoorknob.fetch(f"http://localhost:{self._port()}", TIMEOUT) - self.assertEqual(self._request_counter, 2) + with requests.session() as s: + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 2) def testFetchErrors(self) -> None: - with self.assertRaises(requests.HTTPError): - paperdoorknob.fetch( - f"http://localhost:{self._port()}/not_found", TIMEOUT) - with self.assertRaises(requests.HTTPError): - paperdoorknob.fetch( - f"http://localhost:{self._port()}/server_error", TIMEOUT) + with requests.session() as s: + with self.assertRaises(requests.HTTPError): + paperdoorknob.fetch( + f"http://localhost:{self._port()}/not_found", s, TIMEOUT) + with self.assertRaises(requests.HTTPError): + paperdoorknob.fetch( + f"http://localhost:{self._port()}/server_error", s, TIMEOUT) if __name__ == '__main__': -- 2.44.1 From b34a368f048581720738d2ea0b99504a98b3c64d Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 13:27:38 -0800 Subject: [PATCH 011/100] fetch: Cache --- default.nix | 6 +++--- paperdoorknob.py | 3 ++- paperdoorknob_test.py | 8 ++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/default.nix b/default.nix index affb068..6331754 100644 --- a/default.nix +++ b/default.nix @@ -1,11 +1,11 @@ { pkgs ? import { }, lint ? 
false }: -pkgs.python3Packages.callPackage -({ lib, buildPythonPackage, autopep8, mypy, pylint, requests, types-requests }: +pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, autopep8, mypy + , pylint, requests, requests-cache, types-requests }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; - propagatedBuildInputs = [ requests ]; + propagatedBuildInputs = [ requests requests-cache ]; nativeCheckInputs = [ mypy types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; diff --git a/paperdoorknob.py b/paperdoorknob.py index b4f5e52..c1117c2 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,6 +7,7 @@ from argparse import ArgumentParser import requests +import requests_cache def command_line_parser() -> ArgumentParser: @@ -26,7 +27,7 @@ def fetch(url: str, session: requests.Session, timeout: int) -> None: def main() -> None: args = command_line_parser().parse_args() - with requests.session() as session: + with requests_cache.CachedSession() as session: fetch(args.url, session, args.timeout) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 88897f4..e966a6b 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -9,6 +9,7 @@ import unittest import threading from http.server import BaseHTTPRequestHandler, HTTPServer import requests +import requests_cache import paperdoorknob TIMEOUT = 8 @@ -65,6 +66,13 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 2) + def testFetchCaching(self) -> None: + with requests_cache.CachedSession() as s: + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 1) + def testFetchErrors(self) -> None: with requests.session() as s: with 
self.assertRaises(requests.HTTPError): -- 2.44.1 From de7251fc8787f9634643d6d4e05cb3233682c34c Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 13:28:50 -0800 Subject: [PATCH 012/100] fetch: Verify caching across sessions --- README.md | 1 + paperdoorknob_test.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 9782d63..e236b64 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ A polite glowfic → printable book scraper. * Retrieves glowfic +* Minimizes server load with local caching ### Alternatives diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index e966a6b..cb70ac5 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -73,6 +73,14 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 1) + def testFetchPersistentCaching(self) -> None: + with requests_cache.CachedSession() as s: + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 1) + with requests_cache.CachedSession() as s: + paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual(self._request_counter, 1) + def testFetchErrors(self) -> None: with requests.session() as s: with self.assertRaises(requests.HTTPError): -- 2.44.1 From ba3b7c52144a19a83892638415d2bfdb667888f5 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 13:57:49 -0800 Subject: [PATCH 013/100] fetch: cache: Keep cache in XDG_CACHE_HOME (eg: ~/.cache) --- default.nix | 4 ++-- paperdoorknob.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/default.nix b/default.nix index 6331754..69a7149 100644 --- a/default.nix +++ b/default.nix @@ -1,11 +1,11 @@ { pkgs ? import { }, lint ? 
false }: pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, autopep8, mypy - , pylint, requests, requests-cache, types-requests }: + , pylint, requests, requests-cache, types-requests, xdg-base-dirs }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; - propagatedBuildInputs = [ requests requests-cache ]; + propagatedBuildInputs = [ requests requests-cache xdg-base-dirs ]; nativeCheckInputs = [ mypy types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; diff --git a/paperdoorknob.py b/paperdoorknob.py index c1117c2..3aae538 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -6,12 +6,19 @@ from argparse import ArgumentParser +import os.path import requests import requests_cache +from xdg_base_dirs import xdg_cache_home def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') + parser.add_argument( + '--cache_path', + metavar='PATH', + help='Where to keep the http cache (instead of %(default)s)', + default=os.path.join(xdg_cache_home(), "paperdoorknob")) parser.add_argument( '--timeout', help='How long to wait for HTTP requests, in seconds', @@ -27,7 +34,7 @@ def fetch(url: str, session: requests.Session, timeout: int) -> None: def main() -> None: args = command_line_parser().parse_args() - with requests_cache.CachedSession() as session: + with requests_cache.CachedSession(args.cache_path) as session: fetch(args.url, session, args.timeout) -- 2.44.1 From 4c1cf54e0fa588eda9ebee0aa48d1b56c0c3a37f Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 14:13:05 -0800 Subject: [PATCH 014/100] fetch: Honor Cache-Control headers --- paperdoorknob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 3aae538..b7e4349 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -34,7 +34,7 @@ def fetch(url: str, session: requests.Session, timeout: int) -> None: def 
main() -> None: args = command_line_parser().parse_args() - with requests_cache.CachedSession(args.cache_path) as session: + with requests_cache.CachedSession(args.cache_path, cache_control=True) as session: fetch(args.url, session, args.timeout) -- 2.44.1 From 136277e30143cd1219c896bb4980027ac3c6dbe1 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 18:34:52 -0800 Subject: [PATCH 015/100] Parse HTML --- default.nix | 8 +++++--- paperdoorknob.py | 4 +++- paperdoorknob_test.py | 8 ++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/default.nix b/default.nix index 69a7149..bf96eb8 100644 --- a/default.nix +++ b/default.nix @@ -1,12 +1,14 @@ { pkgs ? import { }, lint ? false }: pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, autopep8, mypy - , pylint, requests, requests-cache, types-requests, xdg-base-dirs }: + , pylint, beautifulsoup4, requests, requests-cache, types-beautifulsoup4 + , types-requests, xdg-base-dirs }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; - propagatedBuildInputs = [ requests requests-cache xdg-base-dirs ]; - nativeCheckInputs = [ mypy types-requests ] + propagatedBuildInputs = + [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; + nativeCheckInputs = [ mypy types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; diff --git a/paperdoorknob.py b/paperdoorknob.py index b7e4349..bb8bdd1 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,6 +7,7 @@ from argparse import ArgumentParser import os.path +from bs4 import BeautifulSoup import requests import requests_cache from xdg_base_dirs import xdg_cache_home @@ -27,9 +28,10 @@ def command_line_parser() -> ArgumentParser: return parser -def fetch(url: str, session: requests.Session, timeout: int) -> None: +def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup: with session.get(url, timeout=timeout) 
as r: r.raise_for_status() + return BeautifulSoup(r.text, 'html.parser') def main() -> None: diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index cb70ac5..ab13eed 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -81,6 +81,14 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 1) + def testFetchConents(self) -> None: + with requests.session() as s: + doc = paperdoorknob.fetch( + f"http://localhost:{self._port()}", s, TIMEOUT) + body = doc.body + assert body + self.assertEqual(body.text, "This is glowfic") + def testFetchErrors(self) -> None: with requests.session() as s: with self.assertRaises(requests.HTTPError): -- 2.44.1 From 6409066b6abbe205fa64f844a01516f3f3de2553 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 19:26:53 -0800 Subject: [PATCH 016/100] Begin parsing glowfic html --- paperdoorknob.py | 16 +++++++++++++++- paperdoorknob_test.py | 16 ++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index bb8bdd1..b88c02d 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -8,11 +8,24 @@ from argparse import ArgumentParser import os.path from bs4 import BeautifulSoup +from bs4.element import Tag import requests import requests_cache from xdg_base_dirs import xdg_cache_home +class Post: + def __init__(self, html: BeautifulSoup) -> None: + self._html = html + + def text(self) -> Tag: + body = self._html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') parser.add_argument( @@ -37,7 +50,8 @@ def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup: def main() -> None: args = command_line_parser().parse_args() with 
requests_cache.CachedSession(args.cache_path, cache_control=True) as session: - fetch(args.url, session, args.timeout) + html = fetch(args.url, session, args.timeout) + Post(html) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index ab13eed..83cf76a 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -28,7 +28,13 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): return 200 def do_GET(self) -> None: - body = b'This is glowfic' + body = b''' + +
+ This is glowfic +
+ +''' self.send_response(self._response_code()) self.send_header("Content-type", "text/html") self.send_header("Content-Length", str(len(body))) @@ -83,11 +89,9 @@ class TestFetch(unittest.TestCase): def testFetchConents(self) -> None: with requests.session() as s: - doc = paperdoorknob.fetch( - f"http://localhost:{self._port()}", s, TIMEOUT) - body = doc.body - assert body - self.assertEqual(body.text, "This is glowfic") + post = paperdoorknob.Post(paperdoorknob.fetch( + f"http://localhost:{self._port()}", s, TIMEOUT)) + self.assertEqual(post.text().text.strip(), "This is glowfic") def testFetchErrors(self) -> None: with requests.session() as s: -- 2.44.1 From a0d30541ee9349b8c62200c9aa6d4dbb2873fccb Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 19:32:52 -0800 Subject: [PATCH 017/100] Replies --- paperdoorknob.py | 8 ++++++++ paperdoorknob_test.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/paperdoorknob.py b/paperdoorknob.py index b88c02d..0c6ec85 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,6 +7,9 @@ from argparse import ArgumentParser import os.path + +from typing import Iterable + from bs4 import BeautifulSoup from bs4.element import Tag import requests @@ -25,6 +28,11 @@ class Post: assert isinstance(text, Tag) return text + def replies(self) -> Iterable[Tag]: + replies = self._html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in replies) + return replies + def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 83cf76a..7c48607 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -33,6 +33,14 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler):
This is glowfic
+
+
+ You sure? +
+
+ Pretty sure. +
+
''' self.send_response(self._response_code()) @@ -92,6 +100,8 @@ class TestFetch(unittest.TestCase): post = paperdoorknob.Post(paperdoorknob.fetch( f"http://localhost:{self._port()}", s, TIMEOUT)) self.assertEqual(post.text().text.strip(), "This is glowfic") + self.assertEqual([r.text.strip() for r in post.replies()], + ["You sure?", "Pretty sure."]) def testFetchErrors(self) -> None: with requests.session() as s: -- 2.44.1 From 55958ec0820a10472e5091b105d730fa38172390 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 20:04:59 -0800 Subject: [PATCH 018/100] entries() convenience method --- paperdoorknob.py | 4 ++++ paperdoorknob_test.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/paperdoorknob.py b/paperdoorknob.py index 0c6ec85..6cc76a2 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -6,6 +6,7 @@ from argparse import ArgumentParser +import itertools import os.path from typing import Iterable @@ -33,6 +34,9 @@ class Post: assert all(isinstance(r, Tag) for r in replies) return replies + def entries(self) -> Iterable[Tag]: + return itertools.chain([self.text()], self.replies()) + def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 7c48607..5fe1779 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -102,6 +102,8 @@ class TestFetch(unittest.TestCase): self.assertEqual(post.text().text.strip(), "This is glowfic") self.assertEqual([r.text.strip() for r in post.replies()], ["You sure?", "Pretty sure."]) + self.assertEqual([r.text.strip() for r in post.entries()], + ["This is glowfic", "You sure?", "Pretty sure."]) def testFetchErrors(self) -> None: with requests.session() as s: -- 2.44.1 From 170e50a0efa18ed0f5f2ff644793469894983029 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 23 Nov 2023 20:22:31 -0800 Subject: [PATCH 019/100] Strip edit-boxes and footers --- paperdoorknob.py | 4 
++++ paperdoorknob_test.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/paperdoorknob.py b/paperdoorknob.py index 6cc76a2..fb2df1f 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -21,6 +21,10 @@ from xdg_base_dirs import xdg_cache_home class Post: def __init__(self, html: BeautifulSoup) -> None: self._html = html + for eb in self._html.find_all("div", class_="post-edit-box"): + eb.decompose() + for footer in self._html.find_all("div", class_="post-footer"): + footer.decompose() def text(self) -> Tag: body = self._html.body diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 5fe1779..7658cf4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -31,11 +31,15 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): body = b'''
+
We don't want edit boxes
This is glowfic +
+
We don't want edit boxes
You sure? +
Pretty sure. -- 2.44.1 From 47cfa3cdb4b4a7c6191e4a63b2a3080b52f076b0 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 25 Nov 2023 01:17:05 -0800 Subject: [PATCH 020/100] Drop Post as a class --- paperdoorknob.py | 59 ++++++++++++++++++++++++------------------- paperdoorknob_test.py | 11 +++----- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index fb2df1f..8329f83 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -18,30 +18,6 @@ import requests_cache from xdg_base_dirs import xdg_cache_home -class Post: - def __init__(self, html: BeautifulSoup) -> None: - self._html = html - for eb in self._html.find_all("div", class_="post-edit-box"): - eb.decompose() - for footer in self._html.find_all("div", class_="post-footer"): - footer.decompose() - - def text(self) -> Tag: - body = self._html.body - assert body - text = body.find_next("div", class_="post-post") - assert isinstance(text, Tag) - return text - - def replies(self) -> Iterable[Tag]: - replies = self._html.find_all("div", class_="post-reply") - assert all(isinstance(r, Tag) for r in replies) - return replies - - def entries(self) -> Iterable[Tag]: - return itertools.chain([self.text()], self.replies()) - - def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') parser.add_argument( @@ -63,11 +39,42 @@ def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup: return BeautifulSoup(r.text, 'html.parser') +def clean(html: BeautifulSoup) -> BeautifulSoup: + for eb in html.find_all("div", class_="post-edit-box"): + eb.decompose() + for footer in html.find_all("div", class_="post-footer"): + footer.decompose() + return html + + +def replies(html: BeautifulSoup) -> Iterable[Tag]: + def text() -> Tag: + body = html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def the_replies() -> Iterable[Tag]: + rs = 
html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in rs) + return rs + + return itertools.chain([text()], the_replies()) + + +def process( + url: str, + session: requests.Session, + timeout: int) -> Iterable[Tag]: + html = clean(fetch(url, session, timeout)) + return replies(html) + + def main() -> None: args = command_line_parser().parse_args() with requests_cache.CachedSession(args.cache_path, cache_control=True) as session: - html = fetch(args.url, session, args.timeout) - Post(html) + process(args.url, session, args.timeout) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 7658cf4..35756cb 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -99,14 +99,11 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 1) - def testFetchConents(self) -> None: + def testProcess(self) -> None: with requests.session() as s: - post = paperdoorknob.Post(paperdoorknob.fetch( - f"http://localhost:{self._port()}", s, TIMEOUT)) - self.assertEqual(post.text().text.strip(), "This is glowfic") - self.assertEqual([r.text.strip() for r in post.replies()], - ["You sure?", "Pretty sure."]) - self.assertEqual([r.text.strip() for r in post.entries()], + replies = paperdoorknob.process( + f"http://localhost:{self._port()}", s, TIMEOUT) + self.assertEqual([r.text.strip() for r in replies], ["This is glowfic", "You sure?", "Pretty sure."]) def testFetchErrors(self) -> None: -- 2.44.1 From a2d424680628f747d2ec20f2ae3e3758f0d12cc5 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 25 Nov 2023 01:40:05 -0800 Subject: [PATCH 021/100] Open output file --- paperdoorknob.py | 17 +++++++++++++---- paperdoorknob_test.py | 10 +++++++--- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 8329f83..91909a3 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -9,7 +9,7 
@@ from argparse import ArgumentParser import itertools import os.path -from typing import Iterable +from typing import IO, Iterable from bs4 import BeautifulSoup from bs4.element import Tag @@ -25,6 +25,11 @@ def command_line_parser() -> ArgumentParser: metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--out', + help='The filename stem at which to write output ' + + '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)', + default='book') parser.add_argument( '--timeout', help='How long to wait for HTTP requests, in seconds', @@ -66,15 +71,19 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process( url: str, session: requests.Session, - timeout: int) -> Iterable[Tag]: + timeout: int, + texout: IO[str], +) -> None: html = clean(fetch(url, session, timeout)) - return replies(html) + replies(html) + print("soon", file=texout) def main() -> None: args = command_line_parser().parse_args() with requests_cache.CachedSession(args.cache_path, cache_control=True) as session: - process(args.url, session, args.timeout) + with open(args.out + '.tex', 'w', encoding='UTF-8') as texout: + process(args.url, session, args.timeout, texout) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 35756cb..3637514 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -99,10 +99,14 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 1) - def testProcess(self) -> None: + def testReplies(self) -> None: with requests.session() as s: - replies = paperdoorknob.process( - f"http://localhost:{self._port()}", s, TIMEOUT) + replies = paperdoorknob.replies( + paperdoorknob.clean( + paperdoorknob.fetch( + f"http://localhost:{self._port()}", + s, + TIMEOUT))) self.assertEqual([r.text.strip() for r in replies], ["This is 
glowfic", "You sure?", "Pretty sure."]) -- 2.44.1 From 1e51262909ed85fcf1016973e6baff2487d889e1 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 25 Nov 2023 01:47:31 -0800 Subject: [PATCH 022/100] --pandoc flag --- README.md | 1 + default.nix | 14 ++++++++++---- paperdoorknob.py | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e236b64..5b6336b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ A polite glowfic → printable book scraper. * Retrieves glowfic * Minimizes server load with local caching +* Uses [pandoc](https://pandoc.org/) to convert HTML to LaTeX ### Alternatives diff --git a/default.nix b/default.nix index bf96eb8..6bbed22 100644 --- a/default.nix +++ b/default.nix @@ -1,21 +1,27 @@ { pkgs ? import { }, lint ? false }: -pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, autopep8, mypy - , pylint, beautifulsoup4, requests, requests-cache, types-beautifulsoup4 - , types-requests, xdg-base-dirs }: +pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper + , autopep8, mypy, pylint, beautifulsoup4, requests, requests-cache + , types-beautifulsoup4, types-requests, xdg-base-dirs, pandoc-cli }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; + nativeBuildInputs = [ makeWrapper ]; propagatedBuildInputs = [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; nativeCheckInputs = [ mypy types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; + postInstall = '' + wrapProgram "$out/bin/paperdoorknob" \ + --add-flags "--pandoc=${pandoc-cli}/bin/pandoc" \ + + ''; meta = { description = "A polite glowfic → printable book scraper."; homepage = "https://git.scottworley.com/paperdoorknob"; license = pkgs.lib.licenses.gpl3; maintainers = with pkgs.lib.maintainers; [ chkno ]; }; - }) { } + }) { inherit (pkgs.haskellPackages) pandoc-cli; } diff --git a/paperdoorknob.py 
b/paperdoorknob.py index 91909a3..a31b659 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -30,6 +30,7 @@ def command_line_parser() -> ArgumentParser: help='The filename stem at which to write output ' + '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)', default='book') + parser.add_argument('--pandoc', help='Location of the pandoc executable') parser.add_argument( '--timeout', help='How long to wait for HTTP requests, in seconds', -- 2.44.1 From 36ae1d5f39e74dc6c6cc1cd253d029f86003c83f Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 30 Nov 2023 08:26:35 -0800 Subject: [PATCH 023/100] Invoke pandoc to convert HTML to LaTeX --- default.nix | 2 +- paperdoorknob.py | 24 +++++++++++++++++++----- paperdoorknob_test.py | 13 +++++++++++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/default.nix b/default.nix index 6bbed22..58f9886 100644 --- a/default.nix +++ b/default.nix @@ -9,7 +9,7 @@ pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper nativeBuildInputs = [ makeWrapper ]; propagatedBuildInputs = [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; - nativeCheckInputs = [ mypy types-beautifulsoup4 types-requests ] + nativeCheckInputs = [ mypy pandoc-cli types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; diff --git a/paperdoorknob.py b/paperdoorknob.py index a31b659..32daf4f 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -8,6 +8,7 @@ from argparse import ArgumentParser import itertools import os.path +import subprocess from typing import IO, Iterable @@ -69,22 +70,35 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: return itertools.chain([text()], the_replies()) +def html_to_tex(pandoc: str, tag: Tag) -> bytes: + return subprocess.run([pandoc, '--from=html', '--to=latex'], + input=tag.encode(), + stdout=subprocess.PIPE, + check=True).stdout + + def process( url: str, session: requests.Session, timeout: int, 
- texout: IO[str], + texout: IO[bytes], + pandoc: str, ) -> None: html = clean(fetch(url, session, timeout)) - replies(html) - print("soon", file=texout) + for r in replies(html): + texout.write(html_to_tex(pandoc, r)) def main() -> None: args = command_line_parser().parse_args() with requests_cache.CachedSession(args.cache_path, cache_control=True) as session: - with open(args.out + '.tex', 'w', encoding='UTF-8') as texout: - process(args.url, session, args.timeout, texout) + with open(args.out + '.tex', 'wb') as texout: + process( + args.url, + session, + args.timeout, + texout, + args.pandoc or 'pandoc') if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 3637514..7cb73b7 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -6,6 +6,7 @@ import unittest +import io import threading from http.server import BaseHTTPRequestHandler, HTTPServer import requests @@ -119,6 +120,18 @@ class TestFetch(unittest.TestCase): paperdoorknob.fetch( f"http://localhost:{self._port()}/server_error", s, TIMEOUT) + def testProcess(self) -> None: + with requests.session() as s: + buf = io.BytesIO() + paperdoorknob.process( + f"http://localhost:{self._port()}", + s, + TIMEOUT, + buf, + 'pandoc') + self.assertEqual(buf.getvalue(), + b'This is glowfic\nYou sure?\nPretty sure.\n') + if __name__ == '__main__': unittest.main() -- 2.44.1 From afd3c3a13956819ceb32c28875f56cff7df8fe3f Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 30 Nov 2023 08:29:35 -0800 Subject: [PATCH 024/100] Verify that LaTeX conversion is doing something --- paperdoorknob_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 7cb73b7..ee4de04 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -39,7 +39,7 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler):
We don't want edit boxes
- You sure? + You sure?
@@ -129,8 +129,9 @@ class TestFetch(unittest.TestCase): TIMEOUT, buf, 'pandoc') - self.assertEqual(buf.getvalue(), - b'This is glowfic\nYou sure?\nPretty sure.\n') + self.assertEqual( + buf.getvalue(), + b'This is glowfic\nYou \\emph{sure}?\nPretty sure.\n') if __name__ == '__main__': -- 2.44.1 From 07f9b178a58e87f90aa7ab1e459c17561345154d Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 1 Dec 2023 01:01:21 -0800 Subject: [PATCH 025/100] LaTeX output is a valid LaTeX file --- default.nix | 6 ++++-- paperdoorknob.py | 2 ++ paperdoorknob_test.py | 23 ++++++++++++++++++++--- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/default.nix b/default.nix index 58f9886..3e02638 100644 --- a/default.nix +++ b/default.nix @@ -1,7 +1,8 @@ { pkgs ? import { }, lint ? false }: pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper , autopep8, mypy, pylint, beautifulsoup4, requests, requests-cache - , types-beautifulsoup4, types-requests, xdg-base-dirs, pandoc-cli }: + , texliveBasic, types-beautifulsoup4, types-requests, xdg-base-dirs + , pandoc-cli }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; @@ -9,7 +10,8 @@ pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper nativeBuildInputs = [ makeWrapper ]; propagatedBuildInputs = [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; - nativeCheckInputs = [ mypy pandoc-cli types-beautifulsoup4 types-requests ] + nativeCheckInputs = + [ mypy pandoc-cli texliveBasic types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; diff --git a/paperdoorknob.py b/paperdoorknob.py index 32daf4f..8ab62f1 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -84,9 +84,11 @@ def process( texout: IO[bytes], pandoc: str, ) -> None: + texout.write(b'\\documentclass{article}\n\\begin{document}\n') html = clean(fetch(url, session, timeout)) for r in replies(html): texout.write(html_to_tex(pandoc, r)) + 
texout.write(b'\\end{document}\n') def main() -> None: diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index ee4de04..9ea877d 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -7,6 +7,7 @@ import unittest import io +import subprocess import threading from http.server import BaseHTTPRequestHandler, HTTPServer import requests @@ -129,9 +130,25 @@ class TestFetch(unittest.TestCase): TIMEOUT, buf, 'pandoc') - self.assertEqual( - buf.getvalue(), - b'This is glowfic\nYou \\emph{sure}?\nPretty sure.\n') + self.assertEqual(buf.getvalue(), b'''\\documentclass{article} +\\begin{document} +This is glowfic +You \\emph{sure}? +Pretty sure. +\\end{document} +''') + + def testPDF(self) -> None: + with requests.session() as s: + with open("test.tex", 'wb') as out: + paperdoorknob.process( + f"http://localhost:{self._port()}", + s, + TIMEOUT, + out, + 'pandoc') + subprocess.run(['pdflatex', 'test.tex'], + stdin=subprocess.DEVNULL, check=True) if __name__ == '__main__': -- 2.44.1 From 41b11505f297962d099b75a0372a4068e26773a5 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 00:36:35 -0800 Subject: [PATCH 026/100] Extract fake webserver --- paperdoorknob_test.py | 102 +++++++++--------------------------------- testing/__init__.py | 0 testing/fakeserver.py | 72 +++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 82 deletions(-) create mode 100644 testing/__init__.py create mode 100644 testing/fakeserver.py diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 9ea877d..c896810 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -8,105 +8,47 @@ import unittest import io import subprocess -import threading -from http.server import BaseHTTPRequestHandler, HTTPServer import requests import requests_cache import paperdoorknob +from testing.fakeserver import FakeGlowficServer TIMEOUT = 8 -class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): - - def _notify_test(self) -> None: - raise 
NotImplementedError() - - def _response_code(self) -> int: - if self.path == "/not_found": - return 404 - if self.path == "/server_error": - return 500 - return 200 - - def do_GET(self) -> None: - body = b''' - -
-
We don't want edit boxes
- This is glowfic - -
-
-
-
We don't want edit boxes
- You sure? - -
-
- Pretty sure. -
-
- -''' - self.send_response(self._response_code()) - self.send_header("Content-type", "text/html") - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body) - self._notify_test() - - class TestFetch(unittest.TestCase): - def _port(self) -> int: - port = self._web_server.socket.getsockname()[1] - assert isinstance(port, int) - return port - - def _count_request(self) -> None: - self._request_counter += 1 - def setUp(self) -> None: - self._request_counter = 0 - handler = type("Handler", (FakeGlowficHTTPRequestHandler,), { - '_notify_test': lambda _: self._count_request()}) - self._web_server = HTTPServer(('', 0), handler) - self._thread = threading.Thread(target=self._web_server.serve_forever) - self._thread.start() - - def tearDown(self) -> None: - self._web_server.shutdown() - self._thread.join() - self._web_server.server_close() + self._server = self.enterContext(FakeGlowficServer()) + self._port = self._server.port() def testFetch(self) -> None: with requests.session() as s: - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 1) - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 2) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) + self.assertEqual(self._server.request_count(), 1) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) + self.assertEqual(self._server.request_count(), 2) def testFetchCaching(self) -> None: with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 1) - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) + self.assertEqual(self._server.request_count(), 1) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) 
+ self.assertEqual(self._server.request_count(), 1) def testFetchPersistentCaching(self) -> None: with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) + self.assertEqual(self._server.request_count(), 1) with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) - self.assertEqual(self._request_counter, 1) + paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) + self.assertEqual(self._server.request_count(), 1) def testReplies(self) -> None: with requests.session() as s: replies = paperdoorknob.replies( paperdoorknob.clean( paperdoorknob.fetch( - f"http://localhost:{self._port()}", + f"http://localhost:{self._port}", s, TIMEOUT))) self.assertEqual([r.text.strip() for r in replies], @@ -116,16 +58,16 @@ class TestFetch(unittest.TestCase): with requests.session() as s: with self.assertRaises(requests.HTTPError): paperdoorknob.fetch( - f"http://localhost:{self._port()}/not_found", s, TIMEOUT) + f"http://localhost:{self._port}/not_found", s, TIMEOUT) with self.assertRaises(requests.HTTPError): paperdoorknob.fetch( - f"http://localhost:{self._port()}/server_error", s, TIMEOUT) + f"http://localhost:{self._port}/server_error", s, TIMEOUT) def testProcess(self) -> None: with requests.session() as s: buf = io.BytesIO() paperdoorknob.process( - f"http://localhost:{self._port()}", + f"http://localhost:{self._port}", s, TIMEOUT, buf, @@ -142,11 +84,7 @@ Pretty sure. 
with requests.session() as s: with open("test.tex", 'wb') as out: paperdoorknob.process( - f"http://localhost:{self._port()}", - s, - TIMEOUT, - out, - 'pandoc') + f"http://localhost:{self._port}", s, TIMEOUT, out, 'pandoc') subprocess.run(['pdflatex', 'test.tex'], stdin=subprocess.DEVNULL, check=True) diff --git a/testing/__init__.py b/testing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/testing/fakeserver.py b/testing/fakeserver.py new file mode 100644 index 0000000..2ed8149 --- /dev/null +++ b/testing/fakeserver.py @@ -0,0 +1,72 @@ +from typing import Any, Self +import threading +from http.server import BaseHTTPRequestHandler, HTTPServer + + +class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): + + def _notify_server(self) -> None: + raise NotImplementedError() + + def _response_code(self) -> int: + if self.path == "/not_found": + return 404 + if self.path == "/server_error": + return 500 + return 200 + + def do_GET(self) -> None: + body = b''' + +
+
We don't want edit boxes
+ This is glowfic + +
+
+
+
We don't want edit boxes
+ You sure? + +
+
+ Pretty sure. +
+
+ +''' + self.send_response(self._response_code()) + self.send_header("Content-type", "text/html") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + self._notify_server() + + +class FakeGlowficServer(): + def __init__(self) -> None: + self._request_counter = 0 + handler = type("Handler", (FakeGlowficHTTPRequestHandler,), { + '_notify_server': lambda _: self._count_request()}) + self._web_server = HTTPServer(('', 0), handler) + self._thread = threading.Thread(target=self._web_server.serve_forever) + + def __enter__(self) -> Self: + self._thread.start() + return self + + def __exit__(self, *_: Any) -> None: + self._web_server.shutdown() + self._thread.join() + self._web_server.server_close() + + def _count_request(self) -> None: + self._request_counter += 1 + + def port(self) -> int: + p = self._web_server.socket.getsockname()[1] + assert isinstance(p, int) + return p + + def request_count(self) -> int: + return self._request_counter -- 2.44.1 From 705973e749e5a9da0453508dbd23de434a0fdc65 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 01:16:03 -0800 Subject: [PATCH 027/100] Cleaner Fetcher interface --- fetch.py | 43 ++++++++++++++++++++++++++++++++++++++ fetch_test.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 5 ++++- 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 fetch.py create mode 100644 fetch_test.py diff --git a/fetch.py b/fetch.py new file mode 100644 index 0000000..3c0c6a3 --- /dev/null +++ b/fetch.py @@ -0,0 +1,43 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Iterator + +import requests +import requests_cache + + +class Fetcher(ABC): + @abstractmethod + def fetch(self, url: str) -> bytes: + raise NotImplementedError() + + +class _SessionFetcher(Fetcher): + + def __init__(self, session: requests.Session, timeout: int) -> None: + self._session = session + self._timeout = timeout + + def fetch(self, url: str) -> bytes: + with self._session.get(url, timeout=self._timeout) as r: + r.raise_for_status() + return r.content + + +@contextmanager +def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]: + with requests.session() as session: + yield _SessionFetcher(session, timeout) + + +@contextmanager +def CachingFetcher(cache_path: str, timeout: int) -> Iterator[_SessionFetcher]: + with requests_cache.CachedSession(cache_path, cache_control=True) as session: + yield _SessionFetcher(session, timeout) diff --git a/fetch_test.py b/fetch_test.py new file mode 100644 index 0000000..6bdf69e --- /dev/null +++ b/fetch_test.py @@ -0,0 +1,57 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import unittest +from requests import HTTPError +from testing.fakeserver import FakeGlowficServer +from fetch import CachingFetcher, DirectFetcher + +TIMEOUT = 8 + + +class TestFetch(unittest.TestCase): + def setUp(self) -> None: + self._server = self.enterContext(FakeGlowficServer()) + self._port = self._server.port() + + def testDirectFetch(self) -> None: + with DirectFetcher(TIMEOUT) as f: + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 1) + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 2) + + def testFetchCaching(self) -> None: + with CachingFetcher("testcache", TIMEOUT) as f: + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 1) + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 1) + + def testFetchPersistentCaching(self) -> None: + with CachingFetcher("testpersistentcache", TIMEOUT) as f: + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 1) + with CachingFetcher("testpersistentcache", TIMEOUT) as f: + f.fetch(f"http://localhost:{self._port}") + self.assertEqual(self._server.request_count(), 1) + + def testFetchErrors(self) -> None: + with DirectFetcher(TIMEOUT) as f: + with self.assertRaises(HTTPError): + f.fetch(f"http://localhost:{self._port}/not_found") + with self.assertRaises(HTTPError): + f.fetch(f"http://localhost:{self._port}/server_error") + with CachingFetcher("testerrorscache", TIMEOUT) as f: + with self.assertRaises(HTTPError): + f.fetch(f"http://localhost:{self._port}/not_found") + with self.assertRaises(HTTPError): + f.fetch(f"http://localhost:{self._port}/server_error") + + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index bd23bdd..27980a1 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,10 @@ setup( author="Scott Worley", author_email="scottworley@scottworley.com", 
url="https://git.scottworley.com/paperdoorknob", - py_modules=['paperdoorknob'], + py_modules=[ + 'fetch', + 'paperdoorknob', + ], license="GPL-3.0", entry_points={'console_scripts': ['paperdoorknob = paperdoorknob:main']}, ) -- 2.44.1 From a64403ac570d0049c7b5a616d19fab37ab5cb4e8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 01:28:31 -0800 Subject: [PATCH 028/100] Use the new encapsulated fetchers --- paperdoorknob.py | 27 +++++++-------------- paperdoorknob_test.py | 55 +++++++------------------------------------ 2 files changed, 17 insertions(+), 65 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 8ab62f1..f83c9f8 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -14,10 +14,10 @@ from typing import IO, Iterable from bs4 import BeautifulSoup from bs4.element import Tag -import requests -import requests_cache from xdg_base_dirs import xdg_cache_home +from fetch import CachingFetcher, Fetcher + def command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') @@ -40,10 +40,8 @@ def command_line_parser() -> ArgumentParser: return parser -def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup: - with session.get(url, timeout=timeout) as r: - r.raise_for_status() - return BeautifulSoup(r.text, 'html.parser') +def fetch(url: str, fetcher: Fetcher) -> BeautifulSoup: + return BeautifulSoup(fetcher.fetch(url), 'html.parser') def clean(html: BeautifulSoup) -> BeautifulSoup: @@ -79,13 +77,11 @@ def html_to_tex(pandoc: str, tag: Tag) -> bytes: def process( url: str, - session: requests.Session, - timeout: int, + fetcher: Fetcher, texout: IO[bytes], - pandoc: str, -) -> None: + pandoc: str) -> None: texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(fetch(url, session, timeout)) + html = clean(fetch(url, fetcher)) for r in replies(html): texout.write(html_to_tex(pandoc, r)) texout.write(b'\\end{document}\n') @@ -93,14 +89,9 @@ 
def process( def main() -> None: args = command_line_parser().parse_args() - with requests_cache.CachedSession(args.cache_path, cache_control=True) as session: + with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: - process( - args.url, - session, - args.timeout, - texout, - args.pandoc or 'pandoc') + process(args.url, fetcher, texout, args.pandoc or 'pandoc') if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index c896810..49bef9f 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -8,70 +8,31 @@ import unittest import io import subprocess -import requests -import requests_cache import paperdoorknob from testing.fakeserver import FakeGlowficServer +from fetch import DirectFetcher TIMEOUT = 8 -class TestFetch(unittest.TestCase): +class TestPaperDoorknob(unittest.TestCase): def setUp(self) -> None: self._server = self.enterContext(FakeGlowficServer()) self._port = self._server.port() - def testFetch(self) -> None: - with requests.session() as s: - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - self.assertEqual(self._server.request_count(), 1) - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - self.assertEqual(self._server.request_count(), 2) - - def testFetchCaching(self) -> None: - with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - self.assertEqual(self._server.request_count(), 1) - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - self.assertEqual(self._server.request_count(), 1) - - def testFetchPersistentCaching(self) -> None: - with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - self.assertEqual(self._server.request_count(), 1) - with requests_cache.CachedSession() as s: - paperdoorknob.fetch(f"http://localhost:{self._port}", s, TIMEOUT) - 
self.assertEqual(self._server.request_count(), 1) - def testReplies(self) -> None: - with requests.session() as s: + with DirectFetcher(TIMEOUT) as f: replies = paperdoorknob.replies( paperdoorknob.clean( - paperdoorknob.fetch( - f"http://localhost:{self._port}", - s, - TIMEOUT))) + paperdoorknob.fetch(f"http://localhost:{self._port}", f))) self.assertEqual([r.text.strip() for r in replies], ["This is glowfic", "You sure?", "Pretty sure."]) - def testFetchErrors(self) -> None: - with requests.session() as s: - with self.assertRaises(requests.HTTPError): - paperdoorknob.fetch( - f"http://localhost:{self._port}/not_found", s, TIMEOUT) - with self.assertRaises(requests.HTTPError): - paperdoorknob.fetch( - f"http://localhost:{self._port}/server_error", s, TIMEOUT) - def testProcess(self) -> None: - with requests.session() as s: + with DirectFetcher(TIMEOUT) as f: buf = io.BytesIO() paperdoorknob.process( - f"http://localhost:{self._port}", - s, - TIMEOUT, - buf, - 'pandoc') + f"http://localhost:{self._port}", f, buf, 'pandoc') self.assertEqual(buf.getvalue(), b'''\\documentclass{article} \\begin{document} This is glowfic @@ -81,10 +42,10 @@ Pretty sure. 
''') def testPDF(self) -> None: - with requests.session() as s: + with DirectFetcher(TIMEOUT) as f: with open("test.tex", 'wb') as out: paperdoorknob.process( - f"http://localhost:{self._port}", s, TIMEOUT, out, 'pandoc') + f"http://localhost:{self._port}", f, out, 'pandoc') subprocess.run(['pdflatex', 'test.tex'], stdin=subprocess.DEVNULL, check=True) -- 2.44.1 From bf06f467bd363cb9fc9b1ad581b298a99d3d122b Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 01:32:43 -0800 Subject: [PATCH 029/100] =?utf8?q?Rename:=20fetch=20=E2=86=92=20parse?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- paperdoorknob.py | 6 +++--- paperdoorknob_test.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index f83c9f8..31ce02c 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -40,8 +40,8 @@ def command_line_parser() -> ArgumentParser: return parser -def fetch(url: str, fetcher: Fetcher) -> BeautifulSoup: - return BeautifulSoup(fetcher.fetch(url), 'html.parser') +def parse(content: bytes) -> BeautifulSoup: + return BeautifulSoup(content, 'html.parser') def clean(html: BeautifulSoup) -> BeautifulSoup: @@ -81,7 +81,7 @@ def process( texout: IO[bytes], pandoc: str) -> None: texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(fetch(url, fetcher)) + html = clean(parse(fetcher.fetch(url))) for r in replies(html): texout.write(html_to_tex(pandoc, r)) texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 49bef9f..9676e28 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -24,7 +24,8 @@ class TestPaperDoorknob(unittest.TestCase): with DirectFetcher(TIMEOUT) as f: replies = paperdoorknob.replies( paperdoorknob.clean( - paperdoorknob.fetch(f"http://localhost:{self._port}", f))) + paperdoorknob.parse( + f.fetch(f"http://localhost:{self._port}")))) self.assertEqual([r.text.strip() for r in 
replies], ["This is glowfic", "You sure?", "Pretty sure."]) -- 2.44.1 From 79631507ecb69945b7cedb10c3aaed4b8dd788b0 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 01:45:20 -0800 Subject: [PATCH 030/100] Texifier interface --- paperdoorknob.py | 18 ++++++------------ paperdoorknob_test.py | 7 +++++-- setup.py | 1 + texify.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 14 deletions(-) create mode 100644 texify.py diff --git a/paperdoorknob.py b/paperdoorknob.py index 31ce02c..3b86cb8 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -8,7 +8,6 @@ from argparse import ArgumentParser import itertools import os.path -import subprocess from typing import IO, Iterable @@ -17,6 +16,7 @@ from bs4.element import Tag from xdg_base_dirs import xdg_cache_home from fetch import CachingFetcher, Fetcher +from texify import PandocTexifier, Texifier def command_line_parser() -> ArgumentParser: @@ -68,30 +68,24 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: return itertools.chain([text()], the_replies()) -def html_to_tex(pandoc: str, tag: Tag) -> bytes: - return subprocess.run([pandoc, '--from=html', '--to=latex'], - input=tag.encode(), - stdout=subprocess.PIPE, - check=True).stdout - - def process( url: str, fetcher: Fetcher, - texout: IO[bytes], - pandoc: str) -> None: + texifier: Texifier, + texout: IO[bytes]) -> None: texout.write(b'\\documentclass{article}\n\\begin{document}\n') html = clean(parse(fetcher.fetch(url))) for r in replies(html): - texout.write(html_to_tex(pandoc, r)) + texout.write(texifier.texify(r)) texout.write(b'\\end{document}\n') def main() -> None: args = command_line_parser().parse_args() + texifier = PandocTexifier(args.pandoc or 'pandoc') with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: - process(args.url, fetcher, texout, args.pandoc or 'pandoc') + process(args.url, fetcher, texifier, texout) if __name__ == '__main__': diff --git 
a/paperdoorknob_test.py b/paperdoorknob_test.py index 9676e28..453f5a6 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -11,6 +11,7 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer from fetch import DirectFetcher +from texify import PandocTexifier TIMEOUT = 8 @@ -30,10 +31,11 @@ class TestPaperDoorknob(unittest.TestCase): ["This is glowfic", "You sure?", "Pretty sure."]) def testProcess(self) -> None: + texifier = PandocTexifier('pandoc') with DirectFetcher(TIMEOUT) as f: buf = io.BytesIO() paperdoorknob.process( - f"http://localhost:{self._port}", f, buf, 'pandoc') + f"http://localhost:{self._port}", f, texifier, buf) self.assertEqual(buf.getvalue(), b'''\\documentclass{article} \\begin{document} This is glowfic @@ -43,10 +45,11 @@ Pretty sure. ''') def testPDF(self) -> None: + texifier = PandocTexifier('pandoc') with DirectFetcher(TIMEOUT) as f: with open("test.tex", 'wb') as out: paperdoorknob.process( - f"http://localhost:{self._port}", f, out, 'pandoc') + f"http://localhost:{self._port}", f, texifier, out) subprocess.run(['pdflatex', 'test.tex'], stdin=subprocess.DEVNULL, check=True) diff --git a/setup.py b/setup.py index 27980a1..6464879 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ setup( py_modules=[ 'fetch', 'paperdoorknob', + 'texify', ], license="GPL-3.0", entry_points={'console_scripts': ['paperdoorknob = paperdoorknob:main']}, diff --git a/texify.py b/texify.py new file mode 100644 index 0000000..4fb1557 --- /dev/null +++ b/texify.py @@ -0,0 +1,29 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +from abc import ABC, abstractmethod +import subprocess + +from bs4.element import Tag + + +class Texifier(ABC): + @abstractmethod + def texify(self, html: Tag) -> bytes: + raise NotImplementedError() + + +class PandocTexifier(Texifier): + + def __init__(self, pandoc_path: str) -> None: + self._pandoc_path = pandoc_path + + def texify(self, html: Tag) -> bytes: + return subprocess.run([self._pandoc_path, '--from=html', '--to=latex'], + input=html.encode(), + stdout=subprocess.PIPE, + check=True).stdout -- 2.44.1 From f1dec72014657ab33bffa8f68064d85cc862ea65 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 02:23:48 -0800 Subject: [PATCH 031/100] Contemplate generating LaTeX directly --- paperdoorknob_test.py | 10 +++++++++- texify.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 453f5a6..4cead97 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -11,7 +11,7 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer from fetch import DirectFetcher -from texify import PandocTexifier +from texify import DirectTexifier, PandocTexifier, VerifyingTexifier TIMEOUT = 8 @@ -44,6 +44,14 @@ Pretty sure. 
\\end{document} ''') + def testDirectTexifier(self) -> None: + texifier = VerifyingTexifier( + PandocTexifier('pandoc'), DirectTexifier()) + with DirectFetcher(TIMEOUT) as f: + buf = io.BytesIO() + paperdoorknob.process( + f"http://localhost:{self._port}", f, texifier, buf) + def testPDF(self) -> None: texifier = PandocTexifier('pandoc') with DirectFetcher(TIMEOUT) as f: diff --git a/texify.py b/texify.py index 4fb1557..83a8c67 100644 --- a/texify.py +++ b/texify.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod +import re import subprocess from bs4.element import Tag @@ -27,3 +28,42 @@ class PandocTexifier(Texifier): input=html.encode(), stdout=subprocess.PIPE, check=True).stdout + + +class TexifierError(Exception): + pass + + +class DirectTexifier(Texifier): + def _texify_children(self, html: Tag) -> bytes: + out = b'' + for c in html.children: + if isinstance(c, str): + out += c.encode('UTF-8') + elif isinstance(c, Tag): + out += self.texify(c).strip() + else: + raise TexifierError(f"Unsupported PageElement: {type(c)}") + return re.sub(b'[ \n]+', b' ', out).strip() + b'\n' + + def texify(self, html: Tag) -> bytes: + if html.name == 'em': + return b'\\emph{' + self._texify_children(html).strip() + b'}\n' + return self._texify_children(html) + + +class TexifierVerificationError(Exception): + pass + + +class VerifyingTexifier(Texifier): + def __init__(self, a: Texifier, b: Texifier) -> None: + self._a = a + self._b = b + + def texify(self, html: Tag) -> bytes: + aout = self._a.texify(html) + bout = self._b.texify(html) + if aout != bout: + raise TexifierVerificationError(aout, bout) + return aout -- 2.44.1 From 23f3187945d1cf4f3a9cdc43462be2ca39e7023a Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 17:29:14 -0800 Subject: [PATCH 032/100] Bundle the things needed for a run together into a Spec --- args.py | 47 ++++++++++++++++++++++++++++++++++++++ paperdoorknob.py | 53 +++++++++---------------------------------- paperdoorknob_test.py | 29 
+++++++++++++++-------- setup.py | 2 ++ spec.py | 21 +++++++++++++++++ 5 files changed, 100 insertions(+), 52 deletions(-) create mode 100644 args.py create mode 100644 spec.py diff --git a/args.py b/args.py new file mode 100644 index 0000000..2a03836 --- /dev/null +++ b/args.py @@ -0,0 +1,47 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +from argparse import ArgumentParser +from contextlib import contextmanager +import os.path + +from typing import Iterator + +from xdg_base_dirs import xdg_cache_home + +from fetch import CachingFetcher +from spec import Spec +from texify import PandocTexifier + + +def _command_line_parser() -> ArgumentParser: + parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') + parser.add_argument( + '--cache_path', + metavar='PATH', + help='Where to keep the http cache (instead of %(default)s)', + default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--out', + help='The filename stem at which to write output ' + + '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)', + default='book') + parser.add_argument('--pandoc', help='Location of the pandoc executable') + parser.add_argument( + '--timeout', + help='How long to wait for HTTP requests, in seconds', + default=30) + parser.add_argument('url', help='URL to retrieve') + return parser + + +@contextmanager +def spec_from_commandline_args() -> Iterator[Spec]: + args = _command_line_parser().parse_args() + with CachingFetcher(args.cache_path, args.timeout) as fetcher: + with open(args.out + '.tex', 'wb') as texout: + yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout) diff --git a/paperdoorknob.py b/paperdoorknob.py index 3b86cb8..653d0c6 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -5,39 +5,15 @@ # Free 
Software Foundation, version 3. -from argparse import ArgumentParser import itertools -import os.path -from typing import IO, Iterable +from typing import Iterable from bs4 import BeautifulSoup from bs4.element import Tag -from xdg_base_dirs import xdg_cache_home - -from fetch import CachingFetcher, Fetcher -from texify import PandocTexifier, Texifier - - -def command_line_parser() -> ArgumentParser: - parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') - parser.add_argument( - '--cache_path', - metavar='PATH', - help='Where to keep the http cache (instead of %(default)s)', - default=os.path.join(xdg_cache_home(), "paperdoorknob")) - parser.add_argument( - '--out', - help='The filename stem at which to write output ' + - '(eg: "%(default)s" produces %(default)s.text, %(default)s.pdf, etc.)', - default='book') - parser.add_argument('--pandoc', help='Location of the pandoc executable') - parser.add_argument( - '--timeout', - help='How long to wait for HTTP requests, in seconds', - default=30) - parser.add_argument('url', help='URL to retrieve') - return parser + +from args import spec_from_commandline_args +from spec import Spec def parse(content: bytes) -> BeautifulSoup: @@ -68,24 +44,17 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: return itertools.chain([text()], the_replies()) -def process( - url: str, - fetcher: Fetcher, - texifier: Texifier, - texout: IO[bytes]) -> None: - texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(parse(fetcher.fetch(url))) +def process(spec: Spec) -> None: + spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') + html = clean(parse(spec.fetcher.fetch(spec.url))) for r in replies(html): - texout.write(texifier.texify(r)) - texout.write(b'\\end{document}\n') + spec.texout.write(spec.texifier.texify(r)) + spec.texout.write(b'\\end{document}\n') def main() -> None: - args = command_line_parser().parse_args() - texifier = PandocTexifier(args.pandoc or 'pandoc') - with 
CachingFetcher(args.cache_path, args.timeout) as fetcher: - with open(args.out + '.tex', 'wb') as texout: - process(args.url, fetcher, texifier, texout) + with spec_from_commandline_args() as spec: + process(spec) if __name__ == '__main__': diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 4cead97..baf7945 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -8,9 +8,12 @@ import unittest import io import subprocess + import paperdoorknob + from testing.fakeserver import FakeGlowficServer from fetch import DirectFetcher +from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier TIMEOUT = 8 @@ -31,11 +34,14 @@ class TestPaperDoorknob(unittest.TestCase): ["This is glowfic", "You sure?", "Pretty sure."]) def testProcess(self) -> None: - texifier = PandocTexifier('pandoc') with DirectFetcher(TIMEOUT) as f: buf = io.BytesIO() - paperdoorknob.process( - f"http://localhost:{self._port}", f, texifier, buf) + spec = Spec( + f"http://localhost:{self._port}", + f, + PandocTexifier('pandoc'), + buf) + paperdoorknob.process(spec) self.assertEqual(buf.getvalue(), b'''\\documentclass{article} \\begin{document} This is glowfic @@ -49,17 +55,20 @@ Pretty sure. 
PandocTexifier('pandoc'), DirectTexifier()) with DirectFetcher(TIMEOUT) as f: buf = io.BytesIO() - paperdoorknob.process( - f"http://localhost:{self._port}", f, texifier, buf) + spec = Spec(f"http://localhost:{self._port}", f, texifier, buf) + paperdoorknob.process(spec) def testPDF(self) -> None: - texifier = PandocTexifier('pandoc') with DirectFetcher(TIMEOUT) as f: with open("test.tex", 'wb') as out: - paperdoorknob.process( - f"http://localhost:{self._port}", f, texifier, out) - subprocess.run(['pdflatex', 'test.tex'], - stdin=subprocess.DEVNULL, check=True) + spec = Spec( + f"http://localhost:{self._port}", + f, + PandocTexifier('pandoc'), + out) + paperdoorknob.process(spec) + subprocess.run(['pdflatex', 'test.tex'], + stdin=subprocess.DEVNULL, check=True) if __name__ == '__main__': diff --git a/setup.py b/setup.py index 6464879..edaa429 100644 --- a/setup.py +++ b/setup.py @@ -8,8 +8,10 @@ setup( author_email="scottworley@scottworley.com", url="https://git.scottworley.com/paperdoorknob", py_modules=[ + 'args', 'fetch', 'paperdoorknob', + 'spec', 'texify', ], license="GPL-3.0", diff --git a/spec.py b/spec.py new file mode 100644 index 0000000..1322897 --- /dev/null +++ b/spec.py @@ -0,0 +1,21 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +from dataclasses import dataclass + +from typing import IO + +from fetch import Fetcher +from texify import Texifier + + +@dataclass(frozen=True) +class Spec: + url: str + fetcher: Fetcher + texifier: Texifier + texout: IO[bytes] -- 2.44.1 From dbda64b551aaeffa4f01779a52c7ec68b1fffa98 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 17:44:32 -0800 Subject: [PATCH 033/100] Move HTML test data out to separate file --- testdata/this-is-glowfic.html | 19 +++++++++++++++++++ testing/fakeserver.py | 21 ++------------------- 2 files changed, 21 insertions(+), 19 deletions(-) create mode 100644 testdata/this-is-glowfic.html diff --git a/testdata/this-is-glowfic.html b/testdata/this-is-glowfic.html new file mode 100644 index 0000000..22c7a37 --- /dev/null +++ b/testdata/this-is-glowfic.html @@ -0,0 +1,19 @@ + + +
+
We don't want edit boxes
+ This is glowfic + +
+
+
+
We don't want edit boxes
+ You sure? + +
+
+ Pretty sure. +
+
+ + diff --git a/testing/fakeserver.py b/testing/fakeserver.py index 2ed8149..bbeb522 100644 --- a/testing/fakeserver.py +++ b/testing/fakeserver.py @@ -16,25 +16,8 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): return 200 def do_GET(self) -> None: - body = b''' - -
-
We don't want edit boxes
- This is glowfic - -
-
-
-
We don't want edit boxes
- You sure? - -
-
- Pretty sure. -
-
- -''' + with open("testdata/this-is-glowfic.html", "rb") as f: + body = f.read(9999) self.send_response(self._response_code()) self.send_header("Content-type", "text/html") self.send_header("Content-Length", str(len(body))) -- 2.44.1 From 386218397a4723129b31e3954bb960da61f72474 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 18:45:55 -0800 Subject: [PATCH 034/100] FakeFetcher for faster tests Run some tests against both the webserver and the FakeFetcher to make sure both work. --- fetch.py | 11 +++++ paperdoorknob_test.py | 97 ++++++++++++++++++++++++++++--------------- 2 files changed, 74 insertions(+), 34 deletions(-) diff --git a/fetch.py b/fetch.py index 3c0c6a3..7776f93 100644 --- a/fetch.py +++ b/fetch.py @@ -41,3 +41,14 @@ def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]: def CachingFetcher(cache_path: str, timeout: int) -> Iterator[_SessionFetcher]: with requests_cache.CachedSession(cache_path, cache_control=True) as session: yield _SessionFetcher(session, timeout) + + +class FakeFetcher(Fetcher): + + def __init__(self, resources: dict[str, bytes]) -> None: + self._resources = resources + + def fetch(self, url: str) -> bytes: + if url not in self._resources: + raise requests.HTTPError("URL not found") + return self._resources[url] diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index baf7945..425d5e3 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -5,6 +5,7 @@ # Free Software Foundation, version 3. 
+from abc import ABC, abstractmethod import unittest import io import subprocess @@ -12,64 +13,92 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer -from fetch import DirectFetcher +from fetch import DirectFetcher, FakeFetcher, Fetcher from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier TIMEOUT = 8 -class TestPaperDoorknob(unittest.TestCase): - def setUp(self) -> None: - self._server = self.enterContext(FakeGlowficServer()) - self._port = self._server.port() +class BaseTestProcess(ABC): + + @abstractmethod + def url(self) -> str: + raise NotImplementedError() + + @abstractmethod + def fetcher(self) -> Fetcher: + raise NotImplementedError() def testReplies(self) -> None: - with DirectFetcher(TIMEOUT) as f: - replies = paperdoorknob.replies( - paperdoorknob.clean( - paperdoorknob.parse( - f.fetch(f"http://localhost:{self._port}")))) - self.assertEqual([r.text.strip() for r in replies], - ["This is glowfic", "You sure?", "Pretty sure."]) + replies = paperdoorknob.replies( + paperdoorknob.clean( + paperdoorknob.parse( + self.fetcher().fetch( + self.url())))) + assert [r.text.strip() for r in replies] == [ + "This is glowfic", + "You sure?", + "Pretty sure."] def testProcess(self) -> None: - with DirectFetcher(TIMEOUT) as f: - buf = io.BytesIO() - spec = Spec( - f"http://localhost:{self._port}", - f, - PandocTexifier('pandoc'), - buf) - paperdoorknob.process(spec) - self.assertEqual(buf.getvalue(), b'''\\documentclass{article} + buf = io.BytesIO() + spec = Spec( + self.url(), + self.fetcher(), + PandocTexifier('pandoc'), + buf) + paperdoorknob.process(spec) + assert buf.getvalue() == b'''\\documentclass{article} \\begin{document} This is glowfic You \\emph{sure}? Pretty sure. 
\\end{document} -''') +''' def testDirectTexifier(self) -> None: texifier = VerifyingTexifier( PandocTexifier('pandoc'), DirectTexifier()) - with DirectFetcher(TIMEOUT) as f: - buf = io.BytesIO() - spec = Spec(f"http://localhost:{self._port}", f, texifier, buf) - paperdoorknob.process(spec) + buf = io.BytesIO() + spec = Spec(self.url(), self.fetcher(), texifier, buf) + paperdoorknob.process(spec) def testPDF(self) -> None: - with DirectFetcher(TIMEOUT) as f: - with open("test.tex", 'wb') as out: - spec = Spec( - f"http://localhost:{self._port}", - f, - PandocTexifier('pandoc'), - out) - paperdoorknob.process(spec) + with open("test.tex", 'wb') as out: + spec = Spec( + self.url(), + self.fetcher(), + PandocTexifier('pandoc'), + out) + paperdoorknob.process(spec) subprocess.run(['pdflatex', 'test.tex'], stdin=subprocess.DEVNULL, check=True) +class TestProcessFromWebserver(BaseTestProcess, unittest.TestCase): + + def setUp(self) -> None: + self._fetcher = self.enterContext(DirectFetcher(TIMEOUT)) + self._server = self.enterContext(FakeGlowficServer()) + self._port = self._server.port() + + def url(self) -> str: + return f"http://localhost:{self._port}" + + def fetcher(self) -> Fetcher: + return self._fetcher + + +class TestProcessFromFakeFetcher(BaseTestProcess, unittest.TestCase): + + def url(self) -> str: + return 'fic' + + def fetcher(self) -> Fetcher: + with open('testdata/this-is-glowfic.html', 'rb') as f: + return FakeFetcher({'fic': f.read(9999)}) + + if __name__ == '__main__': unittest.main() -- 2.44.1 From 929db57622c6b8756ed341d7ae2862126a7c638f Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 19:39:16 -0800 Subject: [PATCH 035/100] Strip all &nbsp; Project Lawful does not use &nbsp; carefully/thoughtfully. They are all over the place, at the end of every sentence, randomly around <em>s, etc. Just remove them all. But allow this behavior to be flag-controlled, in case this step is not desired when processing other works. 
--- args.py | 12 +++++++++++- htmlfilter.py | 31 +++++++++++++++++++++++++++++++ htmlfilter_test.py | 23 +++++++++++++++++++++++ paperdoorknob.py | 2 +- paperdoorknob_test.py | 4 +++- setup.py | 1 + spec.py | 3 ++- 7 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 htmlfilter.py create mode 100644 htmlfilter_test.py diff --git a/args.py b/args.py index 2a03836..1931d42 100644 --- a/args.py +++ b/args.py @@ -14,6 +14,7 @@ from typing import Iterator from xdg_base_dirs import xdg_cache_home from fetch import CachingFetcher +from htmlfilter import ApplyHTMLFilters, HTMLFilters from spec import Spec from texify import PandocTexifier @@ -25,6 +26,10 @@ def _command_line_parser() -> ArgumentParser: metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--htmlfilters', + help='Which HTML filters to use (default: %(default)s)', + default=','.join(f[0] for f in HTMLFilters)) parser.add_argument( '--out', help='The filename stem at which to write output ' + @@ -44,4 +49,9 @@ def spec_from_commandline_args() -> Iterator[Spec]: args = _command_line_parser().parse_args() with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: - yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout) + yield Spec( + args.url, + fetcher, + lambda x: ApplyHTMLFilters(args.htmlfilters, x), + PandocTexifier(args.pandoc or 'pandoc'), + texout) diff --git a/htmlfilter.py b/htmlfilter.py new file mode 100644 index 0000000..9bc9b6f --- /dev/null +++ b/htmlfilter.py @@ -0,0 +1,31 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import re + +from typing import Callable, List, Tuple + + +HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [ + ("NoNBSP", lambda x: re.sub(b"( |\xc2\xa0)", b" ", x)), +] + + +class HTMLFilterError(Exception): + pass + + +def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes: + for filter_name in filter_list.split(','): + filters = [f for (name, f) in HTMLFilters if name == filter_name] + if len(filters) == 0: + raise HTMLFilterError(f"Unknown HTML filter: {filter_name}") + if len(filters) > 1: + raise HTMLFilterError( + f"Multiple HTML filters with the same name!?: {filter_name}") + data = filters[0](data) + return data diff --git a/htmlfilter_test.py b/htmlfilter_test.py new file mode 100644 index 0000000..6b5584c --- /dev/null +++ b/htmlfilter_test.py @@ -0,0 +1,23 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest +from htmlfilter import ApplyHTMLFilters + + +class TestHTMLFilters(unittest.TestCase): + + def testStripNBSP(self) -> None: + self.assertEqual( + ApplyHTMLFilters( + "NoNBSP", + b"

It's really cold.

"), + b"

It's really cold.

") + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 653d0c6..34223e4 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -46,7 +46,7 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(parse(spec.fetcher.fetch(spec.url))) + html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))) for r in replies(html): spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 425d5e3..14f6dd9 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -46,6 +46,7 @@ class BaseTestProcess(ABC): spec = Spec( self.url(), self.fetcher(), + lambda x: x, PandocTexifier('pandoc'), buf) paperdoorknob.process(spec) @@ -61,7 +62,7 @@ Pretty sure. texifier = VerifyingTexifier( PandocTexifier('pandoc'), DirectTexifier()) buf = io.BytesIO() - spec = Spec(self.url(), self.fetcher(), texifier, buf) + spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf) paperdoorknob.process(spec) def testPDF(self) -> None: @@ -69,6 +70,7 @@ Pretty sure. 
spec = Spec( self.url(), self.fetcher(), + lambda x: x, PandocTexifier('pandoc'), out) paperdoorknob.process(spec) diff --git a/setup.py b/setup.py index edaa429..e83ec01 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ setup( py_modules=[ 'args', 'fetch', + 'htmlfilter', 'paperdoorknob', 'spec', 'texify', diff --git a/spec.py b/spec.py index 1322897..98832bf 100644 --- a/spec.py +++ b/spec.py @@ -7,7 +7,7 @@ from dataclasses import dataclass -from typing import IO +from typing import Callable, IO from fetch import Fetcher from texify import Texifier @@ -17,5 +17,6 @@ from texify import Texifier class Spec: url: str fetcher: Fetcher + htmlfilter: Callable[[bytes], bytes] texifier: Texifier texout: IO[bytes] -- 2.44.1 From 8be20b9d36c49271ed392038750dfba981c283b6 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 21:57:13 -0800 Subject: [PATCH 036/100] Extensible, flag-controlled DOM filters --- args.py | 6 ++++++ domfilter.py | 31 +++++++++++++++++++++++++++++++ domfilter_test.py | 42 ++++++++++++++++++++++++++++++++++++++++++ paperdoorknob.py | 11 ++--------- paperdoorknob_test.py | 20 ++++++++++++++------ setup.py | 1 + spec.py | 3 +++ 7 files changed, 99 insertions(+), 15 deletions(-) create mode 100644 domfilter.py create mode 100644 domfilter_test.py diff --git a/args.py b/args.py index 1931d42..2198a1d 100644 --- a/args.py +++ b/args.py @@ -13,6 +13,7 @@ from typing import Iterator from xdg_base_dirs import xdg_cache_home +from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher from htmlfilter import ApplyHTMLFilters, HTMLFilters from spec import Spec @@ -26,6 +27,10 @@ def _command_line_parser() -> ArgumentParser: metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) + parser.add_argument( + '--domfilters', + help='Which DOM filters to use (default: %(default)s)', + default=','.join(f[0] for f in DOMFilters)) parser.add_argument( 
'--htmlfilters', help='Which HTML filters to use (default: %(default)s)', @@ -53,5 +58,6 @@ def spec_from_commandline_args() -> Iterator[Spec]: args.url, fetcher, lambda x: ApplyHTMLFilters(args.htmlfilters, x), + lambda x: ApplyDOMFilters(args.domfilters, x), PandocTexifier(args.pandoc or 'pandoc'), texout) diff --git a/domfilter.py b/domfilter.py new file mode 100644 index 0000000..22f96a1 --- /dev/null +++ b/domfilter.py @@ -0,0 +1,31 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +from typing import Any, Callable, List, Tuple + +from bs4.element import Tag + + +DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [ + ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]), + ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]), +] + + +class DOMFilterError(Exception): + pass + + +def ApplyDOMFilters(filter_list: str, tag: Tag) -> None: + for filter_name in filter_list.split(','): + filters = [f for (name, f) in DOMFilters if name == filter_name] + if len(filters) == 0: + raise DOMFilterError(f"Unknown DOM filter: {filter_name}") + if len(filters) > 1: + raise DOMFilterError( + f"Multiple DOM filters with the same name!?: {filter_name}") + filters[0](tag) diff --git a/domfilter_test.py b/domfilter_test.py new file mode 100644 index 0000000..c3c8ee2 --- /dev/null +++ b/domfilter_test.py @@ -0,0 +1,42 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import unittest + +from bs4 import BeautifulSoup + +from domfilter import ApplyDOMFilters + + +class TestDOMFilters(unittest.TestCase): + + def setUp(self) -> None: + self._html = BeautifulSoup(b''' +
+
This is the edit box
+ This is glowfic + +
''', 'html.parser') + + def testStripFooters(self) -> None: + ApplyDOMFilters("NoFooter", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is the edit box", "This is glowfic"]) + + def testStripEditBoxes(self) -> None: + ApplyDOMFilters("NoEdit", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is glowfic", "This is the footer"]) + + def testStripFootersAndEditBoxes(self) -> None: + ApplyDOMFilters("NoEdit,NoFooter", self._html) + self.assertEqual(list(self._html.stripped_strings), + ["This is glowfic"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 34223e4..2bcf31a 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -20,14 +20,6 @@ def parse(content: bytes) -> BeautifulSoup: return BeautifulSoup(content, 'html.parser') -def clean(html: BeautifulSoup) -> BeautifulSoup: - for eb in html.find_all("div", class_="post-edit-box"): - eb.decompose() - for footer in html.find_all("div", class_="post-footer"): - footer.decompose() - return html - - def replies(html: BeautifulSoup) -> Iterable[Tag]: def text() -> Tag: body = html.body @@ -46,8 +38,9 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') - html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url)))) + html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) for r in replies(html): + spec.domfilter(r) spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 14f6dd9..aa50340 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -13,6 +13,7 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer +from domfilter import ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher from spec import Spec from texify import DirectTexifier, 
PandocTexifier, VerifyingTexifier @@ -31,11 +32,10 @@ class BaseTestProcess(ABC): raise NotImplementedError() def testReplies(self) -> None: - replies = paperdoorknob.replies( - paperdoorknob.clean( - paperdoorknob.parse( - self.fetcher().fetch( - self.url())))) + replies = list(paperdoorknob.replies( + paperdoorknob.parse(self.fetcher().fetch(self.url())))) + for r in replies: + ApplyDOMFilters('NoEdit,NoFooter', r) assert [r.text.strip() for r in replies] == [ "This is glowfic", "You sure?", @@ -47,6 +47,7 @@ class BaseTestProcess(ABC): self.url(), self.fetcher(), lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), buf) paperdoorknob.process(spec) @@ -62,7 +63,13 @@ Pretty sure. texifier = VerifyingTexifier( PandocTexifier('pandoc'), DirectTexifier()) buf = io.BytesIO() - spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf) + spec = Spec( + self.url(), + self.fetcher(), + lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + texifier, + buf) paperdoorknob.process(spec) def testPDF(self) -> None: @@ -71,6 +78,7 @@ Pretty sure. 
self.url(), self.fetcher(), lambda x: x, + lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), out) paperdoorknob.process(spec) diff --git a/setup.py b/setup.py index e83ec01..948b6bc 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ setup( url="https://git.scottworley.com/paperdoorknob", py_modules=[ 'args', + 'domfilter', 'fetch', 'htmlfilter', 'paperdoorknob', diff --git a/spec.py b/spec.py index 98832bf..75d00c3 100644 --- a/spec.py +++ b/spec.py @@ -9,6 +9,8 @@ from dataclasses import dataclass from typing import Callable, IO +from bs4.element import Tag + from fetch import Fetcher from texify import Texifier @@ -18,5 +20,6 @@ class Spec: url: str fetcher: Fetcher htmlfilter: Callable[[bytes], bytes] + domfilter: Callable[[Tag], None] texifier: Texifier texout: IO[bytes] -- 2.44.1 From c04b2e7812ebd48c2d9273122deef888483bc9f6 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 22:07:37 -0800 Subject: [PATCH 037/100] Project Lawful start URL in --help --- args.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/args.py b/args.py index 2198a1d..fb36083 100644 --- a/args.py +++ b/args.py @@ -45,7 +45,9 @@ def _command_line_parser() -> ArgumentParser: '--timeout', help='How long to wait for HTTP requests, in seconds', default=30) - parser.add_argument('url', help='URL to retrieve') + parser.add_argument( + 'url', + help='URL to retrieve (example: https://www.projectlawful.com/posts/4582 )') return parser -- 2.44.1 From e10b5b6f112c057ab33ad46f8a3385d3bcd23e1d Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 22:38:46 -0800 Subject: [PATCH 038/100] Specify page geometry --- args.py | 8 ++++++++ paperdoorknob.py | 7 ++++++- paperdoorknob_test.py | 4 ++++ spec.py | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/args.py b/args.py index fb36083..f343a84 100644 --- a/args.py +++ b/args.py @@ -31,6 +31,13 @@ def _command_line_parser() -> ArgumentParser: '--domfilters', 
help='Which DOM filters to use (default: %(default)s)', default=','.join(f[0] for f in DOMFilters)) + parser.add_argument( + '--geometry', + help='''Page size and margin control +See https://faculty.bard.edu/bloch/geometry.pdf for details +(default: %(default)s)''', + default='paperwidth=5.5in,paperheight=8.5in,nohead,' + + 'tmargin=15mm,hmargin=15mm,bmargin=17mm,foot=4mm') parser.add_argument( '--htmlfilters', help='Which HTML filters to use (default: %(default)s)', @@ -62,4 +69,5 @@ def spec_from_commandline_args() -> Iterator[Spec]: lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), PandocTexifier(args.pandoc or 'pandoc'), + args.geometry, texout) diff --git a/paperdoorknob.py b/paperdoorknob.py index 2bcf31a..26eb7be 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -37,7 +37,12 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]: def process(spec: Spec) -> None: - spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n') + spec.texout.write(b'\\documentclass{article}\n') + if spec.geometry is not None: + spec.texout.write(b'\\usepackage[' + + spec.geometry.encode('UTF-8') + + b']{geometry}\n') + spec.texout.write(b'\\begin{document}\n') html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) for r in replies(html): spec.domfilter(r) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index aa50340..49f1872 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -49,9 +49,11 @@ class BaseTestProcess(ABC): lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), + 'margin=20mm', buf) paperdoorknob.process(spec) assert buf.getvalue() == b'''\\documentclass{article} +\\usepackage[margin=20mm]{geometry} \\begin{document} This is glowfic You \\emph{sure}? @@ -69,6 +71,7 @@ Pretty sure. lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), texifier, + None, buf) paperdoorknob.process(spec) @@ -80,6 +83,7 @@ Pretty sure. 
lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), + None, out) paperdoorknob.process(spec) subprocess.run(['pdflatex', 'test.tex'], diff --git a/spec.py b/spec.py index 75d00c3..50746d7 100644 --- a/spec.py +++ b/spec.py @@ -22,4 +22,5 @@ class Spec: htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] texifier: Texifier + geometry: str | None texout: IO[bytes] -- 2.44.1 From 91fe9916122adaee2cf1695040f906d709e1aa1c Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 23:39:34 -0800 Subject: [PATCH 039/100] ImageStore --- fetch.py | 5 +++ images.py | 48 ++++++++++++++++++++++++++++ images_test.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 1 + 4 files changed, 141 insertions(+) create mode 100644 images.py create mode 100644 images_test.py diff --git a/fetch.py b/fetch.py index 7776f93..d526753 100644 --- a/fetch.py +++ b/fetch.py @@ -47,8 +47,13 @@ class FakeFetcher(Fetcher): def __init__(self, resources: dict[str, bytes]) -> None: self._resources = resources + self._fetch_count = 0 def fetch(self, url: str) -> bytes: + self._fetch_count += 1 if url not in self._resources: raise requests.HTTPError("URL not found") return self._resources[url] + + def request_count(self) -> int: + return self._fetch_count diff --git a/images.py b/images.py new file mode 100644 index 0000000..178f708 --- /dev/null +++ b/images.py @@ -0,0 +1,48 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ +import os +import os.path + +from fetch import Fetcher + + +class ImageStoreError(Exception): + pass + + +class ImageStore: + + def __init__(self, root_path: str, fetcher: Fetcher) -> None: + self._root_path = root_path + self._fetcher = fetcher + self._images: dict[str, str] = {} + self._filenames: set[str] = set() + + def _filename(self, url: str) -> str: + assert url not in self._images + base = os.path.basename(url) + if base not in self._filenames: + return base + stem, ext = os.path.splitext(base) + for i in range(10000): + name = f"{stem}-{i:04d}{ext}" + if name not in self._filenames: + return name + raise ImageStoreError( + 'Unexpectedly-many similarly-named images fetched?') + + def get_image(self, url: str) -> str: + if url not in self._images: + image_data = self._fetcher.fetch(url) + name = self._filename(url) + path = os.path.join(self._root_path, name) + os.makedirs(self._root_path, exist_ok=True) + with open(path, "wb") as f: + f.write(image_data) + self._filenames.add(name) + self._images[url] = path + return self._images[url] diff --git a/images_test.py b/images_test.py new file mode 100644 index 0000000..8ada1ee --- /dev/null +++ b/images_test.py @@ -0,0 +1,87 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import unittest + +from fetch import FakeFetcher +from images import ImageStore + + +class TestImageStore(unittest.TestCase): + def setUp(self) -> None: + self._fetcher = FakeFetcher({ + 'https://example.com/images/alice.png': b'ALICE', + 'https://example.com/images/bob.jpeg': b'BOB', + 'https://example.com/alt_images/bob.jpeg': b'BOBBY', + 'https://example.com/images/carol': b'CAROL', + 'https://example.com/alt_images/carol': b'CAROLINE', + 'https://example.com/other_images/carol': b'CAROLINA'}) + + def testFetchOnce(self) -> None: + store = ImageStore('istest_fetch_once', self._fetcher) + self.assertEqual(self._fetcher.request_count(), 0) + a1 = store.get_image('https://example.com/images/alice.png') + self.assertEqual(self._fetcher.request_count(), 1) + a2 = store.get_image('https://example.com/images/alice.png') + self.assertEqual(self._fetcher.request_count(), 1) + self.assertEqual(a1, a2) + self.assertEqual(a1, 'istest_fetch_once/alice.png') + with open(a1, 'rb') as f: + self.assertEqual(f.read(), b'ALICE') + + self.assertEqual(self._fetcher.request_count(), 1) + b1 = store.get_image('https://example.com/images/bob.jpeg') + self.assertEqual(self._fetcher.request_count(), 2) + b2 = store.get_image('https://example.com/images/bob.jpeg') + self.assertEqual(self._fetcher.request_count(), 2) + self.assertEqual(b1, b2) + self.assertEqual(b1, 'istest_fetch_once/bob.jpeg') + with open(b1, 'rb') as f: + self.assertEqual(f.read(), b'BOB') + + a3 = store.get_image('https://example.com/images/alice.png') + self.assertEqual(self._fetcher.request_count(), 2) + self.assertEqual(a1, a3) + + def testNameCollision(self) -> None: + store = ImageStore('istest_name_collision', self._fetcher) + self.assertEqual(self._fetcher.request_count(), 0) + b1 = store.get_image('https://example.com/images/bob.jpeg') + self.assertEqual(self._fetcher.request_count(), 1) + b2 = store.get_image('https://example.com/alt_images/bob.jpeg') + self.assertEqual(self._fetcher.request_count(), 2) + 
self.assertNotEqual(b1, b2) + self.assertEqual(b1, 'istest_name_collision/bob.jpeg') + self.assertEqual(b2, 'istest_name_collision/bob-0000.jpeg') + with open(b1, 'rb') as f: + self.assertEqual(f.read(), b'BOB') + with open(b2, 'rb') as f: + self.assertEqual(f.read(), b'BOBBY') + + self.assertEqual(self._fetcher.request_count(), 2) + c1 = store.get_image('https://example.com/images/carol') + self.assertEqual(self._fetcher.request_count(), 3) + c2 = store.get_image('https://example.com/alt_images/carol') + self.assertEqual(self._fetcher.request_count(), 4) + c3 = store.get_image('https://example.com/other_images/carol') + self.assertEqual(self._fetcher.request_count(), 5) + self.assertNotEqual(c1, c2) + self.assertNotEqual(c2, c3) + self.assertNotEqual(c1, c3) + self.assertEqual(c1, 'istest_name_collision/carol') + self.assertEqual(c2, 'istest_name_collision/carol-0000') + self.assertEqual(c3, 'istest_name_collision/carol-0001') + with open(c1, 'rb') as f: + self.assertEqual(f.read(), b'CAROL') + with open(c2, 'rb') as f: + self.assertEqual(f.read(), b'CAROLINE') + with open(c3, 'rb') as f: + self.assertEqual(f.read(), b'CAROLINA') + + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index 948b6bc..1ef11e6 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ setup( 'domfilter', 'fetch', 'htmlfilter', + 'images', 'paperdoorknob', 'spec', 'texify', -- 2.44.1 From 432bb83bc214a4b3212471a69ace15090cbb7d37 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 23:48:57 -0800 Subject: [PATCH 040/100] Allow many Spec fields The whole point of Spec is to be a simple, inert, immutable collection of largely independent program elements easily at hand. It's being used here like a namespace, not a pile of shared mutable state for which a field limit of 7 would make more sense. Let Spec grow. 
--- spec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spec.py b/spec.py index 50746d7..c7ace56 100644 --- a/spec.py +++ b/spec.py @@ -15,6 +15,7 @@ from fetch import Fetcher from texify import Texifier +# pylint: disable=too-many-instance-attributes @dataclass(frozen=True) class Spec: url: str -- 2.44.1 From cfd0b634e21851a0d9e8b4c8c258307baefcd931 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Tue, 19 Dec 2023 23:55:05 -0800 Subject: [PATCH 041/100] Pass an ImageStore around --- args.py | 2 ++ paperdoorknob_test.py | 4 ++++ spec.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/args.py b/args.py index f343a84..d90de8a 100644 --- a/args.py +++ b/args.py @@ -16,6 +16,7 @@ from xdg_base_dirs import xdg_cache_home from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher from htmlfilter import ApplyHTMLFilters, HTMLFilters +from images import ImageStore from spec import Spec from texify import PandocTexifier @@ -66,6 +67,7 @@ def spec_from_commandline_args() -> Iterator[Spec]: yield Spec( args.url, fetcher, + ImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), PandocTexifier(args.pandoc or 'pandoc'), diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 49f1872..65fc2a9 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -15,6 +15,7 @@ import paperdoorknob from testing.fakeserver import FakeGlowficServer from domfilter import ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher +from images import ImageStore from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier @@ -46,6 +47,7 @@ class BaseTestProcess(ABC): spec = Spec( self.url(), self.fetcher(), + ImageStore('is', self.fetcher()), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), @@ -68,6 +70,7 @@ Pretty sure. 
spec = Spec( self.url(), self.fetcher(), + ImageStore('is', self.fetcher()), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), texifier, @@ -80,6 +83,7 @@ Pretty sure. spec = Spec( self.url(), self.fetcher(), + ImageStore('is', self.fetcher()), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), diff --git a/spec.py b/spec.py index c7ace56..2b80f16 100644 --- a/spec.py +++ b/spec.py @@ -13,6 +13,7 @@ from bs4.element import Tag from fetch import Fetcher from texify import Texifier +from images import ImageStore # pylint: disable=too-many-instance-attributes @@ -20,6 +21,7 @@ from texify import Texifier class Spec: url: str fetcher: Fetcher + images: ImageStore htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] texifier: Texifier -- 2.44.1 From e6adf6ced68d667429975110f2d1a1bd9c8d79b6 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 11:43:09 -0800 Subject: [PATCH 042/100] More structure and tests around splitting the page into chunks' DOMs. --- glowfic.py | 39 +++++++++++++++++++++++++++++++++ glowfic_test.py | 50 +++++++++++++++++++++++++++++++++++++++++++ paperdoorknob.py | 24 ++------------------- paperdoorknob_test.py | 10 --------- setup.py | 1 + 5 files changed, 92 insertions(+), 32 deletions(-) create mode 100644 glowfic.py create mode 100644 glowfic_test.py diff --git a/glowfic.py b/glowfic.py new file mode 100644 index 0000000..901246b --- /dev/null +++ b/glowfic.py @@ -0,0 +1,39 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import itertools + +from typing import Iterable + +from bs4 import BeautifulSoup +from bs4.element import Tag + +# We avoid the name "post" because the Glowfic community uses the term +# inconsistently: +# * The Glowfic software sometimes uses "post" to refer to a whole thread +# (eg: in the URL), but more often uses "post" to refer to just the first +# chunk in a thread. The non-first chunks are "replies". +# * Readers and this software don't need to distinguish first-chunks and +# non-first-chunks. +# * Humans in the community tend to use "posts" to mean "chunks" ("replies" +# in the Glowfic software's lexicon). + + +def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: + def text() -> Tag: + body = html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def the_replies() -> Iterable[Tag]: + rs = html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in rs) + return rs + + return itertools.chain([text()], the_replies()) diff --git a/glowfic_test.py b/glowfic_test.py new file mode 100644 index 0000000..d847333 --- /dev/null +++ b/glowfic_test.py @@ -0,0 +1,50 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. + + +import unittest + +from bs4 import BeautifulSoup + +from glowfic import chunkDOMs + + +class TestSplit(unittest.TestCase): + + def testSplit1(self) -> None: + soup = BeautifulSoup(b''' +
+ The "post" +
''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"']]) + + def testSplit2(self) -> None: + soup = BeautifulSoup(b''' + +
The "post"
+
+
The "reply"
+
+ ''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"'], ['The "reply"']]) + + def testSplit3(self) -> None: + soup = BeautifulSoup(b''' + +
The "post"
+
+
1st reply
+
2nd reply
+
+ ''', 'html.parser') + self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + [['The "post"'], ['1st reply'], ['2nd reply']]) + + +if __name__ == '__main__': + unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index 26eb7be..84ed702 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -5,14 +5,10 @@ # Free Software Foundation, version 3. -import itertools - -from typing import Iterable - from bs4 import BeautifulSoup -from bs4.element import Tag from args import spec_from_commandline_args +from glowfic import chunkDOMs from spec import Spec @@ -20,22 +16,6 @@ def parse(content: bytes) -> BeautifulSoup: return BeautifulSoup(content, 'html.parser') -def replies(html: BeautifulSoup) -> Iterable[Tag]: - def text() -> Tag: - body = html.body - assert body - text = body.find_next("div", class_="post-post") - assert isinstance(text, Tag) - return text - - def the_replies() -> Iterable[Tag]: - rs = html.find_all("div", class_="post-reply") - assert all(isinstance(r, Tag) for r in rs) - return rs - - return itertools.chain([text()], the_replies()) - - def process(spec: Spec) -> None: spec.texout.write(b'\\documentclass{article}\n') if spec.geometry is not None: @@ -44,7 +24,7 @@ def process(spec: Spec) -> None: b']{geometry}\n') spec.texout.write(b'\\begin{document}\n') html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) - for r in replies(html): + for r in chunkDOMs(html): spec.domfilter(r) spec.texout.write(spec.texifier.texify(r)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 65fc2a9..249b6e4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -32,16 +32,6 @@ class BaseTestProcess(ABC): def fetcher(self) -> Fetcher: raise NotImplementedError() - def testReplies(self) -> None: - replies = list(paperdoorknob.replies( - paperdoorknob.parse(self.fetcher().fetch(self.url())))) - for r in replies: - ApplyDOMFilters('NoEdit,NoFooter', r) - assert [r.text.strip() 
for r in replies] == [ - "This is glowfic", - "You sure?", - "Pretty sure."] - def testProcess(self) -> None: buf = io.BytesIO() spec = Spec( diff --git a/setup.py b/setup.py index 1ef11e6..2152739 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ setup( 'args', 'domfilter', 'fetch', + 'glowfic', 'htmlfilter', 'images', 'paperdoorknob', -- 2.44.1 From 4b7d905ec23e1190bcb65a3041816e07f2663d9b Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 13:10:55 -0800 Subject: [PATCH 043/100] FakeImageStore for tests --- args.py | 4 ++-- images.py | 16 +++++++++++++++- images_test.py | 6 +++--- paperdoorknob_test.py | 8 ++++---- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/args.py b/args.py index d90de8a..f966ba9 100644 --- a/args.py +++ b/args.py @@ -16,7 +16,7 @@ from xdg_base_dirs import xdg_cache_home from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher from htmlfilter import ApplyHTMLFilters, HTMLFilters -from images import ImageStore +from images import DiskImageStore from spec import Spec from texify import PandocTexifier @@ -67,7 +67,7 @@ def spec_from_commandline_args() -> Iterator[Spec]: yield Spec( args.url, fetcher, - ImageStore(args.out + '_images', fetcher), + DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), PandocTexifier(args.pandoc or 'pandoc'), diff --git a/images.py b/images.py index 178f708..d9926a3 100644 --- a/images.py +++ b/images.py @@ -4,17 +4,25 @@ # under the terms of the GNU General Public License as published by the # Free Software Foundation, version 3. 
+from abc import ABC, abstractmethod import os import os.path from fetch import Fetcher +class ImageStore(ABC): + + @abstractmethod + def get_image(self, url: str) -> str: + raise NotImplementedError() + + class ImageStoreError(Exception): pass -class ImageStore: +class DiskImageStore(ImageStore): def __init__(self, root_path: str, fetcher: Fetcher) -> None: self._root_path = root_path @@ -46,3 +54,9 @@ class ImageStore: self._filenames.add(name) self._images[url] = path return self._images[url] + + +class FakeImageStore(ImageStore): + + def get_image(self, url: str) -> str: + return 'stored:' + url diff --git a/images_test.py b/images_test.py index 8ada1ee..3f3fd13 100644 --- a/images_test.py +++ b/images_test.py @@ -8,7 +8,7 @@ import unittest from fetch import FakeFetcher -from images import ImageStore +from images import DiskImageStore class TestImageStore(unittest.TestCase): @@ -22,7 +22,7 @@ class TestImageStore(unittest.TestCase): 'https://example.com/other_images/carol': b'CAROLINA'}) def testFetchOnce(self) -> None: - store = ImageStore('istest_fetch_once', self._fetcher) + store = DiskImageStore('istest_fetch_once', self._fetcher) self.assertEqual(self._fetcher.request_count(), 0) a1 = store.get_image('https://example.com/images/alice.png') self.assertEqual(self._fetcher.request_count(), 1) @@ -48,7 +48,7 @@ class TestImageStore(unittest.TestCase): self.assertEqual(a1, a3) def testNameCollision(self) -> None: - store = ImageStore('istest_name_collision', self._fetcher) + store = DiskImageStore('istest_name_collision', self._fetcher) self.assertEqual(self._fetcher.request_count(), 0) b1 = store.get_image('https://example.com/images/bob.jpeg') self.assertEqual(self._fetcher.request_count(), 1) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 249b6e4..7ceb3ea 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -15,7 +15,7 @@ import paperdoorknob from testing.fakeserver import FakeGlowficServer from domfilter import 
ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher -from images import ImageStore +from images import FakeImageStore from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier @@ -37,7 +37,7 @@ class BaseTestProcess(ABC): spec = Spec( self.url(), self.fetcher(), - ImageStore('is', self.fetcher()), + FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), @@ -60,7 +60,7 @@ Pretty sure. spec = Spec( self.url(), self.fetcher(), - ImageStore('is', self.fetcher()), + FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), texifier, @@ -73,7 +73,7 @@ Pretty sure. spec = Spec( self.url(), self.fetcher(), - ImageStore('is', self.fetcher()), + FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), PandocTexifier('pandoc'), -- 2.44.1 From aa060d9ba5891fd87fc62664fb30b75bf0463e3e Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 13:44:27 -0800 Subject: [PATCH 044/100] Reify Chunk --- glowfic.py | 48 +++++++++++++++++++++++++++++++++---- glowfic_test.py | 20 +++++++++++++++- testdata/empty-content.html | 32 +++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 testdata/empty-content.html diff --git a/glowfic.py b/glowfic.py index 901246b..4169786 100644 --- a/glowfic.py +++ b/glowfic.py @@ -5,6 +5,7 @@ # Free Software Foundation, version 3. 
+from dataclasses import dataclass import itertools from typing import Iterable @@ -12,15 +13,26 @@ from typing import Iterable from bs4 import BeautifulSoup from bs4.element import Tag +from images import ImageStore + + +@dataclass(frozen=True) +class Chunk: + icon: str | None + character: str | None + screen_name: str | None + author: str | None + content: Tag + # We avoid the name "post" because the Glowfic community uses the term # inconsistently: # * The Glowfic software sometimes uses "post" to refer to a whole thread -# (eg: in the URL), but more often uses "post" to refer to just the first -# chunk in a thread. The non-first chunks are "replies". +# (in the URL), sometimes uses "post" to refer to chunks (in the CSS), +# but mostly uses "post" to refer to just the first chunk in a thread +# (in the HTML and UI). The non-first chunks are "replies". # * Readers and this software don't need to distinguish first-chunks and # non-first-chunks. -# * Humans in the community tend to use "posts" to mean "chunks" ("replies" -# in the Glowfic software's lexicon). +# * Humans in the community tend to use "posts" to mean chunks. 
def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: @@ -37,3 +49,31 @@ def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: return rs return itertools.chain([text()], the_replies()) + + +def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: + + def getIcon() -> str | None: + icon_div = chunk_dom.find_next('div', class_='post-icon') + if icon_div is None: + return None + icon_img = icon_div.find_next('img') + if icon_img is None: + return None + assert isinstance(icon_img, Tag) + return image_store.get_image(icon_img.attrs['src']) + + def getTextByClass(css_class: str) -> str | None: + div = chunk_dom.find_next('div', class_=css_class) + if div is None: + return None + return div.text.strip() + + content = chunk_dom.find_next('div', class_='post-content') + assert isinstance(content, Tag) + + return Chunk(getIcon(), + getTextByClass('post-character'), + getTextByClass('post-screenname'), + getTextByClass('post-author'), + content) diff --git a/glowfic_test.py b/glowfic_test.py index d847333..d3a6d56 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -9,7 +9,8 @@ import unittest from bs4 import BeautifulSoup -from glowfic import chunkDOMs +from images import FakeImageStore +from glowfic import chunkDOMs, makeChunk class TestSplit(unittest.TestCase): @@ -46,5 +47,22 @@ class TestSplit(unittest.TestCase): [['The "post"'], ['1st reply'], ['2nd reply']]) +class TestMakeChunk(unittest.TestCase): + + def testEmptyContent(self) -> None: + with open('testdata/empty-content.html', 'rb') as f: + soup = BeautifulSoup(f, 'html.parser') + c = makeChunk(next(iter(chunkDOMs(soup))), FakeImageStore()) + self.assertEqual( + c.icon, + 'stored:https://d1anwqy6ci9o1i.cloudfront.net/' + + 'users%2F366%2Ficons%2Fxqmypqvflgdy28aorw9ml_shock.png') + self.assertEqual(c.character, 'Keltham') + self.assertEqual(c.screen_name, 'lawful chaotic') + self.assertEqual(c.author, 'Iarwain') + self.assertEqual(str(c.content), + '

') + + if __name__ == '__main__': unittest.main() diff --git a/testdata/empty-content.html b/testdata/empty-content.html new file mode 100644 index 0000000..e1dc3f5 --- /dev/null +++ b/testdata/empty-content.html @@ -0,0 +1,32 @@ + + +
+ +
+ +
+ + Permalink + +
+

+
+ +
+ + -- 2.44.1 From 8d583d5bdb7c8c2c6f7dfc3336c48a9ac04393ca Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 15:25:53 -0800 Subject: [PATCH 045/100] New dependency: wrapfig TeX package --- default.nix | 4 ++-- paperdoorknob.py | 2 +- paperdoorknob_test.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/default.nix b/default.nix index 3e02638..c27c9e6 100644 --- a/default.nix +++ b/default.nix @@ -1,7 +1,7 @@ { pkgs ? import { }, lint ? false }: pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper , autopep8, mypy, pylint, beautifulsoup4, requests, requests-cache - , texliveBasic, types-beautifulsoup4, types-requests, xdg-base-dirs + , texliveFull, types-beautifulsoup4, types-requests, xdg-base-dirs , pandoc-cli }: buildPythonPackage rec { pname = "paperdoorknob"; @@ -11,7 +11,7 @@ pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, makeWrapper propagatedBuildInputs = [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; nativeCheckInputs = - [ mypy pandoc-cli texliveBasic types-beautifulsoup4 types-requests ] + [ mypy pandoc-cli texliveFull types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; diff --git a/paperdoorknob.py b/paperdoorknob.py index 84ed702..160f0db 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -17,7 +17,7 @@ def parse(content: bytes) -> BeautifulSoup: def process(spec: Spec) -> None: - spec.texout.write(b'\\documentclass{article}\n') + spec.texout.write(b'\\documentclass{article}\n\\usepackage{wrapfig}\n') if spec.geometry is not None: spec.texout.write(b'\\usepackage[' + spec.geometry.encode('UTF-8') + diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 7ceb3ea..4291bb4 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -45,6 +45,7 @@ class BaseTestProcess(ABC): buf) paperdoorknob.process(spec) assert buf.getvalue() == b'''\\documentclass{article} +\\usepackage{wrapfig} 
\\usepackage[margin=20mm]{geometry} \\begin{document} This is glowfic -- 2.44.1 From d2a41ff4f433a29173259359dcf477fd59ee3c78 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 15:42:32 -0800 Subject: [PATCH 046/100] Reify Layout --- args.py | 4 ++- glowfic.py | 46 +++++++++++++++++++++++++++++++++++ paperdoorknob.py | 10 +++++--- paperdoorknob_test.py | 8 +++--- spec.py | 4 +-- testdata/this-is-glowfic.html | 6 ++--- 6 files changed, 66 insertions(+), 12 deletions(-) diff --git a/args.py b/args.py index f966ba9..62b9404 100644 --- a/args.py +++ b/args.py @@ -15,6 +15,7 @@ from xdg_base_dirs import xdg_cache_home from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher +from glowfic import BelowIconLayout from htmlfilter import ApplyHTMLFilters, HTMLFilters from images import DiskImageStore from spec import Spec @@ -62,6 +63,7 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details @contextmanager def spec_from_commandline_args() -> Iterator[Spec]: args = _command_line_parser().parse_args() + texifier = PandocTexifier(args.pandoc or 'pandoc') with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: yield Spec( @@ -70,6 +72,6 @@ def spec_from_commandline_args() -> Iterator[Spec]: DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), - PandocTexifier(args.pandoc or 'pandoc'), + BelowIconLayout(texifier), args.geometry, texout) diff --git a/glowfic.py b/glowfic.py index 4169786..0eab425 100644 --- a/glowfic.py +++ b/glowfic.py @@ -5,6 +5,7 @@ # Free Software Foundation, version 3. 
+from abc import ABC, abstractmethod from dataclasses import dataclass import itertools @@ -14,6 +15,7 @@ from bs4 import BeautifulSoup from bs4.element import Tag from images import ImageStore +from texify import Texifier @dataclass(frozen=True) @@ -77,3 +79,47 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: getTextByClass('post-screenname'), getTextByClass('post-author'), content) + + +def renderIcon(icon_path: str | None) -> bytes: + return b'\\includegraphics{%s}' % icon_path.encode( + 'UTF-8') if icon_path else b'' + + +class Layout(ABC): + + @abstractmethod + def renderChunk(self, chunk: Chunk) -> bytes: + raise NotImplementedError() + + +class ContentOnlyLayout(Layout): + + def __init__(self, texifier: Texifier) -> None: + self._texifier = texifier + + def renderChunk(self, chunk: Chunk) -> bytes: + return self._texifier.texify(chunk.content) + + +class BelowIconLayout(Layout): + + def __init__(self, texifier: Texifier) -> None: + self._texifier = texifier + + def renderChunk(self, chunk: Chunk) -> bytes: + icon_width = b'0.25\\textwidth' # TODO: Make this configurable + return b'''\\begin{wrapfigure}{l}{%s} +%s +%s +%s +%s +\\end{wrapfigure} +%s +''' % ( + icon_width, + renderIcon(chunk.icon), + chunk.character.encode('UTF-8') if chunk.character else b'', + chunk.screen_name.encode('UTF-8') if chunk.screen_name else b'', + chunk.author.encode('UTF-8') if chunk.author else b'', + self._texifier.texify(chunk.content)) diff --git a/paperdoorknob.py b/paperdoorknob.py index 160f0db..f8b3361 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup from args import spec_from_commandline_args -from glowfic import chunkDOMs +from glowfic import chunkDOMs, makeChunk from spec import Spec @@ -17,7 +17,10 @@ def parse(content: bytes) -> BeautifulSoup: def process(spec: Spec) -> None: - spec.texout.write(b'\\documentclass{article}\n\\usepackage{wrapfig}\n') + spec.texout.write(b'''\\documentclass{article} 
+\\usepackage{graphicx} +\\usepackage{wrapfig} +''') if spec.geometry is not None: spec.texout.write(b'\\usepackage[' + spec.geometry.encode('UTF-8') + @@ -26,7 +29,8 @@ def process(spec: Spec) -> None: html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) for r in chunkDOMs(html): spec.domfilter(r) - spec.texout.write(spec.texifier.texify(r)) + chunk = makeChunk(r, spec.images) + spec.texout.write(spec.layout.renderChunk(chunk)) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 4291bb4..c1f8b60 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -15,6 +15,7 @@ import paperdoorknob from testing.fakeserver import FakeGlowficServer from domfilter import ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher +from glowfic import ContentOnlyLayout, BelowIconLayout from images import FakeImageStore from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier @@ -40,11 +41,12 @@ class BaseTestProcess(ABC): FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), - PandocTexifier('pandoc'), + ContentOnlyLayout(PandocTexifier('pandoc')), 'margin=20mm', buf) paperdoorknob.process(spec) assert buf.getvalue() == b'''\\documentclass{article} +\\usepackage{graphicx} \\usepackage{wrapfig} \\usepackage[margin=20mm]{geometry} \\begin{document} @@ -64,7 +66,7 @@ Pretty sure. FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), - texifier, + ContentOnlyLayout(texifier), None, buf) paperdoorknob.process(spec) @@ -77,7 +79,7 @@ Pretty sure. 
FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), - PandocTexifier('pandoc'), + BelowIconLayout(PandocTexifier('pandoc')), None, out) paperdoorknob.process(spec) diff --git a/spec.py b/spec.py index 2b80f16..34e2e6b 100644 --- a/spec.py +++ b/spec.py @@ -12,7 +12,7 @@ from typing import Callable, IO from bs4.element import Tag from fetch import Fetcher -from texify import Texifier +from glowfic import Layout from images import ImageStore @@ -24,6 +24,6 @@ class Spec: images: ImageStore htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] - texifier: Texifier + layout: Layout geometry: str | None texout: IO[bytes] diff --git a/testdata/this-is-glowfic.html b/testdata/this-is-glowfic.html index 22c7a37..207cdd1 100644 --- a/testdata/this-is-glowfic.html +++ b/testdata/this-is-glowfic.html @@ -2,17 +2,17 @@
We don't want edit boxes
- This is glowfic +
This is glowfic
We don't want edit boxes
- You sure? +
You sure?
- Pretty sure. +
Pretty sure.
-- 2.44.1 From d59b46dbd038d61afd875c04e6dcc39e9395f414 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 15:57:46 -0800 Subject: [PATCH 047/100] LaTeX doesn't like % in filenames --- images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images.py b/images.py index d9926a3..68a9696 100644 --- a/images.py +++ b/images.py @@ -32,7 +32,7 @@ class DiskImageStore(ImageStore): def _filename(self, url: str) -> str: assert url not in self._images - base = os.path.basename(url) + base = os.path.basename(url).replace('%', '') if base not in self._filenames: return base stem, ext = os.path.splitext(base) -- 2.44.1 From c62e8d404dd179bc5a84794f307120572185e397 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 16:14:30 -0800 Subject: [PATCH 048/100] Uniform icon image size --- args.py | 6 +++++- glowfic.py | 11 ++++++----- paperdoorknob_test.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/args.py b/args.py index 62b9404..e952c45 100644 --- a/args.py +++ b/args.py @@ -44,6 +44,10 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details '--htmlfilters', help='Which HTML filters to use (default: %(default)s)', default=','.join(f[0] for f in HTMLFilters)) + parser.add_argument( + '--image_size', + help='How large the icon images are, in mm', + default=20) parser.add_argument( '--out', help='The filename stem at which to write output ' + @@ -72,6 +76,6 @@ def spec_from_commandline_args() -> Iterator[Spec]: DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), - BelowIconLayout(texifier), + BelowIconLayout(texifier, args.image_size), args.geometry, texout) diff --git a/glowfic.py b/glowfic.py index 0eab425..1249663 100644 --- a/glowfic.py +++ b/glowfic.py @@ -81,9 +81,9 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -def renderIcon(icon_path: str | None) -> bytes: - return 
b'\\includegraphics{%s}' % icon_path.encode( - 'UTF-8') if icon_path else b'' +def renderIcon(icon_path: str | None, image_size: float) -> bytes: + return b'\\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( + image_size, image_size, icon_path.encode('UTF-8')) if icon_path else b'' class Layout(ABC): @@ -104,8 +104,9 @@ class ContentOnlyLayout(Layout): class BelowIconLayout(Layout): - def __init__(self, texifier: Texifier) -> None: + def __init__(self, texifier: Texifier, image_size: float) -> None: self._texifier = texifier + self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: icon_width = b'0.25\\textwidth' # TODO: Make this configurable @@ -118,7 +119,7 @@ class BelowIconLayout(Layout): %s ''' % ( icon_width, - renderIcon(chunk.icon), + renderIcon(chunk.icon, self._image_size), chunk.character.encode('UTF-8') if chunk.character else b'', chunk.screen_name.encode('UTF-8') if chunk.screen_name else b'', chunk.author.encode('UTF-8') if chunk.author else b'', diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index c1f8b60..a0aeb21 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -79,7 +79,7 @@ Pretty sure. 
FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), - BelowIconLayout(PandocTexifier('pandoc')), + BelowIconLayout(PandocTexifier('pandoc'), 20), None, out) paperdoorknob.process(spec) -- 2.44.1 From 163851310b141ce92013cbc13e40bc4a17669a54 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 16:17:54 -0800 Subject: [PATCH 049/100] Paragraph breaks --- glowfic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/glowfic.py b/glowfic.py index 1249663..a4f3ca9 100644 --- a/glowfic.py +++ b/glowfic.py @@ -112,11 +112,16 @@ class BelowIconLayout(Layout): icon_width = b'0.25\\textwidth' # TODO: Make this configurable return b'''\\begin{wrapfigure}{l}{%s} %s + %s + %s + %s \\end{wrapfigure} + %s + ''' % ( icon_width, renderIcon(chunk.icon, self._image_size), -- 2.44.1 From 75c5665e089279f9de1a38b31e03f9736ecdc17e Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 21:21:53 -0800 Subject: [PATCH 050/100] Report fetch cache hit rate --- fetch.py | 43 ++++++++++++++++++++++++++++++++++++++++--- fetch_test.py | 10 ++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/fetch.py b/fetch.py index d526753..1d20701 100644 --- a/fetch.py +++ b/fetch.py @@ -7,7 +7,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import Iterator +from sys import stderr +from typing import IO, Iterator import requests import requests_cache @@ -31,6 +32,33 @@ class _SessionFetcher(Fetcher): return r.content +class _CachingFetcher(Fetcher): + + def __init__( + self, + session: requests_cache.CachedSession, + timeout: int) -> None: + self._session = session + self._timeout = timeout + self._request_count = 0 + self._cache_hit_count = 0 + + def fetch(self, url: str) -> bytes: + with self._session.get(url, timeout=self._timeout) as r: + r.raise_for_status() + self._request_count += 1 + self._cache_hit_count += int(r.from_cache) + return r.content + + @property + def 
request_count(self) -> int: + return self._request_count + + @property + def cache_hit_count(self) -> int: + return self._cache_hit_count + + @contextmanager def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]: with requests.session() as session: @@ -38,9 +66,18 @@ def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]: @contextmanager -def CachingFetcher(cache_path: str, timeout: int) -> Iterator[_SessionFetcher]: +def CachingFetcher( + cache_path: str, + timeout: int, + report_stream: IO[str] = stderr) -> Iterator[_CachingFetcher]: with requests_cache.CachedSession(cache_path, cache_control=True) as session: - yield _SessionFetcher(session, timeout) + fetcher = _CachingFetcher(session, timeout) + yield fetcher + if fetcher.request_count > 0: + percent = 100.0 * fetcher.cache_hit_count / fetcher.request_count + print( + f"Fetch cache hits: {fetcher.cache_hit_count} ({percent:.1f}%)", + file=report_stream) class FakeFetcher(Fetcher): diff --git a/fetch_test.py b/fetch_test.py index 6bdf69e..5f21fee 100644 --- a/fetch_test.py +++ b/fetch_test.py @@ -5,8 +5,11 @@ # Free Software Foundation, version 3. 
+from io import StringIO import unittest + from requests import HTTPError + from testing.fakeserver import FakeGlowficServer from fetch import CachingFetcher, DirectFetcher @@ -40,6 +43,13 @@ class TestFetch(unittest.TestCase): f.fetch(f"http://localhost:{self._port}") self.assertEqual(self._server.request_count(), 1) + def testCacheHitRateReport(self) -> None: + buf = StringIO() + with CachingFetcher("testcachehitratereportwithcl", TIMEOUT, buf) as f: + for _ in range(7): + f.fetch(f"http://localhost:{self._port}") + self.assertEqual("Fetch cache hits: 6 (85.7%)\n", buf.getvalue()) + def testFetchErrors(self) -> None: with DirectFetcher(TIMEOUT) as f: with self.assertRaises(HTTPError): -- 2.44.1 From 69f86f06f2bfed44d957f0b5eabbdeb47a0e9572 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 23:39:49 -0800 Subject: [PATCH 051/100] Fix flaky test: Increment request count before responding --- testing/fakeserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/fakeserver.py b/testing/fakeserver.py index bbeb522..fbef2e5 100644 --- a/testing/fakeserver.py +++ b/testing/fakeserver.py @@ -16,6 +16,7 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): return 200 def do_GET(self) -> None: + self._notify_server() with open("testdata/this-is-glowfic.html", "rb") as f: body = f.read(9999) self.send_response(self._response_code()) @@ -23,7 +24,6 @@ class FakeGlowficHTTPRequestHandler(BaseHTTPRequestHandler): self.send_header("Content-Length", str(len(body))) self.end_headers() self.wfile.write(body) - self._notify_server() class FakeGlowficServer(): -- 2.44.1 From 85bcacb0cabcb41468ade664041315162c2c9248 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 23:41:33 -0800 Subject: [PATCH 052/100] Lines between chunks --- paperdoorknob.py | 9 +++++++++ paperdoorknob_test.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/paperdoorknob.py b/paperdoorknob.py index f8b3361..24ccccd 100644 --- 
a/paperdoorknob.py +++ b/paperdoorknob.py @@ -27,7 +27,16 @@ def process(spec: Spec) -> None: b']{geometry}\n') spec.texout.write(b'\\begin{document}\n') html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) + first = True for r in chunkDOMs(html): + if first: + first = False + else: + # h/t https://tex.stackexchange.com/questions/309856 + spec.texout.write(b''' +\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill + +''') spec.domfilter(r) chunk = makeChunk(r, spec.images) spec.texout.write(spec.layout.renderChunk(chunk)) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index a0aeb21..612ee8d 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -51,7 +51,13 @@ class BaseTestProcess(ABC): \\usepackage[margin=20mm]{geometry} \\begin{document} This is glowfic + +\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill + You \\emph{sure}? + +\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill + Pretty sure. \\end{document} ''' -- 2.44.1 From 40b25122b6baf344aeb990b405f6015e6ffec8c8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 23:43:28 -0800 Subject: [PATCH 053/100] wrapfigure 0pt width means auto-scale --- glowfic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/glowfic.py b/glowfic.py index a4f3ca9..7a9f82a 100644 --- a/glowfic.py +++ b/glowfic.py @@ -109,8 +109,7 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - icon_width = b'0.25\\textwidth' # TODO: Make this configurable - return b'''\\begin{wrapfigure}{l}{%s} + return b'''\\begin{wrapfigure}{l}{0pt} %s %s @@ -123,7 +122,6 @@ class BelowIconLayout(Layout): %s ''' % ( - icon_width, renderIcon(chunk.icon, self._image_size), chunk.character.encode('UTF-8') if chunk.character else b'', chunk.screen_name.encode('UTF-8') if chunk.screen_name else b'', -- 2.44.1 From 0374fbde6508c909522c701cca9c7ca672a66719 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Wed, 20 Dec 2023 23:46:36 -0800 Subject: 
[PATCH 054/100] Force line breaks between icon, character, screen name, and author h/t https://tex.stackexchange.com/questions/35110/how-to-stack-boxes-like-a-vertical-version-of-mbox#comment75823_37801 --- glowfic.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/glowfic.py b/glowfic.py index 7a9f82a..c8246a1 100644 --- a/glowfic.py +++ b/glowfic.py @@ -109,15 +109,12 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - return b'''\\begin{wrapfigure}{l}{0pt} -%s - -%s - -%s - -%s -\\end{wrapfigure} + return b'''\\begin{wrapfigure}{l}{0pt}\\oalign{ +%s\\cr +%s\\cr +%s\\cr +%s\\cr +}\\end{wrapfigure} %s -- 2.44.1 From 357f37be787409c5655df646f8aa0ed4abfa5eaf Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 21 Dec 2023 00:24:04 -0800 Subject: [PATCH 055/100] =?utf8?q?wrapfig=20=E2=86=92=20wrapstuff,=20for?= =?utf8?q?=20\wrapstuffclear?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- glowfic.py | 4 ++-- paperdoorknob.py | 4 ++-- paperdoorknob_test.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/glowfic.py b/glowfic.py index c8246a1..c969ac0 100644 --- a/glowfic.py +++ b/glowfic.py @@ -109,12 +109,12 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - return b'''\\begin{wrapfigure}{l}{0pt}\\oalign{ + return b'''\\begin{wrapstuff}[l]\\oalign{ %s\\cr %s\\cr %s\\cr %s\\cr -}\\end{wrapfigure} +}\\end{wrapstuff} %s diff --git a/paperdoorknob.py b/paperdoorknob.py index 24ccccd..f2a81d2 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -19,7 +19,7 @@ def parse(content: bytes) -> BeautifulSoup: def process(spec: Spec) -> None: spec.texout.write(b'''\\documentclass{article} \\usepackage{graphicx} -\\usepackage{wrapfig} +\\usepackage{wrapstuff} ''') if spec.geometry is not None: spec.texout.write(b'\\usepackage[' + @@ -34,7 +34,7 @@ def 
process(spec: Spec) -> None: else: # h/t https://tex.stackexchange.com/questions/309856 spec.texout.write(b''' -\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill +\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill ''') spec.domfilter(r) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 612ee8d..a89e8bb 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -47,16 +47,16 @@ class BaseTestProcess(ABC): paperdoorknob.process(spec) assert buf.getvalue() == b'''\\documentclass{article} \\usepackage{graphicx} -\\usepackage{wrapfig} +\\usepackage{wrapstuff} \\usepackage[margin=20mm]{geometry} \\begin{document} This is glowfic -\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill +\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill You \\emph{sure}? -\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill +\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill Pretty sure. \\end{document} -- 2.44.1 From 313a871e86e7ed32db9d39d46aa9dce22dbc7263 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 21 Dec 2023 00:42:07 -0800 Subject: [PATCH 056/100] Less space between icon and post-separation line --- glowfic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/glowfic.py b/glowfic.py index c969ac0..eb34c1b 100644 --- a/glowfic.py +++ b/glowfic.py @@ -109,7 +109,8 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - return b'''\\begin{wrapstuff}[l]\\oalign{ + return b'''\\begin{wrapstuff}[l,abovesep=-1\\ht\\strutbox] +\\oalign{ %s\\cr %s\\cr %s\\cr -- 2.44.1 From 23dabdf5c5133e06fbc44763225cda1ab72e668e Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 21 Dec 2023 03:18:41 -0800 Subject: [PATCH 057/100] Border around icon + author data that extends between chunks --- glowfic.py | 47 ++++++++++++++++++++++++++++--------------- paperdoorknob.py | 10 +-------- paperdoorknob_test.py | 6 ++---- 3 files changed, 34 insertions(+), 29 deletions(-) 
diff --git a/glowfic.py b/glowfic.py index eb34c1b..76a9a84 100644 --- a/glowfic.py +++ b/glowfic.py @@ -81,9 +81,9 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -def renderIcon(icon_path: str | None, image_size: float) -> bytes: - return b'\\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( - image_size, image_size, icon_path.encode('UTF-8')) if icon_path else b'' +def renderIcon(icon_path: str | None, image_size: float) -> str | None: + params = f'width={image_size}mm,height={image_size}mm,keepaspectratio' + return f'\\includegraphics[{params}]{{{icon_path}}}' if icon_path else None class Layout(ABC): @@ -99,7 +99,7 @@ class ContentOnlyLayout(Layout): self._texifier = texifier def renderChunk(self, chunk: Chunk) -> bytes: - return self._texifier.texify(chunk.content) + return self._texifier.texify(chunk.content) + b'\n' class BelowIconLayout(Layout): @@ -109,19 +109,34 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - return b'''\\begin{wrapstuff}[l,abovesep=-1\\ht\\strutbox] -\\oalign{ -%s\\cr -%s\\cr -%s\\cr -%s\\cr -}\\end{wrapstuff} - + meta = [ + renderIcon(chunk.icon, self._image_size), + chunk.character, + chunk.screen_name, + chunk.author, + ] + + return b'''\\wrapstuffclear +\\begin{wrapstuff}[l] +\\fbox{ +\\begin{varwidth}{0.5\\textwidth} + \\smash{\\parbox[t][0pt]{0pt}{ + \\setlength{\\fboxrule}{0.2pt} + \\setlength{\\fboxsep}{0pt} + \\vspace{-3.4pt} + \\fbox{\\hspace{107mm}} + }\\\\*} + \\vspace{-1em} +\\begin{center} %s +\\end{center} +\\end{varwidth} +} +\\end{wrapstuff} + +\\strut +%s ''' % ( - renderIcon(chunk.icon, self._image_size), - chunk.character.encode('UTF-8') if chunk.character else b'', - chunk.screen_name.encode('UTF-8') if chunk.screen_name else b'', - chunk.author.encode('UTF-8') if chunk.author else b'', + b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), self._texifier.texify(chunk.content)) diff --git 
a/paperdoorknob.py b/paperdoorknob.py index f2a81d2..5e5747c 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -19,6 +19,7 @@ def parse(content: bytes) -> BeautifulSoup: def process(spec: Spec) -> None: spec.texout.write(b'''\\documentclass{article} \\usepackage{graphicx} +\\usepackage{varwidth} \\usepackage{wrapstuff} ''') if spec.geometry is not None: @@ -27,16 +28,7 @@ def process(spec: Spec) -> None: b']{geometry}\n') spec.texout.write(b'\\begin{document}\n') html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) - first = True for r in chunkDOMs(html): - if first: - first = False - else: - # h/t https://tex.stackexchange.com/questions/309856 - spec.texout.write(b''' -\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill - -''') spec.domfilter(r) chunk = makeChunk(r, spec.images) spec.texout.write(spec.layout.renderChunk(chunk)) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index a89e8bb..4711fcd 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -47,18 +47,16 @@ class BaseTestProcess(ABC): paperdoorknob.process(spec) assert buf.getvalue() == b'''\\documentclass{article} \\usepackage{graphicx} +\\usepackage{varwidth} \\usepackage{wrapstuff} \\usepackage[margin=20mm]{geometry} \\begin{document} This is glowfic -\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill - You \\emph{sure}? -\\wrapstuffclear\\vspace{-.5\\ht\\strutbox}\\noindent\\hrulefill - Pretty sure. 
+ \\end{document} ''' -- 2.44.1 From 7198d9ebe20982cb33f3c5acd3850f66cf8e6257 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 21 Dec 2023 03:21:42 -0800 Subject: [PATCH 058/100] No indent on first paragraph in each chunk --- glowfic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/glowfic.py b/glowfic.py index 76a9a84..eadc40a 100644 --- a/glowfic.py +++ b/glowfic.py @@ -136,7 +136,7 @@ class BelowIconLayout(Layout): \\strut -%s +\\noindent %s ''' % ( b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), self._texifier.texify(chunk.content)) -- 2.44.1 From f75c16294c01dc2bdbe867e4477c2073c10680a8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 23 Dec 2023 22:02:35 -0800 Subject: [PATCH 059/100] `beside` layout --- args.py | 16 ++++++++++++++-- glowfic.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/args.py b/args.py index e952c45..8f77a88 100644 --- a/args.py +++ b/args.py @@ -15,7 +15,7 @@ from xdg_base_dirs import xdg_cache_home from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher -from glowfic import BelowIconLayout +from glowfic import BesideIconLayout, BelowIconLayout, Layout from htmlfilter import ApplyHTMLFilters, HTMLFilters from images import DiskImageStore from spec import Spec @@ -48,6 +48,11 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details '--image_size', help='How large the icon images are, in mm', default=20) + parser.add_argument( + '--layout', + default='below', + help='Whether to put character and author information `beside` or `below` the icon ' + + '(default: below)') parser.add_argument( '--out', help='The filename stem at which to write output ' + @@ -68,6 +73,13 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details def spec_from_commandline_args() -> Iterator[Spec]: args = _command_line_parser().parse_args() texifier = PandocTexifier(args.pandoc or 'pandoc') + layout: Layout + if 
args.layout == 'below': + layout = BelowIconLayout(texifier, args.image_size) + elif args.layout == 'beside': + layout = BesideIconLayout(texifier, args.image_size) + else: + raise ValueError(f'Unknown layout: {args.layout}') with CachingFetcher(args.cache_path, args.timeout) as fetcher: with open(args.out + '.tex', 'wb') as texout: yield Spec( @@ -76,6 +88,6 @@ def spec_from_commandline_args() -> Iterator[Spec]: DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), - BelowIconLayout(texifier, args.image_size), + layout, args.geometry, texout) diff --git a/glowfic.py b/glowfic.py index eadc40a..d1966ca 100644 --- a/glowfic.py +++ b/glowfic.py @@ -140,3 +140,38 @@ class BelowIconLayout(Layout): ''' % ( b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), self._texifier.texify(chunk.content)) + + +class BesideIconLayout(Layout): + + def __init__(self, texifier: Texifier, image_size: float) -> None: + self._texifier = texifier + self._image_size = image_size + + def renderChunk(self, chunk: Chunk) -> bytes: + icon = renderIcon(chunk.icon, self._image_size) + meta = [ + chunk.character, + chunk.screen_name, + chunk.author, + ] + + # Why is \textwidth not the width of the text? + # Why is the width of the text .765\textwidth? 
+ return b'''\\noindent\\fbox{ +%s +\\parbox[b]{.765\\textwidth}{ +\\begin{center} +%s +\\end{center} +} +}\\\\* +\\vspace{-0.75em}\\\\* +\\noindent %s + +\\strut + +''' % ( + icon.encode('UTF-8') if icon else b'', + b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), + self._texifier.texify(chunk.content)) -- 2.44.1 From ae7b6283ed5ead577d1a8f263d37d01717e2bb33 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 15:13:11 -0800 Subject: [PATCH 060/100] FakeFetcher: Show bad URLs in error messages --- fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch.py b/fetch.py index 1d20701..b99938c 100644 --- a/fetch.py +++ b/fetch.py @@ -89,7 +89,7 @@ class FakeFetcher(Fetcher): def fetch(self, url: str) -> bytes: self._fetch_count += 1 if url not in self._resources: - raise requests.HTTPError("URL not found") + raise requests.HTTPError("URL not found", url) return self._resources[url] def request_count(self) -> int: -- 2.44.1 From 1452f8d33eb6ffe6812698768f1e6bb975d9c4cf Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 15:19:37 -0800 Subject: [PATCH 061/100] Use view=flat to get whole threads at once --- glowfic.py | 18 ++++++++++++++++++ paperdoorknob.py | 5 +++-- paperdoorknob_test.py | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/glowfic.py b/glowfic.py index d1966ca..5730508 100644 --- a/glowfic.py +++ b/glowfic.py @@ -8,6 +8,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass import itertools +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse from typing import Iterable @@ -18,6 +19,23 @@ from images import ImageStore from texify import Texifier +def _removeViewFromURL(url: str) -> str: + u = urlparse(url) + old_qs = parse_qsl(u.query) + new_qs = [(k, v) for k, v in old_qs if k != 'view'] + return urlunparse(u._replace(query=urlencode(new_qs))) + + +def nonFlatURL(url: str) -> str: + return _removeViewFromURL(url) + + +def 
flatURL(url: str) -> str: + u = urlparse(_removeViewFromURL(url)) + qs = parse_qsl(u.query) + [('view', 'flat')] + return urlunparse(u._replace(query=urlencode(qs))) + + @dataclass(frozen=True) class Chunk: icon: str | None diff --git a/paperdoorknob.py b/paperdoorknob.py index 5e5747c..4ee4dbc 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup from args import spec_from_commandline_args -from glowfic import chunkDOMs, makeChunk +from glowfic import chunkDOMs, flatURL, makeChunk from spec import Spec @@ -27,7 +27,8 @@ def process(spec: Spec) -> None: spec.geometry.encode('UTF-8') + b']{geometry}\n') spec.texout.write(b'\\begin{document}\n') - html = parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))) + url = flatURL(spec.url) + html = parse(spec.htmlfilter(spec.fetcher.fetch(url))) for r in chunkDOMs(html): spec.domfilter(r) chunk = makeChunk(r, spec.images) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 4711fcd..2787287 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -112,7 +112,7 @@ class TestProcessFromFakeFetcher(BaseTestProcess, unittest.TestCase): def fetcher(self) -> Fetcher: with open('testdata/this-is-glowfic.html', 'rb') as f: - return FakeFetcher({'fic': f.read(9999)}) + return FakeFetcher({'fic?view=flat': f.read(9999)}) if __name__ == '__main__': -- 2.44.1 From a8a47f7846509d22b123f4ab9cfdd7e65ac0d32e Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 22:46:59 -0800 Subject: [PATCH 062/100] image filenames: Drop ? 
and following query parameters --- images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images.py b/images.py index 68a9696..7932091 100644 --- a/images.py +++ b/images.py @@ -32,7 +32,7 @@ class DiskImageStore(ImageStore): def _filename(self, url: str) -> str: assert url not in self._images - base = os.path.basename(url).replace('%', '') + base = os.path.basename(url).replace('%', '').split('?')[0] if base not in self._filenames: return base stem, ext = os.path.splitext(base) -- 2.44.1 From 1e86dcaaf369d3d89f25437443d0905044216723 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 22:48:17 -0800 Subject: [PATCH 063/100] fetch: Send User-Agent header --- fetch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fetch.py b/fetch.py index b99938c..151e2a9 100644 --- a/fetch.py +++ b/fetch.py @@ -14,6 +14,10 @@ import requests import requests_cache +_headers = { + 'User-Agent': 'paperdoorknob/0.0.1 (https://git.scottworley.com/paperdoorknob/)'} + + class Fetcher(ABC): @abstractmethod def fetch(self, url: str) -> bytes: @@ -27,7 +31,7 @@ class _SessionFetcher(Fetcher): self._timeout = timeout def fetch(self, url: str) -> bytes: - with self._session.get(url, timeout=self._timeout) as r: + with self._session.get(url, timeout=self._timeout, headers=_headers) as r: r.raise_for_status() return r.content @@ -44,7 +48,7 @@ class _CachingFetcher(Fetcher): self._cache_hit_count = 0 def fetch(self, url: str) -> bytes: - with self._session.get(url, timeout=self._timeout) as r: + with self._session.get(url, timeout=self._timeout, headers=_headers) as r: r.raise_for_status() self._request_count += 1 self._cache_hit_count += int(r.from_cache) -- 2.44.1 From e3b5fdf02164b45e85da8898196612aa51060760 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 22:53:40 -0800 Subject: [PATCH 064/100] =?utf8?q?Reduce=20the=20number=20of=20places=20we?= =?utf8?q?=20keep=20our=20version=20number:=203=20=E2=86=92=202?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- fetch.py | 6 ++++-- setup.py | 5 ++++- version.py | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 version.py diff --git a/fetch.py b/fetch.py index 151e2a9..e998394 100644 --- a/fetch.py +++ b/fetch.py @@ -13,9 +13,11 @@ from typing import IO, Iterator import requests import requests_cache +from version import paperdoorknob_version -_headers = { - 'User-Agent': 'paperdoorknob/0.0.1 (https://git.scottworley.com/paperdoorknob/)'} + +_headers = {'User-Agent': f'paperdoorknob/{paperdoorknob_version} ' + + '(https://git.scottworley.com/paperdoorknob/)'} class Fetcher(ABC): diff --git a/setup.py b/setup.py index 2152739..5e748fd 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,10 @@ from setuptools import setup +from version import paperdoorknob_version + setup( name='paperdoorknob', - version='0.0.1', + version=paperdoorknob_version, description="Print glowfic", author="Scott Worley", author_email="scottworley@scottworley.com", @@ -17,6 +19,7 @@ setup( 'paperdoorknob', 'spec', 'texify', + 'version', ], license="GPL-3.0", entry_points={'console_scripts': ['paperdoorknob = paperdoorknob:main']}, diff --git a/version.py b/version.py new file mode 100644 index 0000000..37219b3 --- /dev/null +++ b/version.py @@ -0,0 +1 @@ +paperdoorknob_version = '0.0.1' -- 2.44.1 From 681c2bd06893c728febe8e2a8f61f204a6b23deb Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 28 Dec 2023 23:06:24 -0800 Subject: [PATCH 065/100] =?utf8?q?Reduce=20the=20number=20of=20places=20we?= =?utf8?q?=20keep=20our=20version=20number:=202=20=E2=86=92=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- default.nix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/default.nix b/default.nix index c27c9e6..bb2d87e 100644 --- a/default.nix +++ b/default.nix @@ -5,7 +5,8 @@ pkgs.python3Packages.callPackage ({ lib, 
buildPythonPackage, makeWrapper , pandoc-cli }: buildPythonPackage rec { pname = "paperdoorknob"; - version = "0.0.1"; + version = lib.removeSuffix "'" (lib.removePrefix "paperdoorknob_version = '" + (lib.fileContents ./version.py)); src = lib.cleanSource ./.; nativeBuildInputs = [ makeWrapper ]; propagatedBuildInputs = -- 2.44.1 From 4aa870922e636a50ec829471e0de5ca6a6103974 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 00:35:20 -0800 Subject: [PATCH 066/100] test: Don't duplicate list of TeX modules in test --- paperdoorknob_test.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 2787287..a608f8b 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -8,6 +8,7 @@ from abc import ABC, abstractmethod import unittest import io +import re import subprocess import paperdoorknob @@ -45,20 +46,17 @@ class BaseTestProcess(ABC): 'margin=20mm', buf) paperdoorknob.process(spec) - assert buf.getvalue() == b'''\\documentclass{article} -\\usepackage{graphicx} -\\usepackage{varwidth} -\\usepackage{wrapstuff} -\\usepackage[margin=20mm]{geometry} + assert re.match(br'''\\documentclass{article} +(\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} This is glowfic -You \\emph{sure}? +You \\emph{sure}\? Pretty sure. 
\\end{document} -''' +''', buf.getvalue()) def testDirectTexifier(self) -> None: texifier = VerifyingTexifier( -- 2.44.1 From a5f4539c4cc5954515e1256289784f1b174a5a16 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 00:38:34 -0800 Subject: [PATCH 067/100] Use LaTeX packages: longtable and booktabs pandoc-generated LaTeX assumes these packages are available --- paperdoorknob.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paperdoorknob.py b/paperdoorknob.py index 4ee4dbc..b5b7a44 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -18,7 +18,9 @@ def parse(content: bytes) -> BeautifulSoup: def process(spec: Spec) -> None: spec.texout.write(b'''\\documentclass{article} +\\usepackage{booktabs} \\usepackage{graphicx} +\\usepackage{longtable} \\usepackage{varwidth} \\usepackage{wrapstuff} ''') -- 2.44.1 From 131deef1eba442d6e05c2ff6c6f7669b4b9e2b24 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 10:24:14 -0800 Subject: [PATCH 068/100] texfilter to work around \emph nesting issue I don't know enough LaTeX to understand what the problem is, but this makes it go away. 
--- args.py | 6 ++++++ paperdoorknob.py | 2 +- paperdoorknob_test.py | 3 +++ setup.py | 1 + spec.py | 1 + texfilter.py | 32 ++++++++++++++++++++++++++++++++ texfilter_test.py | 24 ++++++++++++++++++++++++ 7 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 texfilter.py create mode 100644 texfilter_test.py diff --git a/args.py b/args.py index 8f77a88..384e448 100644 --- a/args.py +++ b/args.py @@ -17,6 +17,7 @@ from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher from glowfic import BesideIconLayout, BelowIconLayout, Layout from htmlfilter import ApplyHTMLFilters, HTMLFilters +from texfilter import ApplyTexFilters, TexFilters from images import DiskImageStore from spec import Spec from texify import PandocTexifier @@ -59,6 +60,10 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)', default='book') parser.add_argument('--pandoc', help='Location of the pandoc executable') + parser.add_argument( + '--texfilters', + help='Which TeX filters to use (default: %(default)s)', + default=','.join(f[0] for f in TexFilters)) parser.add_argument( '--timeout', help='How long to wait for HTTP requests, in seconds', @@ -88,6 +93,7 @@ def spec_from_commandline_args() -> Iterator[Spec]: DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), + lambda x: ApplyTexFilters(args.texfilters, x), layout, args.geometry, texout) diff --git a/paperdoorknob.py b/paperdoorknob.py index b5b7a44..b364f4d 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -34,7 +34,7 @@ def process(spec: Spec) -> None: for r in chunkDOMs(html): spec.domfilter(r) chunk = makeChunk(r, spec.images) - spec.texout.write(spec.layout.renderChunk(chunk)) + spec.texout.write(spec.texfilter(spec.layout.renderChunk(chunk))) spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py 
b/paperdoorknob_test.py index a608f8b..6481803 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -42,6 +42,7 @@ class BaseTestProcess(ABC): FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: x, ContentOnlyLayout(PandocTexifier('pandoc')), 'margin=20mm', buf) @@ -68,6 +69,7 @@ Pretty sure. FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: x, ContentOnlyLayout(texifier), None, buf) @@ -81,6 +83,7 @@ Pretty sure. FakeImageStore(), lambda x: x, lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: x, BelowIconLayout(PandocTexifier('pandoc'), 20), None, out) diff --git a/setup.py b/setup.py index 5e748fd..e21bc14 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ setup( 'images', 'paperdoorknob', 'spec', + 'texfilter', 'texify', 'version', ], diff --git a/spec.py b/spec.py index 34e2e6b..77a6a12 100644 --- a/spec.py +++ b/spec.py @@ -24,6 +24,7 @@ class Spec: images: ImageStore htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] + texfilter: Callable[[bytes], bytes] layout: Layout geometry: str | None texout: IO[bytes] diff --git a/texfilter.py b/texfilter.py new file mode 100644 index 0000000..8103c19 --- /dev/null +++ b/texfilter.py @@ -0,0 +1,32 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ +import re + +from typing import Callable, List, Tuple + + +TexFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [ + # Work around `Extra }, or forgotten \endgroup.` + ("FixBareEmph", lambda x: re.sub( + b'(^|\n)(\\\\emph{)', b'\\1\\\\hspace{0pt}\\2', x)), +] + + +class TexFilterError(Exception): + pass + + +def ApplyTexFilters(filter_list: str, data: bytes) -> bytes: + for filter_name in filter_list.split(','): + filters = [f for (name, f) in TexFilters if name == filter_name] + if len(filters) == 0: + raise TexFilterError(f"Unknown Tex filter: {filter_name}") + if len(filters) > 1: + raise TexFilterError( + f"Multiple Tex filters with the same name!?: {filter_name}") + data = filters[0](data) + return data diff --git a/texfilter_test.py b/texfilter_test.py new file mode 100644 index 0000000..8b603a3 --- /dev/null +++ b/texfilter_test.py @@ -0,0 +1,24 @@ +# paperdoorknob: Print glowfic +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, version 3. 
+ + +import unittest +from texfilter import ApplyTexFilters + + +class TestTexFilters(unittest.TestCase): + + def testStripNBSP(self) -> None: + self.assertEqual( + ApplyTexFilters("FixBareEmph", b'\\emph{Yes?}'), + b'\\hspace{0pt}\\emph{Yes?}') + self.assertEqual( + ApplyTexFilters("FixBareEmph", b'Reassuring.\n\n\\emph{Yes?}'), + b'Reassuring.\n\n\\hspace{0pt}\\emph{Yes?}') + + +if __name__ == '__main__': + unittest.main() -- 2.44.1 From 24e866347ec2355c35e0e39e317946b9b833d2ed Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 11:46:36 -0800 Subject: [PATCH 069/100] Support strikethrough --- domfilter.py | 7 +++++++ domfilter_test.py | 12 ++++++++++++ paperdoorknob.py | 1 + 3 files changed, 20 insertions(+) diff --git a/domfilter.py b/domfilter.py index 22f96a1..e1095f0 100644 --- a/domfilter.py +++ b/domfilter.py @@ -10,9 +10,16 @@ from typing import Any, Callable, List, Tuple from bs4.element import Tag +def _changeTag(tag: Tag, new_name: str) -> Tag: + tag.name = new_name + return tag + + DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [ ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]), ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]), + ("Strike", lambda x: [_changeTag(span, 's') + for span in x.find_all("span", style="text-decoration: line-through;")]), ] diff --git a/domfilter_test.py b/domfilter_test.py index c3c8ee2..8daca20 100644 --- a/domfilter_test.py +++ b/domfilter_test.py @@ -21,6 +21,11 @@ class TestDOMFilters(unittest.TestCase): This is glowfic
''', 'html.parser') + self._strike = BeautifulSoup(b''' +
+

Abandon hope and endure.

+

No. Win.

+
''', 'html.parser') def testStripFooters(self) -> None: ApplyDOMFilters("NoFooter", self._html) @@ -37,6 +42,13 @@ class TestDOMFilters(unittest.TestCase): self.assertEqual(list(self._html.stripped_strings), ["This is glowfic"]) + def testStrike(self) -> None: + ApplyDOMFilters('Strike', self._strike) + s = self._strike.find_all('s') + self.assertEqual(len(s), 1) + self.assertEqual(list(s[0].stripped_strings), + ['Abandon hope and endure.']) + if __name__ == '__main__': unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index b364f4d..0da6d97 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -21,6 +21,7 @@ def process(spec: Spec) -> None: \\usepackage{booktabs} \\usepackage{graphicx} \\usepackage{longtable} +\\usepackage{soul} \\usepackage{varwidth} \\usepackage{wrapstuff} ''') -- 2.44.1 From 87fb552dd37301bc0834afc09aa99cadd6310b98 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 11:49:08 -0800 Subject: [PATCH 070/100] Apply bare-\emph fix to \st too --- texfilter.py | 4 ++-- texfilter_test.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/texfilter.py b/texfilter.py index 8103c19..de3674b 100644 --- a/texfilter.py +++ b/texfilter.py @@ -11,8 +11,8 @@ from typing import Callable, List, Tuple TexFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [ # Work around `Extra }, or forgotten \endgroup.` - ("FixBareEmph", lambda x: re.sub( - b'(^|\n)(\\\\emph{)', b'\\1\\\\hspace{0pt}\\2', x)), + ("FixBareNesting", lambda x: re.sub( + b'(^|\n)(\\\\(emph|st){)', b'\\1\\\\hspace{0pt}\\2', x)), ] diff --git a/texfilter_test.py b/texfilter_test.py index 8b603a3..66144e8 100644 --- a/texfilter_test.py +++ b/texfilter_test.py @@ -13,11 +13,14 @@ class TestTexFilters(unittest.TestCase): def testStripNBSP(self) -> None: self.assertEqual( - ApplyTexFilters("FixBareEmph", b'\\emph{Yes?}'), + ApplyTexFilters("FixBareNesting", b'\\emph{Yes?}'), b'\\hspace{0pt}\\emph{Yes?}') self.assertEqual( - 
ApplyTexFilters("FixBareEmph", b'Reassuring.\n\n\\emph{Yes?}'), + ApplyTexFilters("FixBareNesting", b'Reassuring.\n\n\\emph{Yes?}'), b'Reassuring.\n\n\\hspace{0pt}\\emph{Yes?}') + self.assertEqual( + ApplyTexFilters("FixBareNesting", b'\\st{Nope}'), + b'\\hspace{0pt}\\st{Nope}') if __name__ == '__main__': -- 2.44.1 From 4640c55a74e31de7f7ddde379e9c564c184bc820 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 15:49:02 -0800 Subject: [PATCH 071/100] Remove no-longer-needed DOMFilters NoEdit and NoFooter These aren't needed anymore because Chunk individually extracts the five things it needs from the chunk-DOMs, rather than pulling in the whole chunk-
. --- domfilter.py | 2 -- domfilter_test.py | 21 --------------------- paperdoorknob_test.py | 7 +++---- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/domfilter.py b/domfilter.py index e1095f0..f00721d 100644 --- a/domfilter.py +++ b/domfilter.py @@ -16,8 +16,6 @@ def _changeTag(tag: Tag, new_name: str) -> Tag: DOMFilters: List[Tuple[str, Callable[[Tag], Any]]] = [ - ("NoEdit", lambda x: [eb.decompose() for eb in x.find_all("div", class_="post-edit-box")]), - ("NoFooter", lambda x: [foot.decompose() for foot in x.find_all("div", class_="post-footer")]), ("Strike", lambda x: [_changeTag(span, 's') for span in x.find_all("span", style="text-decoration: line-through;")]), ] diff --git a/domfilter_test.py b/domfilter_test.py index 8daca20..02c33ac 100644 --- a/domfilter_test.py +++ b/domfilter_test.py @@ -15,33 +15,12 @@ from domfilter import ApplyDOMFilters class TestDOMFilters(unittest.TestCase): def setUp(self) -> None: - self._html = BeautifulSoup(b''' -
-
This is the edit box
- This is glowfic - -
''', 'html.parser') self._strike = BeautifulSoup(b'''

Abandon hope and endure.

No. Win.

''', 'html.parser') - def testStripFooters(self) -> None: - ApplyDOMFilters("NoFooter", self._html) - self.assertEqual(list(self._html.stripped_strings), - ["This is the edit box", "This is glowfic"]) - - def testStripEditBoxes(self) -> None: - ApplyDOMFilters("NoEdit", self._html) - self.assertEqual(list(self._html.stripped_strings), - ["This is glowfic", "This is the footer"]) - - def testStripFootersAndEditBoxes(self) -> None: - ApplyDOMFilters("NoEdit,NoFooter", self._html) - self.assertEqual(list(self._html.stripped_strings), - ["This is glowfic"]) - def testStrike(self) -> None: ApplyDOMFilters('Strike', self._strike) s = self._strike.find_all('s') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 6481803..bafce17 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -14,7 +14,6 @@ import subprocess import paperdoorknob from testing.fakeserver import FakeGlowficServer -from domfilter import ApplyDOMFilters from fetch import DirectFetcher, FakeFetcher, Fetcher from glowfic import ContentOnlyLayout, BelowIconLayout from images import FakeImageStore @@ -41,7 +40,7 @@ class BaseTestProcess(ABC): self.fetcher(), FakeImageStore(), lambda x: x, - lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: None, lambda x: x, ContentOnlyLayout(PandocTexifier('pandoc')), 'margin=20mm', @@ -68,7 +67,7 @@ Pretty sure. self.fetcher(), FakeImageStore(), lambda x: x, - lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: None, lambda x: x, ContentOnlyLayout(texifier), None, @@ -82,7 +81,7 @@ Pretty sure. 
self.fetcher(), FakeImageStore(), lambda x: x, - lambda x: ApplyDOMFilters('NoEdit,NoFooter', x), + lambda x: None, lambda x: x, BelowIconLayout(PandocTexifier('pandoc'), 20), None, -- 2.44.1 From 81557fabadd506e31a6a9a787bcebe92b0883ad2 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 16:06:38 -0800 Subject: [PATCH 072/100] Progress indicator --- paperdoorknob.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 0da6d97..26a60fb 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -4,6 +4,9 @@ # under the terms of the GNU General Public License as published by the # Free Software Foundation, version 3. +from sys import stderr + +from typing import Any, Iterable from bs4 import BeautifulSoup @@ -16,6 +19,10 @@ def parse(content: bytes) -> BeautifulSoup: return BeautifulSoup(content, 'html.parser') +def ilen(it: Iterable[Any]) -> int: + return sum(1 for _ in it) + + def process(spec: Spec) -> None: spec.texout.write(b'''\\documentclass{article} \\usepackage{booktabs} @@ -32,7 +39,13 @@ def process(spec: Spec) -> None: spec.texout.write(b'\\begin{document}\n') url = flatURL(spec.url) html = parse(spec.htmlfilter(spec.fetcher.fetch(url))) - for r in chunkDOMs(html): + num_chunks = ilen(chunkDOMs(html)) + for i, r in enumerate(chunkDOMs(html)): + percent = 100.0 * i / num_chunks + print( + f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)', + end='\r', + file=stderr) spec.domfilter(r) chunk = makeChunk(r, spec.images) spec.texout.write(spec.texfilter(spec.layout.renderChunk(chunk))) -- 2.44.1 From c594c8bf593ab97e863827b7a22f2148bcb174b6 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 16:19:00 -0800 Subject: [PATCH 073/100] --quiet flag to suppress progress messages --- args.py | 12 +++++++++++- paperdoorknob.py | 8 ++------ paperdoorknob_test.py | 9 ++++++--- spec.py | 1 + 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/args.py 
b/args.py index 384e448..c550672 100644 --- a/args.py +++ b/args.py @@ -8,6 +8,7 @@ from argparse import ArgumentParser from contextlib import contextmanager import os.path +from sys import stderr from typing import Iterator @@ -23,6 +24,10 @@ from spec import Spec from texify import PandocTexifier +def _print_status(msg: str) -> None: + print(msg, file=stderr, end='' if msg.endswith('\r') else '\n') + + def _command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') parser.add_argument( @@ -60,6 +65,10 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details '(eg: "%(default)s" produces %(default)s.tex, %(default)s.pdf, etc.)', default='book') parser.add_argument('--pandoc', help='Location of the pandoc executable') + parser.add_argument( + '--quiet', + action='store_true', + help='Suppress progress messages') parser.add_argument( '--texfilters', help='Which TeX filters to use (default: %(default)s)', @@ -96,4 +105,5 @@ def spec_from_commandline_args() -> Iterator[Spec]: lambda x: ApplyTexFilters(args.texfilters, x), layout, args.geometry, - texout) + texout, + (lambda _: None) if args.quiet else _print_status) diff --git a/paperdoorknob.py b/paperdoorknob.py index 26a60fb..4e7cdf6 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -4,8 +4,6 @@ # under the terms of the GNU General Public License as published by the # Free Software Foundation, version 3. 
-from sys import stderr - from typing import Any, Iterable from bs4 import BeautifulSoup @@ -42,13 +40,11 @@ def process(spec: Spec) -> None: num_chunks = ilen(chunkDOMs(html)) for i, r in enumerate(chunkDOMs(html)): percent = 100.0 * i / num_chunks - print( - f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)', - end='\r', - file=stderr) + spec.log(f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r) chunk = makeChunk(r, spec.images) spec.texout.write(spec.texfilter(spec.layout.renderChunk(chunk))) + spec.log('') spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index bafce17..8527dd3 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -44,7 +44,8 @@ class BaseTestProcess(ABC): lambda x: x, ContentOnlyLayout(PandocTexifier('pandoc')), 'margin=20mm', - buf) + buf, + lambda _: None) paperdoorknob.process(spec) assert re.match(br'''\\documentclass{article} (\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} @@ -71,7 +72,8 @@ Pretty sure. lambda x: x, ContentOnlyLayout(texifier), None, - buf) + buf, + lambda _: None) paperdoorknob.process(spec) def testPDF(self) -> None: @@ -85,7 +87,8 @@ Pretty sure. 
lambda x: x, BelowIconLayout(PandocTexifier('pandoc'), 20), None, - out) + out, + lambda _: None) paperdoorknob.process(spec) subprocess.run(['pdflatex', 'test.tex'], stdin=subprocess.DEVNULL, check=True) diff --git a/spec.py b/spec.py index 77a6a12..2eccbb5 100644 --- a/spec.py +++ b/spec.py @@ -28,3 +28,4 @@ class Spec: layout: Layout geometry: str | None texout: IO[bytes] + log: Callable[[str], None] -- 2.44.1 From 47e940089d73e13a8f7f1d12344c2eb9bcaa3454 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 29 Dec 2023 16:33:39 -0800 Subject: [PATCH 074/100] --quiet controls fetch cache hit rate report too --- args.py | 5 +++-- fetch.py | 13 ++++++------- fetch_test.py | 5 ++++- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/args.py b/args.py index c550672..67d8c3f 100644 --- a/args.py +++ b/args.py @@ -94,7 +94,8 @@ def spec_from_commandline_args() -> Iterator[Spec]: layout = BesideIconLayout(texifier, args.image_size) else: raise ValueError(f'Unknown layout: {args.layout}') - with CachingFetcher(args.cache_path, args.timeout) as fetcher: + log = (lambda _: None) if args.quiet else _print_status + with CachingFetcher(args.cache_path, args.timeout, log) as fetcher: with open(args.out + '.tex', 'wb') as texout: yield Spec( args.url, @@ -106,4 +107,4 @@ def spec_from_commandline_args() -> Iterator[Spec]: layout, args.geometry, texout, - (lambda _: None) if args.quiet else _print_status) + log) diff --git a/fetch.py b/fetch.py index e998394..eb904a4 100644 --- a/fetch.py +++ b/fetch.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from sys import stderr -from typing import IO, Iterator +from typing import Callable, Iterator import requests import requests_cache @@ -73,17 +73,16 @@ def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]: @contextmanager def CachingFetcher( - cache_path: str, - timeout: int, - report_stream: IO[str] = stderr) -> Iterator[_CachingFetcher]: + cache_path: str, + 
timeout: int, + log: Callable[[str], None] = lambda x: print(x, file=stderr), +) -> Iterator[_CachingFetcher]: with requests_cache.CachedSession(cache_path, cache_control=True) as session: fetcher = _CachingFetcher(session, timeout) yield fetcher if fetcher.request_count > 0: percent = 100.0 * fetcher.cache_hit_count / fetcher.request_count - print( - f"Fetch cache hits: {fetcher.cache_hit_count} ({percent:.1f}%)", - file=report_stream) + log(f"Fetch cache hits: {fetcher.cache_hit_count} ({percent:.1f}%)") class FakeFetcher(Fetcher): diff --git a/fetch_test.py b/fetch_test.py index 5f21fee..59a1514 100644 --- a/fetch_test.py +++ b/fetch_test.py @@ -45,7 +45,10 @@ class TestFetch(unittest.TestCase): def testCacheHitRateReport(self) -> None: buf = StringIO() - with CachingFetcher("testcachehitratereportwithcl", TIMEOUT, buf) as f: + + def log(msg: str) -> None: + print(msg, file=buf) + with CachingFetcher("testcachehitratereportwithcl", TIMEOUT, log) as f: for _ in range(7): f.fetch(f"http://localhost:{self._port}") self.assertEqual("Fetch cache hits: 6 (85.7%)\n", buf.getvalue()) -- 2.44.1 From 372211762c57033dfef33075bd593ec5d426f3c7 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 03:19:11 -0800 Subject: [PATCH 075/100] =?utf8?q?Change=20default=20layout:=20below=20?= =?utf8?q?=E2=86=92=20beside?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit :( Below layout is nicer, but has two problems: 1. The page break logic seems to be broken. Some pages have content run off the bottom. Other pages get a single line. This seems to be a bug in the wrapstuff package. 2. There's a problem I don't understand where LaTeX complains about the \wrapstuffclear following a \footnote, saying: ! LaTeX kernel Error: Not in vertical mode. Starting a paragraph with \RawIndent or \RawNoindent (or \para_raw_indent: or \para_raw_noindent:) is only allowed if LaTeX is in vertical mode. 
Two problems --- args.py | 4 ++-- paperdoorknob_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/args.py b/args.py index 67d8c3f..b765fc6 100644 --- a/args.py +++ b/args.py @@ -56,9 +56,9 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details default=20) parser.add_argument( '--layout', - default='below', + default='beside', help='Whether to put character and author information `beside` or `below` the icon ' + - '(default: below)') + '(default: beside)') parser.add_argument( '--out', help='The filename stem at which to write output ' + diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 8527dd3..e01327c 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -15,7 +15,7 @@ import paperdoorknob from testing.fakeserver import FakeGlowficServer from fetch import DirectFetcher, FakeFetcher, Fetcher -from glowfic import ContentOnlyLayout, BelowIconLayout +from glowfic import ContentOnlyLayout, BesideIconLayout from images import FakeImageStore from spec import Spec from texify import DirectTexifier, PandocTexifier, VerifyingTexifier @@ -85,7 +85,7 @@ Pretty sure. 
lambda x: x, lambda x: None, lambda x: x, - BelowIconLayout(PandocTexifier('pandoc'), 20), + BesideIconLayout(PandocTexifier('pandoc'), 20), None, out, lambda _: None) -- 2.44.1 From b254976424d0f56fab14f8897dade345ce46a2d1 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 03:27:04 -0800 Subject: [PATCH 076/100] Render links as footnotes --- paperdoorknob.py | 4 +++- paperdoorknob_test.py | 3 ++- testdata/this-is-glowfic.html | 2 +- texify.py | 3 +++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 4e7cdf6..6a55675 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -34,7 +34,9 @@ def process(spec: Spec) -> None: spec.texout.write(b'\\usepackage[' + spec.geometry.encode('UTF-8') + b']{geometry}\n') - spec.texout.write(b'\\begin{document}\n') + spec.texout.write(b'''\\begin{document} +\\newcommand{\\href}[2]{#2\\footnote{#1}} +''') url = flatURL(spec.url) html = parse(spec.htmlfilter(spec.fetcher.fetch(url))) num_chunks = ilen(chunkDOMs(html)) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index e01327c..c423282 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -50,7 +50,8 @@ class BaseTestProcess(ABC): assert re.match(br'''\\documentclass{article} (\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} -This is glowfic +\\newcommand{\\href}\[2\]{#2\\footnote{#1}} +This is \\href{https://glowfic.com}{glowfic} You \\emph{sure}\? diff --git a/testdata/this-is-glowfic.html b/testdata/this-is-glowfic.html index 207cdd1..158f5ad 100644 --- a/testdata/this-is-glowfic.html +++ b/testdata/this-is-glowfic.html @@ -2,7 +2,7 @@
We don't want edit boxes
-
This is glowfic
+
This is glowfic
diff --git a/texify.py b/texify.py index 83a8c67..65b4230 100644 --- a/texify.py +++ b/texify.py @@ -49,6 +49,9 @@ class DirectTexifier(Texifier): def texify(self, html: Tag) -> bytes: if html.name == 'em': return b'\\emph{' + self._texify_children(html).strip() + b'}\n' + if html.name == 'a' and 'href' in html.attrs: + return b'\\href{' + html.attrs['href'].encode( + 'UTF-8') + b'}{' + self._texify_children(html).strip() + b'}\n' return self._texify_children(html) -- 2.44.1 From 606c97c59b4f50d8025b3b2946f6f8a4942c5fb9 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 03:37:44 -0800 Subject: [PATCH 077/100] =?utf8?q?Rename:=20html=20=E2=86=92=20dom?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- paperdoorknob.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 6a55675..05b068f 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -38,9 +38,9 @@ def process(spec: Spec) -> None: \\newcommand{\\href}[2]{#2\\footnote{#1}} ''') url = flatURL(spec.url) - html = parse(spec.htmlfilter(spec.fetcher.fetch(url))) - num_chunks = ilen(chunkDOMs(html)) - for i, r in enumerate(chunkDOMs(html)): + dom = parse(spec.htmlfilter(spec.fetcher.fetch(url))) + num_chunks = ilen(chunkDOMs(dom)) + for i, r in enumerate(chunkDOMs(dom)): percent = 100.0 * i / num_chunks spec.log(f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r) -- 2.44.1 From d70acaef01c7e1501d06fda7b0e3f05de021427d Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 03:39:18 -0800 Subject: [PATCH 078/100] Progress messages for fetch and parse --- paperdoorknob.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 05b068f..0fd1fde 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -38,7 +38,11 @@ def process(spec: Spec) -> None: \\newcommand{\\href}[2]{#2\\footnote{#1}} ''') url = 
flatURL(spec.url) - dom = parse(spec.htmlfilter(spec.fetcher.fetch(url))) + spec.log('Fetching HTML...\r') + html = spec.fetcher.fetch(url) + spec.log('Parsing HTML...\r') + dom = parse(spec.htmlfilter(html)) + spec.log('Counting chunks...\r') num_chunks = ilen(chunkDOMs(dom)) for i, r in enumerate(chunkDOMs(dom)): percent = 100.0 * i / num_chunks -- 2.44.1 From 69169c2c71234aa0bc60e15b331369263db0e4b8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 03:42:57 -0800 Subject: [PATCH 079/100] args: Prefer dashes to underscores --- args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/args.py b/args.py index b765fc6..fc7a0f7 100644 --- a/args.py +++ b/args.py @@ -31,7 +31,7 @@ def _print_status(msg: str) -> None: def _command_line_parser() -> ArgumentParser: parser = ArgumentParser(prog='paperdoorknob', description='Print glowfic') parser.add_argument( - '--cache_path', + '--cache-path', metavar='PATH', help='Where to keep the http cache (instead of %(default)s)', default=os.path.join(xdg_cache_home(), "paperdoorknob")) @@ -51,7 +51,7 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details help='Which HTML filters to use (default: %(default)s)', default=','.join(f[0] for f in HTMLFilters)) parser.add_argument( - '--image_size', + '--image-size', help='How large the icon images are, in mm', default=20) parser.add_argument( -- 2.44.1 From 37c47bc273c9c46d35be45ee7866ca5ca01f3159 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 04:19:00 -0800 Subject: [PATCH 080/100] Escape character names This is slow. 
:( --- glowfic.py | 36 +++++++++++++++++------------------- glowfic_test.py | 10 +++++++--- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/glowfic.py b/glowfic.py index 5730508..a700995 100644 --- a/glowfic.py +++ b/glowfic.py @@ -39,9 +39,9 @@ def flatURL(url: str) -> str: @dataclass(frozen=True) class Chunk: icon: str | None - character: str | None - screen_name: str | None - author: str | None + character: Tag | None + screen_name: Tag | None + author: Tag | None content: Tag # We avoid the name "post" because the Glowfic community uses the term @@ -83,19 +83,18 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: assert isinstance(icon_img, Tag) return image_store.get_image(icon_img.attrs['src']) - def getTextByClass(css_class: str) -> str | None: - div = chunk_dom.find_next('div', class_=css_class) - if div is None: - return None - return div.text.strip() + def getByClass(css_class: str) -> Tag | None: + tag = chunk_dom.find_next('div', class_=css_class) + assert tag is None or isinstance(tag, Tag) + return tag content = chunk_dom.find_next('div', class_='post-content') assert isinstance(content, Tag) return Chunk(getIcon(), - getTextByClass('post-character'), - getTextByClass('post-screenname'), - getTextByClass('post-author'), + getByClass('post-character'), + getByClass('post-screenname'), + getByClass('post-author'), content) @@ -127,12 +126,11 @@ class BelowIconLayout(Layout): self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - meta = [ - renderIcon(chunk.icon, self._image_size), - chunk.character, - chunk.screen_name, - chunk.author, - ] + icon = renderIcon(chunk.icon, self._image_size) + meta = [icon.encode('UTF-8')] if icon else [] + meta += [self._texifier.texify(x) + for x in [chunk.character, chunk.screen_name, chunk.author] + if x is not None] return b'''\\wrapstuffclear \\begin{wrapstuff}[l] @@ -156,7 +154,7 @@ class BelowIconLayout(Layout): \\noindent %s ''' % ( - 
b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), + b'\\\\*'.join(meta), self._texifier.texify(chunk.content)) @@ -191,5 +189,5 @@ class BesideIconLayout(Layout): ''' % ( icon.encode('UTF-8') if icon else b'', - b'\\\\*'.join(x.encode('UTF-8') for x in meta if x is not None), + b'\\\\*'.join(self._texifier.texify(x) for x in meta if x is not None), self._texifier.texify(chunk.content)) diff --git a/glowfic_test.py b/glowfic_test.py index d3a6d56..9b802c1 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -57,9 +57,13 @@ class TestMakeChunk(unittest.TestCase): c.icon, 'stored:https://d1anwqy6ci9o1i.cloudfront.net/' + 'users%2F366%2Ficons%2Fxqmypqvflgdy28aorw9ml_shock.png') - self.assertEqual(c.character, 'Keltham') - self.assertEqual(c.screen_name, 'lawful chaotic') - self.assertEqual(c.author, 'Iarwain') + assert c.character + assert c.screen_name + assert c.author + self.assertEqual(list(c.character.stripped_strings), ['Keltham']) + self.assertEqual( + list(c.screen_name.stripped_strings), ['lawful chaotic']) + self.assertEqual(list(c.author.stripped_strings), ['Iarwain']) self.assertEqual(str(c.content), '

') -- 2.44.1 From 551bb1c9f85382a90abc89ac622bd363dccd36a7 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 04:33:36 -0800 Subject: [PATCH 081/100] Only look within each chunk-dom for chunk fields --- glowfic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/glowfic.py b/glowfic.py index a700995..e192646 100644 --- a/glowfic.py +++ b/glowfic.py @@ -74,21 +74,22 @@ def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: def getIcon() -> str | None: - icon_div = chunk_dom.find_next('div', class_='post-icon') + icon_div = chunk_dom.findChild('div', class_='post-icon') if icon_div is None: return None - icon_img = icon_div.find_next('img') + assert isinstance(icon_div, Tag) + icon_img = icon_div.findChild('img') if icon_img is None: return None assert isinstance(icon_img, Tag) return image_store.get_image(icon_img.attrs['src']) def getByClass(css_class: str) -> Tag | None: - tag = chunk_dom.find_next('div', class_=css_class) + tag = chunk_dom.findChild('div', class_=css_class) assert tag is None or isinstance(tag, Tag) return tag - content = chunk_dom.find_next('div', class_='post-content') + content = chunk_dom.findChild('div', class_='post-content') assert isinstance(content, Tag) return Chunk(getIcon(), -- 2.44.1 From 39db9a10878f8bac785178a85b2f548f4b24684b Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sat, 30 Dec 2023 23:45:12 -0800 Subject: [PATCH 082/100] renderIcon makes bytes --- glowfic.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/glowfic.py b/glowfic.py index e192646..065c715 100644 --- a/glowfic.py +++ b/glowfic.py @@ -99,9 +99,11 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -def renderIcon(icon_path: str | None, image_size: float) -> str | None: - params = f'width={image_size}mm,height={image_size}mm,keepaspectratio' - return f'\\includegraphics[{params}]{{{icon_path}}}' if 
icon_path else None +def renderIcon(icon_path: str | None, image_size: float) -> bytes | None: + if icon_path is None: + return None + return b'\\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( + image_size, image_size, icon_path.encode('UTF-8')) class Layout(ABC): @@ -128,7 +130,7 @@ class BelowIconLayout(Layout): def renderChunk(self, chunk: Chunk) -> bytes: icon = renderIcon(chunk.icon, self._image_size) - meta = [icon.encode('UTF-8')] if icon else [] + meta = [icon] if icon else [] meta += [self._texifier.texify(x) for x in [chunk.character, chunk.screen_name, chunk.author] if x is not None] @@ -189,6 +191,6 @@ class BesideIconLayout(Layout): \\strut ''' % ( - icon.encode('UTF-8') if icon else b'', + icon if icon else b'', b'\\\\*'.join(self._texifier.texify(x) for x in meta if x is not None), self._texifier.texify(chunk.content)) -- 2.44.1 From 228bb7bb52ef9df820fb41312144c87ee987434d Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 01:16:00 -0800 Subject: [PATCH 083/100] Support _ in URLs --- paperdoorknob.py | 2 +- paperdoorknob_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 0fd1fde..72ec1a5 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -35,7 +35,7 @@ def process(spec: Spec) -> None: spec.geometry.encode('UTF-8') + b']{geometry}\n') spec.texout.write(b'''\\begin{document} -\\newcommand{\\href}[2]{#2\\footnote{#1}} +\\newcommand{\\href}[2]{#2\\footnote{\\detokenize{#1}}} ''') url = flatURL(spec.url) spec.log('Fetching HTML...\r') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index c423282..511abb3 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -50,7 +50,7 @@ class BaseTestProcess(ABC): assert re.match(br'''\\documentclass{article} (\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} -\\newcommand{\\href}\[2\]{#2\\footnote{#1}} +\\newcommand{\\href}\[2\]{#2\\footnote{\\detokenize{#1}}} 
This is \\href{https://glowfic.com}{glowfic} You \\emph{sure}\? -- 2.44.1 From 62043b2bafbcb57beae7559235bd6984b6275b55 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 11:05:59 -0800 Subject: [PATCH 084/100] Strip links from meta fields This gives us the flexibility to process non-flat URLs, which is useful for shorter feedback cycles during development. --- glowfic.py | 18 +++++++++++++++--- glowfic_test.py | 4 ++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/glowfic.py b/glowfic.py index 065c715..8561cd7 100644 --- a/glowfic.py +++ b/glowfic.py @@ -89,13 +89,25 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: assert tag is None or isinstance(tag, Tag) return tag + def stripHREF(tag: Tag) -> None: + for c in tag.findChildren("a"): + if "href" in c.attrs: + del c.attrs["href"] + + def getMeta(css_class: str) -> Tag | None: + tag = getByClass(css_class) + if tag is None: + return None + stripHREF(tag) + return tag + content = chunk_dom.findChild('div', class_='post-content') assert isinstance(content, Tag) return Chunk(getIcon(), - getByClass('post-character'), - getByClass('post-screenname'), - getByClass('post-author'), + getMeta('post-character'), + getMeta('post-screenname'), + getMeta('post-author'), content) diff --git a/glowfic_test.py b/glowfic_test.py index 9b802c1..2d01c2a 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup from images import FakeImageStore from glowfic import chunkDOMs, makeChunk +from texify import PandocTexifier class TestSplit(unittest.TestCase): @@ -67,6 +68,9 @@ class TestMakeChunk(unittest.TestCase): self.assertEqual(str(c.content), '

') + self.assertEqual( + PandocTexifier("pandoc").texify(c.character), b'{Keltham}\n') + if __name__ == '__main__': unittest.main() -- 2.44.1 From 67612898f2efcbb985da8534e0cce124152601db Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 13:48:21 -0800 Subject: [PATCH 085/100] Use raw strings for less escaping --- glowfic.py | 56 ++++++++++++++++++++++++------------------------ paperdoorknob.py | 20 ++++++++--------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/glowfic.py b/glowfic.py index 8561cd7..bda6d19 100644 --- a/glowfic.py +++ b/glowfic.py @@ -114,7 +114,7 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: def renderIcon(icon_path: str | None, image_size: float) -> bytes | None: if icon_path is None: return None - return b'\\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( + return br'\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( image_size, image_size, icon_path.encode('UTF-8')) @@ -147,29 +147,29 @@ class BelowIconLayout(Layout): for x in [chunk.character, chunk.screen_name, chunk.author] if x is not None] - return b'''\\wrapstuffclear -\\begin{wrapstuff}[l] -\\fbox{ -\\begin{varwidth}{0.5\\textwidth} - \\smash{\\parbox[t][0pt]{0pt}{ - \\setlength{\\fboxrule}{0.2pt} - \\setlength{\\fboxsep}{0pt} - \\vspace{-3.4pt} - \\fbox{\\hspace{107mm}} - }\\\\*} - \\vspace{-1em} -\\begin{center} + return br'''\wrapstuffclear +\begin{wrapstuff}[l] +\fbox{ +\begin{varwidth}{0.5\textwidth} + \smash{\parbox[t][0pt]{0pt}{ + \setlength{\fboxrule}{0.2pt} + \setlength{\fboxsep}{0pt} + \vspace{-3.4pt} + \fbox{\hspace{107mm}} + }\\*} + \vspace{-1em} +\begin{center} %s -\\end{center} -\\end{varwidth} +\end{center} +\end{varwidth} } -\\end{wrapstuff} +\end{wrapstuff} -\\strut +\strut -\\noindent %s +\noindent %s ''' % ( - b'\\\\*'.join(meta), + br'\\*'.join(meta), self._texifier.texify(chunk.content)) @@ -189,20 +189,20 @@ class BesideIconLayout(Layout): # Why is \textwidth not the width of 
the text? # Why is the width of the text .765\textwidth? - return b'''\\noindent\\fbox{ + return br'''\noindent\fbox{ %s -\\parbox[b]{.765\\textwidth}{ -\\begin{center} +\parbox[b]{.765\textwidth}{ +\begin{center} %s -\\end{center} +\end{center} } -}\\\\* -\\vspace{-0.75em}\\\\* -\\noindent %s +}\\* +\vspace{-0.75em}\\* +\noindent %s -\\strut +\strut ''' % ( icon if icon else b'', - b'\\\\*'.join(self._texifier.texify(x) for x in meta if x is not None), + br'\\*'.join(self._texifier.texify(x) for x in meta if x is not None), self._texifier.texify(chunk.content)) diff --git a/paperdoorknob.py b/paperdoorknob.py index 72ec1a5..a19045c 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -22,20 +22,20 @@ def ilen(it: Iterable[Any]) -> int: def process(spec: Spec) -> None: - spec.texout.write(b'''\\documentclass{article} -\\usepackage{booktabs} -\\usepackage{graphicx} -\\usepackage{longtable} -\\usepackage{soul} -\\usepackage{varwidth} -\\usepackage{wrapstuff} + spec.texout.write(br'''\documentclass{article} +\usepackage{booktabs} +\usepackage{graphicx} +\usepackage{longtable} +\usepackage{soul} +\usepackage{varwidth} +\usepackage{wrapstuff} ''') if spec.geometry is not None: - spec.texout.write(b'\\usepackage[' + + spec.texout.write(br'\usepackage[' + spec.geometry.encode('UTF-8') + b']{geometry}\n') - spec.texout.write(b'''\\begin{document} -\\newcommand{\\href}[2]{#2\\footnote{\\detokenize{#1}}} + spec.texout.write(br'''\begin{document} +\newcommand{\href}[2]{#2\footnote{\detokenize{#1}}} ''') url = flatURL(spec.url) spec.log('Fetching HTML...\r') -- 2.44.1 From 7804859f8fe8b8dc94b6a33b2221ac0b68d7ed86 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 13:49:42 -0800 Subject: [PATCH 086/100] Don't uselessly repeat preamble in test --- paperdoorknob_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 511abb3..61c8e6a 100644 --- a/paperdoorknob_test.py +++ 
b/paperdoorknob_test.py @@ -50,7 +50,7 @@ class BaseTestProcess(ABC): assert re.match(br'''\\documentclass{article} (\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} -\\newcommand{\\href}\[2\]{#2\\footnote{\\detokenize{#1}}} +.* This is \\href{https://glowfic.com}{glowfic} You \\emph{sure}\? -- 2.44.1 From 9afdb32a9385204c13018198a136637ad599df47 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 14:27:10 -0800 Subject: [PATCH 087/100] Learning TeX: Keep icon image size in TeX --- args.py | 5 +++-- glowfic.py | 17 ++++++++--------- paperdoorknob.py | 3 ++- paperdoorknob_test.py | 7 +++++-- spec.py | 1 + 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/args.py b/args.py index fc7a0f7..45c080a 100644 --- a/args.py +++ b/args.py @@ -89,9 +89,9 @@ def spec_from_commandline_args() -> Iterator[Spec]: texifier = PandocTexifier(args.pandoc or 'pandoc') layout: Layout if args.layout == 'below': - layout = BelowIconLayout(texifier, args.image_size) + layout = BelowIconLayout(texifier) elif args.layout == 'beside': - layout = BesideIconLayout(texifier, args.image_size) + layout = BesideIconLayout(texifier) else: raise ValueError(f'Unknown layout: {args.layout}') log = (lambda _: None) if args.quiet else _print_status @@ -104,6 +104,7 @@ def spec_from_commandline_args() -> Iterator[Spec]: lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), lambda x: ApplyTexFilters(args.texfilters, x), + args.image_size, layout, args.geometry, texout, diff --git a/glowfic.py b/glowfic.py index bda6d19..4c108b8 100644 --- a/glowfic.py +++ b/glowfic.py @@ -111,11 +111,12 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -def renderIcon(icon_path: str | None, image_size: float) -> bytes | None: +def renderIcon(icon_path: str | None) -> bytes | None: if icon_path is None: return None - return br'\includegraphics[width=%fmm,height=%fmm,keepaspectratio]{%s}' % ( - 
image_size, image_size, icon_path.encode('UTF-8')) + return ( + br'\includegraphics[width=\glowiconsize,height=\glowiconsize,keepaspectratio]{%s}' % + icon_path.encode('UTF-8')) class Layout(ABC): @@ -136,12 +137,11 @@ class ContentOnlyLayout(Layout): class BelowIconLayout(Layout): - def __init__(self, texifier: Texifier, image_size: float) -> None: + def __init__(self, texifier: Texifier) -> None: self._texifier = texifier - self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - icon = renderIcon(chunk.icon, self._image_size) + icon = renderIcon(chunk.icon) meta = [icon] if icon else [] meta += [self._texifier.texify(x) for x in [chunk.character, chunk.screen_name, chunk.author] @@ -175,12 +175,11 @@ class BelowIconLayout(Layout): class BesideIconLayout(Layout): - def __init__(self, texifier: Texifier, image_size: float) -> None: + def __init__(self, texifier: Texifier) -> None: self._texifier = texifier - self._image_size = image_size def renderChunk(self, chunk: Chunk) -> bytes: - icon = renderIcon(chunk.icon, self._image_size) + icon = renderIcon(chunk.icon) meta = [ chunk.character, chunk.screen_name, diff --git a/paperdoorknob.py b/paperdoorknob.py index a19045c..8183136 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -36,7 +36,8 @@ def process(spec: Spec) -> None: b']{geometry}\n') spec.texout.write(br'''\begin{document} \newcommand{\href}[2]{#2\footnote{\detokenize{#1}}} -''') +\def\glowiconsize{%fmm} +''' % spec.icon_size) url = flatURL(spec.url) spec.log('Fetching HTML...\r') html = spec.fetcher.fetch(url) diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 61c8e6a..286d6a9 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -42,6 +42,7 @@ class BaseTestProcess(ABC): lambda x: x, lambda x: None, lambda x: x, + 20, ContentOnlyLayout(PandocTexifier('pandoc')), 'margin=20mm', buf, @@ -50,7 +51,7 @@ class BaseTestProcess(ABC): assert re.match(br'''\\documentclass{article} 
(\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} -.* +(.|\n)* This is \\href{https://glowfic.com}{glowfic} You \\emph{sure}\? @@ -71,6 +72,7 @@ Pretty sure. lambda x: x, lambda x: None, lambda x: x, + 20, ContentOnlyLayout(texifier), None, buf, @@ -86,7 +88,8 @@ Pretty sure. lambda x: x, lambda x: None, lambda x: x, - BesideIconLayout(PandocTexifier('pandoc'), 20), + 20, + BesideIconLayout(PandocTexifier('pandoc')), None, out, lambda _: None) diff --git a/spec.py b/spec.py index 2eccbb5..c4975b5 100644 --- a/spec.py +++ b/spec.py @@ -25,6 +25,7 @@ class Spec: htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] texfilter: Callable[[bytes], bytes] + icon_size: float layout: Layout geometry: str | None texout: IO[bytes] -- 2.44.1 From 5f230208bb5e858cfeadb3d9c822579d2b7b8300 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 14:37:57 -0800 Subject: [PATCH 088/100] Learning TeX: Render icon images with TeX command --- glowfic.py | 16 ++++------------ paperdoorknob.py | 3 +++ 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/glowfic.py b/glowfic.py index 4c108b8..b7e7ba7 100644 --- a/glowfic.py +++ b/glowfic.py @@ -111,14 +111,6 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -def renderIcon(icon_path: str | None) -> bytes | None: - if icon_path is None: - return None - return ( - br'\includegraphics[width=\glowiconsize,height=\glowiconsize,keepaspectratio]{%s}' % - icon_path.encode('UTF-8')) - - class Layout(ABC): @abstractmethod @@ -141,8 +133,9 @@ class BelowIconLayout(Layout): self._texifier = texifier def renderChunk(self, chunk: Chunk) -> bytes: - icon = renderIcon(chunk.icon) - meta = [icon] if icon else [] + meta = [] + if chunk.icon: + meta += [br'\glowicon{%s}' % chunk.icon.encode('UTF-8')] meta += [self._texifier.texify(x) for x in [chunk.character, chunk.screen_name, chunk.author] if x is not None] @@ -179,7 +172,6 @@ class BesideIconLayout(Layout): 
self._texifier = texifier def renderChunk(self, chunk: Chunk) -> bytes: - icon = renderIcon(chunk.icon) meta = [ chunk.character, chunk.screen_name, @@ -202,6 +194,6 @@ class BesideIconLayout(Layout): \strut ''' % ( - icon if icon else b'', + br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'', br'\\*'.join(self._texifier.texify(x) for x in meta if x is not None), self._texifier.texify(chunk.content)) diff --git a/paperdoorknob.py b/paperdoorknob.py index 8183136..0fb3fe1 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -37,6 +37,9 @@ def process(spec: Spec) -> None: spec.texout.write(br'''\begin{document} \newcommand{\href}[2]{#2\footnote{\detokenize{#1}}} \def\glowiconsize{%fmm} +\newcommand{\glowicon}[1]{\includegraphics[ + width=\glowiconsize,height=\glowiconsize,keepaspectratio +]{#1}} ''' % spec.icon_size) url = flatURL(spec.url) spec.log('Fetching HTML...\r') -- 2.44.1 From bbfe8e520fea31ad19b8bc2ab5bc02cc5cddb205 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 18:54:30 -0800 Subject: [PATCH 089/100] Put Texifier in spec --- args.py | 1 + paperdoorknob_test.py | 9 +++++++-- spec.py | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/args.py b/args.py index 45c080a..ede8471 100644 --- a/args.py +++ b/args.py @@ -103,6 +103,7 @@ def spec_from_commandline_args() -> Iterator[Spec]: DiskImageStore(args.out + '_images', fetcher), lambda x: ApplyHTMLFilters(args.htmlfilters, x), lambda x: ApplyDOMFilters(args.domfilters, x), + texifier, lambda x: ApplyTexFilters(args.texfilters, x), args.image_size, layout, diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 286d6a9..28408d6 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -35,15 +35,17 @@ class BaseTestProcess(ABC): def testProcess(self) -> None: buf = io.BytesIO() + texifier = PandocTexifier('pandoc') spec = Spec( self.url(), self.fetcher(), FakeImageStore(), lambda x: x, lambda x: None, + texifier, lambda x: x, 20, - 
ContentOnlyLayout(PandocTexifier('pandoc')), + ContentOnlyLayout(texifier), 'margin=20mm', buf, lambda _: None) @@ -71,6 +73,7 @@ Pretty sure. FakeImageStore(), lambda x: x, lambda x: None, + texifier, lambda x: x, 20, ContentOnlyLayout(texifier), @@ -80,6 +83,7 @@ Pretty sure. paperdoorknob.process(spec) def testPDF(self) -> None: + texifier = PandocTexifier('pandoc') with open("test.tex", 'wb') as out: spec = Spec( self.url(), @@ -87,9 +91,10 @@ Pretty sure. FakeImageStore(), lambda x: x, lambda x: None, + texifier, lambda x: x, 20, - BesideIconLayout(PandocTexifier('pandoc')), + BesideIconLayout(texifier), None, out, lambda _: None) diff --git a/spec.py b/spec.py index c4975b5..3aa7292 100644 --- a/spec.py +++ b/spec.py @@ -14,6 +14,7 @@ from bs4.element import Tag from fetch import Fetcher from glowfic import Layout from images import ImageStore +from texify import Texifier # pylint: disable=too-many-instance-attributes @@ -24,6 +25,7 @@ class Spec: images: ImageStore htmlfilter: Callable[[bytes], bytes] domfilter: Callable[[Tag], None] + texifier: Texifier texfilter: Callable[[bytes], bytes] icon_size: float layout: Layout -- 2.44.1 From 1fac41bf2d403d35d43dd532a37a8225f682bae2 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Sun, 31 Dec 2023 22:11:25 -0800 Subject: [PATCH 090/100] Learning TeX: Do Layouts with TeX macros This is more elegant and reduces the size of the .tex output by 33% h/t https://tex.stackexchange.com/a/537222 by Steven B. 
Segletes for the \ifnotempty technique --- args.py | 8 ++-- glowfic.py | 88 ++++++++++++++++--------------------------- paperdoorknob.py | 9 +++-- paperdoorknob_test.py | 15 +++----- spec.py | 3 +- 5 files changed, 50 insertions(+), 73 deletions(-) diff --git a/args.py b/args.py index ede8471..de6fc01 100644 --- a/args.py +++ b/args.py @@ -16,7 +16,7 @@ from xdg_base_dirs import xdg_cache_home from domfilter import ApplyDOMFilters, DOMFilters from fetch import CachingFetcher -from glowfic import BesideIconLayout, BelowIconLayout, Layout +from glowfic import BesideIconLayout, BelowIconLayout from htmlfilter import ApplyHTMLFilters, HTMLFilters from texfilter import ApplyTexFilters, TexFilters from images import DiskImageStore @@ -87,11 +87,11 @@ See https://faculty.bard.edu/bloch/geometry.pdf for details def spec_from_commandline_args() -> Iterator[Spec]: args = _command_line_parser().parse_args() texifier = PandocTexifier(args.pandoc or 'pandoc') - layout: Layout + layout: bytes if args.layout == 'below': - layout = BelowIconLayout(texifier) + layout = BelowIconLayout elif args.layout == 'beside': - layout = BesideIconLayout(texifier) + layout = BesideIconLayout else: raise ValueError(f'Unknown layout: {args.layout}') log = (lambda _: None) if args.quiet else _print_status diff --git a/glowfic.py b/glowfic.py index b7e7ba7..012bd06 100644 --- a/glowfic.py +++ b/glowfic.py @@ -5,7 +5,6 @@ # Free Software Foundation, version 3. 
-from abc import ABC, abstractmethod from dataclasses import dataclass import itertools from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse @@ -111,36 +110,27 @@ def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: content) -class Layout(ABC): +def renderChunk(texifier: Texifier, chunk: Chunk) -> bytes: + return b''.join([ + br'\glowhead{', + br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'', + b'}{', + texifier.texify(chunk.character) if chunk.character else b'', + b'}{', + texifier.texify(chunk.screen_name) if chunk.screen_name else b'', + b'}{', + texifier.texify(chunk.author) if chunk.author else b'', + b'}', + texifier.texify(chunk.content)]) - @abstractmethod - def renderChunk(self, chunk: Chunk) -> bytes: - raise NotImplementedError() +ContentOnlyLayout = br''' +\newcommand{\glowhead}[4]{} +''' -class ContentOnlyLayout(Layout): - def __init__(self, texifier: Texifier) -> None: - self._texifier = texifier - - def renderChunk(self, chunk: Chunk) -> bytes: - return self._texifier.texify(chunk.content) + b'\n' - - -class BelowIconLayout(Layout): - - def __init__(self, texifier: Texifier) -> None: - self._texifier = texifier - - def renderChunk(self, chunk: Chunk) -> bytes: - meta = [] - if chunk.icon: - meta += [br'\glowicon{%s}' % chunk.icon.encode('UTF-8')] - meta += [self._texifier.texify(x) - for x in [chunk.character, chunk.screen_name, chunk.author] - if x is not None] - - return br'''\wrapstuffclear +BelowIconLayout = br''' +\newcommand{\glowhead}[4]{\wrapstuffclear \begin{wrapstuff}[l] \fbox{ \begin{varwidth}{0.5\textwidth} @@ -152,7 +142,10 @@ class BelowIconLayout(Layout): }\\*} \vspace{-1em} \begin{center} -%s +#1\ifnotempty +{#1}{\\*}#2\ifnotempty +{#2}{\\*}#3\ifnotempty +{#3}{\\*}#4 \end{center} \end{varwidth} } @@ -160,40 +153,25 @@ class BelowIconLayout(Layout): \strut -\noindent %s -''' % ( - br'\\*'.join(meta), - self._texifier.texify(chunk.content)) +\noindent}''' -class BesideIconLayout(Layout): 
+# Why is \textwidth not the width of the text? +# Why is the width of the text .765\textwidth? +BesideIconLayout = br''' +\newcommand{\glowhead}[4]{ - def __init__(self, texifier: Texifier) -> None: - self._texifier = texifier - - def renderChunk(self, chunk: Chunk) -> bytes: - meta = [ - chunk.character, - chunk.screen_name, - chunk.author, - ] +\strut - # Why is \textwidth not the width of the text? - # Why is the width of the text .765\textwidth? - return br'''\noindent\fbox{ -%s +\noindent\fbox{ +#1 \parbox[b]{.765\textwidth}{ \begin{center} -%s +#2\ifnotempty +{#2}{\\*}#3\ifnotempty +{#3}{\\*}#4 \end{center} } }\\* \vspace{-0.75em}\\* -\noindent %s - -\strut - -''' % ( - br'\glowicon{%s}' % chunk.icon.encode('UTF-8') if chunk.icon else b'', - br'\\*'.join(self._texifier.texify(x) for x in meta if x is not None), - self._texifier.texify(chunk.content)) +\noindent}''' diff --git a/paperdoorknob.py b/paperdoorknob.py index 0fb3fe1..86b8766 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -9,7 +9,7 @@ from typing import Any, Iterable from bs4 import BeautifulSoup from args import spec_from_commandline_args -from glowfic import chunkDOMs, flatURL, makeChunk +from glowfic import chunkDOMs, flatURL, makeChunk, renderChunk from spec import Spec @@ -40,7 +40,10 @@ def process(spec: Spec) -> None: \newcommand{\glowicon}[1]{\includegraphics[ width=\glowiconsize,height=\glowiconsize,keepaspectratio ]{#1}} -''' % spec.icon_size) +\newcommand{\ifnotempty}[2]{\expandafter\ifx\expandafter\relax + \detokenize{#1}\relax\else #2\fi} +%s +''' % (spec.icon_size, spec.layout)) url = flatURL(spec.url) spec.log('Fetching HTML...\r') html = spec.fetcher.fetch(url) @@ -53,7 +56,7 @@ def process(spec: Spec) -> None: spec.log(f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r) chunk = makeChunk(r, spec.images) - spec.texout.write(spec.texfilter(spec.layout.renderChunk(chunk))) + spec.texout.write(spec.texfilter(renderChunk(spec.texifier, chunk))) 
spec.log('') spec.texout.write(b'\\end{document}\n') diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index 28408d6..f2cd5f1 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -45,7 +45,7 @@ class BaseTestProcess(ABC): texifier, lambda x: x, 20, - ContentOnlyLayout(texifier), + ContentOnlyLayout, 'margin=20mm', buf, lambda _: None) @@ -54,12 +54,9 @@ class BaseTestProcess(ABC): (\\usepackage{[a-z]+}\n)+\\usepackage\[margin=20mm\]{geometry} \\begin{document} (.|\n)* -This is \\href{https://glowfic.com}{glowfic} - -You \\emph{sure}\? - -Pretty sure. - +\\glowhead{}{}{}{}This is \\href{https://glowfic.com}{glowfic} +\\glowhead{}{}{}{}You \\emph{sure}\? +\\glowhead{}{}{}{}Pretty sure. \\end{document} ''', buf.getvalue()) @@ -76,7 +73,7 @@ Pretty sure. texifier, lambda x: x, 20, - ContentOnlyLayout(texifier), + ContentOnlyLayout, None, buf, lambda _: None) @@ -94,7 +91,7 @@ Pretty sure. texifier, lambda x: x, 20, - BesideIconLayout(texifier), + BesideIconLayout, None, out, lambda _: None) diff --git a/spec.py b/spec.py index 3aa7292..5d708e2 100644 --- a/spec.py +++ b/spec.py @@ -12,7 +12,6 @@ from typing import Callable, IO from bs4.element import Tag from fetch import Fetcher -from glowfic import Layout from images import ImageStore from texify import Texifier @@ -28,7 +27,7 @@ class Spec: texifier: Texifier texfilter: Callable[[bytes], bytes] icon_size: float - layout: Layout + layout: bytes geometry: str | None texout: IO[bytes] log: Callable[[str], None] -- 2.44.1 From 7b4b681229f07e431e0945b41b956a28fa1258e4 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 5 Jan 2024 17:24:46 -0800 Subject: [PATCH 091/100] Show name of thread being processed --- paperdoorknob.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/paperdoorknob.py b/paperdoorknob.py index 86b8766..1350784 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,6 +7,7 @@ from typing import Any, Iterable from bs4 import BeautifulSoup +from 
bs4.element import Tag from args import spec_from_commandline_args from glowfic import chunkDOMs, flatURL, makeChunk, renderChunk @@ -21,6 +22,13 @@ def ilen(it: Iterable[Any]) -> int: return sum(1 for _ in it) +def get_title(dom: BeautifulSoup) -> str | None: + span = dom.findChild("span", id="post-title") + if not isinstance(span, Tag): + return None + return span.text + + def process(spec: Spec) -> None: spec.texout.write(br'''\documentclass{article} \usepackage{booktabs} @@ -51,9 +59,10 @@ def process(spec: Spec) -> None: dom = parse(spec.htmlfilter(html)) spec.log('Counting chunks...\r') num_chunks = ilen(chunkDOMs(dom)) + title = get_title(dom) or "chunk" for i, r in enumerate(chunkDOMs(dom)): percent = 100.0 * i / num_chunks - spec.log(f'Processing chunk {i} of {num_chunks} ({percent:.1f}%)\r') + spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r) chunk = makeChunk(r, spec.images) spec.texout.write(spec.texfilter(renderChunk(spec.texifier, chunk))) -- 2.44.1 From 940270992010ea5b4c912eb02f502923c0487a31 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 11 Jan 2024 18:17:26 -0800 Subject: [PATCH 092/100] Reify Thread --- glowfic.py | 33 +++++++++++++++++++-------------- glowfic_test.py | 24 ++++++++++++------------ paperdoorknob.py | 7 ++++--- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/glowfic.py b/glowfic.py index 012bd06..372b3aa 100644 --- a/glowfic.py +++ b/glowfic.py @@ -54,20 +54,25 @@ class Chunk: # * Humans in the community tend to use "posts" to mean chunks. 
-def chunkDOMs(html: BeautifulSoup) -> Iterable[Tag]: - def text() -> Tag: - body = html.body - assert body - text = body.find_next("div", class_="post-post") - assert isinstance(text, Tag) - return text - - def the_replies() -> Iterable[Tag]: - rs = html.find_all("div", class_="post-reply") - assert all(isinstance(r, Tag) for r in rs) - return rs - - return itertools.chain([text()], the_replies()) +class Thread: + + def __init__(self, html: BeautifulSoup) -> None: + self._html = html + + def chunkDOMs(self) -> Iterable[Tag]: + def text() -> Tag: + body = self._html.body + assert body + text = body.find_next("div", class_="post-post") + assert isinstance(text, Tag) + return text + + def the_replies() -> Iterable[Tag]: + rs = self._html.find_all("div", class_="post-reply") + assert all(isinstance(r, Tag) for r in rs) + return rs + + return itertools.chain([text()], the_replies()) def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: diff --git a/glowfic_test.py b/glowfic_test.py index 2d01c2a..ed32945 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -10,41 +10,41 @@ import unittest from bs4 import BeautifulSoup from images import FakeImageStore -from glowfic import chunkDOMs, makeChunk +from glowfic import makeChunk, Thread from texify import PandocTexifier class TestSplit(unittest.TestCase): def testSplit1(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post" -
''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], +
''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"']]) def testSplit2(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post"
The "reply"
- ''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + ''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['The "reply"']]) def testSplit3(self) -> None: - soup = BeautifulSoup(b''' + t = Thread(BeautifulSoup(b'''
The "post"
1st reply
2nd reply
- ''', 'html.parser') - self.assertEqual([list(t.stripped_strings) for t in chunkDOMs(soup)], + ''', 'html.parser')) + self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['1st reply'], ['2nd reply']]) @@ -52,8 +52,8 @@ class TestMakeChunk(unittest.TestCase): def testEmptyContent(self) -> None: with open('testdata/empty-content.html', 'rb') as f: - soup = BeautifulSoup(f, 'html.parser') - c = makeChunk(next(iter(chunkDOMs(soup))), FakeImageStore()) + t = Thread(BeautifulSoup(f, 'html.parser')) + c = makeChunk(next(iter(t.chunkDOMs())), FakeImageStore()) self.assertEqual( c.icon, 'stored:https://d1anwqy6ci9o1i.cloudfront.net/' + diff --git a/paperdoorknob.py b/paperdoorknob.py index 1350784..ade0b88 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from bs4.element import Tag from args import spec_from_commandline_args -from glowfic import chunkDOMs, flatURL, makeChunk, renderChunk +from glowfic import flatURL, makeChunk, renderChunk, Thread from spec import Spec @@ -57,10 +57,11 @@ def process(spec: Spec) -> None: html = spec.fetcher.fetch(url) spec.log('Parsing HTML...\r') dom = parse(spec.htmlfilter(html)) + thread = Thread(dom) spec.log('Counting chunks...\r') - num_chunks = ilen(chunkDOMs(dom)) + num_chunks = ilen(thread.chunkDOMs()) title = get_title(dom) or "chunk" - for i, r in enumerate(chunkDOMs(dom)): + for i, r in enumerate(thread.chunkDOMs()): percent = 100.0 * i / num_chunks spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') spec.domfilter(r) -- 2.44.1 From a18519bf16f2983b139cffaa731d3ea8e900f542 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 11 Jan 2024 18:40:11 -0800 Subject: [PATCH 093/100] =?utf8?q?Rename=20html=20=E2=86=92=20dom?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- glowfic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/glowfic.py b/glowfic.py index 
372b3aa..746a7a2 100644 --- a/glowfic.py +++ b/glowfic.py @@ -56,19 +56,19 @@ class Chunk: class Thread: - def __init__(self, html: BeautifulSoup) -> None: - self._html = html + def __init__(self, dom: BeautifulSoup) -> None: + self._dom = dom def chunkDOMs(self) -> Iterable[Tag]: def text() -> Tag: - body = self._html.body + body = self._dom.body assert body text = body.find_next("div", class_="post-post") assert isinstance(text, Tag) return text def the_replies() -> Iterable[Tag]: - rs = self._html.find_all("div", class_="post-reply") + rs = self._dom.find_all("div", class_="post-reply") assert all(isinstance(r, Tag) for r in rs) return rs -- 2.44.1 From 21e82200b2887fc4f4f9eff4023f4a73547cacb5 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 11 Jan 2024 19:00:56 -0800 Subject: [PATCH 094/100] Move get_title() to Thread --- glowfic.py | 6 ++++++ glowfic_test.py | 15 +++++++++++++++ paperdoorknob.py | 10 +--------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/glowfic.py b/glowfic.py index 746a7a2..92ae4ab 100644 --- a/glowfic.py +++ b/glowfic.py @@ -59,6 +59,12 @@ class Thread: def __init__(self, dom: BeautifulSoup) -> None: self._dom = dom + def title(self) -> str | None: + span = self._dom.findChild("span", id="post-title") + if not isinstance(span, Tag): + return None + return span.text.strip() + def chunkDOMs(self) -> Iterable[Tag]: def text() -> Tag: body = self._dom.body diff --git a/glowfic_test.py b/glowfic_test.py index ed32945..64da164 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -72,5 +72,20 @@ class TestMakeChunk(unittest.TestCase): PandocTexifier("pandoc").texify(c.character), b'{Keltham}\n') +class TestThread(unittest.TestCase): + + def testTitle(self) -> None: + t = Thread(BeautifulSoup(b''' + +
+ + Teh Story! + +
+
The "post"
+ ''', 'html.parser')) + self.assertEqual(t.title(), 'Teh Story!') + + if __name__ == '__main__': unittest.main() diff --git a/paperdoorknob.py b/paperdoorknob.py index ade0b88..cf50057 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,7 +7,6 @@ from typing import Any, Iterable from bs4 import BeautifulSoup -from bs4.element import Tag from args import spec_from_commandline_args from glowfic import flatURL, makeChunk, renderChunk, Thread @@ -22,13 +21,6 @@ def ilen(it: Iterable[Any]) -> int: return sum(1 for _ in it) -def get_title(dom: BeautifulSoup) -> str | None: - span = dom.findChild("span", id="post-title") - if not isinstance(span, Tag): - return None - return span.text - - def process(spec: Spec) -> None: spec.texout.write(br'''\documentclass{article} \usepackage{booktabs} @@ -60,7 +52,7 @@ def process(spec: Spec) -> None: thread = Thread(dom) spec.log('Counting chunks...\r') num_chunks = ilen(thread.chunkDOMs()) - title = get_title(dom) or "chunk" + title = thread.title() or "chunk" for i, r in enumerate(thread.chunkDOMs()): percent = 100.0 * i / num_chunks spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') -- 2.44.1 From 70adfbff16123a62d6b25f8ead23789d831d1d70 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 11 Jan 2024 19:30:21 -0800 Subject: [PATCH 095/100] Optionally have Thread.__init__ fetch the HTML --- glowfic.py | 12 ++++++++++-- paperdoorknob.py | 15 ++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/glowfic.py b/glowfic.py index 92ae4ab..8029b5d 100644 --- a/glowfic.py +++ b/glowfic.py @@ -15,6 +15,7 @@ from bs4 import BeautifulSoup from bs4.element import Tag from images import ImageStore +from spec import Spec from texify import Texifier @@ -56,8 +57,15 @@ class Chunk: class Thread: - def __init__(self, dom: BeautifulSoup) -> None: - self._dom = dom + def __init__(self, thing: BeautifulSoup | Spec) -> None: + if isinstance(thing, Spec): + spec = thing + spec.log('Fetching 
HTML...\r') + html = spec.fetcher.fetch(flatURL(spec.url)) + spec.log('Parsing HTML...\r') + self._dom = BeautifulSoup(spec.htmlfilter(html), 'html.parser') + else: + self._dom = thing def title(self) -> str | None: span = self._dom.findChild("span", id="post-title") diff --git a/paperdoorknob.py b/paperdoorknob.py index cf50057..d545f61 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -6,17 +6,11 @@ from typing import Any, Iterable -from bs4 import BeautifulSoup - from args import spec_from_commandline_args -from glowfic import flatURL, makeChunk, renderChunk, Thread +from glowfic import makeChunk, renderChunk, Thread from spec import Spec -def parse(content: bytes) -> BeautifulSoup: - return BeautifulSoup(content, 'html.parser') - - def ilen(it: Iterable[Any]) -> int: return sum(1 for _ in it) @@ -44,12 +38,7 @@ def process(spec: Spec) -> None: \detokenize{#1}\relax\else #2\fi} %s ''' % (spec.icon_size, spec.layout)) - url = flatURL(spec.url) - spec.log('Fetching HTML...\r') - html = spec.fetcher.fetch(url) - spec.log('Parsing HTML...\r') - dom = parse(spec.htmlfilter(html)) - thread = Thread(dom) + thread = Thread(spec) spec.log('Counting chunks...\r') num_chunks = ilen(thread.chunkDOMs()) title = thread.title() or "chunk" -- 2.44.1 From 19ed28f37b59e480cebe8f122e07ce18ce536db8 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 11 Jan 2024 20:57:25 -0800 Subject: [PATCH 096/100] Always have Thread.__init__ fetch the HTML --- glowfic.py | 14 +++++--------- glowfic_test.py | 38 +++++++++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/glowfic.py b/glowfic.py index 8029b5d..5ffaac3 100644 --- a/glowfic.py +++ b/glowfic.py @@ -57,15 +57,11 @@ class Chunk: class Thread: - def __init__(self, thing: BeautifulSoup | Spec) -> None: - if isinstance(thing, Spec): - spec = thing - spec.log('Fetching HTML...\r') - html = spec.fetcher.fetch(flatURL(spec.url)) - spec.log('Parsing HTML...\r') - self._dom = 
BeautifulSoup(spec.htmlfilter(html), 'html.parser') - else: - self._dom = thing + def __init__(self, spec: Spec) -> None: + spec.log('Fetching HTML...\r') + html = spec.fetcher.fetch(flatURL(spec.url)) + spec.log('Parsing HTML...\r') + self._dom = BeautifulSoup(spec.htmlfilter(html), 'html.parser') def title(self) -> str | None: span = self._dom.findChild("span", id="post-title") diff --git a/glowfic_test.py b/glowfic_test.py index 64da164..52d6f86 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -5,45 +5,61 @@ # Free Software Foundation, version 3. +from sys import stderr import unittest -from bs4 import BeautifulSoup - +from fetch import FakeFetcher from images import FakeImageStore from glowfic import makeChunk, Thread +from spec import Spec from texify import PandocTexifier +def spec_for_testing(html: bytes) -> Spec: + return Spec('test', + FakeFetcher({'test?view=flat': html}), + FakeImageStore(), + lambda x: x, + lambda x: None, + PandocTexifier('pandoc'), + lambda x: x, + 20, + b'', + None, + stderr.buffer, + lambda x: None) + + class TestSplit(unittest.TestCase): def testSplit1(self) -> None: - t = Thread(BeautifulSoup(b''' + t = Thread(spec_for_testing(b'''
The "post" -
''', 'html.parser')) +
''')) self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"']]) def testSplit2(self) -> None: - t = Thread(BeautifulSoup(b''' + t = Thread(spec_for_testing(b'''
The "post"
The "reply"
- ''', 'html.parser')) + ''')) self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['The "reply"']]) def testSplit3(self) -> None: - t = Thread(BeautifulSoup(b''' + t = Thread(spec_for_testing(b'''
The "post"
1st reply
2nd reply
- ''', 'html.parser')) + ''')) self.assertEqual([list(t.stripped_strings) for t in t.chunkDOMs()], [['The "post"'], ['1st reply'], ['2nd reply']]) @@ -52,7 +68,7 @@ class TestMakeChunk(unittest.TestCase): def testEmptyContent(self) -> None: with open('testdata/empty-content.html', 'rb') as f: - t = Thread(BeautifulSoup(f, 'html.parser')) + t = Thread(spec_for_testing(f.read((9999)))) c = makeChunk(next(iter(t.chunkDOMs())), FakeImageStore()) self.assertEqual( c.icon, @@ -75,7 +91,7 @@ class TestMakeChunk(unittest.TestCase): class TestThread(unittest.TestCase): def testTitle(self) -> None: - t = Thread(BeautifulSoup(b''' + t = Thread(spec_for_testing(b'''
@@ -83,7 +99,7 @@ class TestThread(unittest.TestCase):
The "post"
- ''', 'html.parser')) + ''')) self.assertEqual(t.title(), 'Teh Story!') -- 2.44.1 From 53f396b459889b46e6feea48e8fa8e89b2452e9f Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Fri, 12 Jan 2024 04:05:14 -0800 Subject: [PATCH 097/100] Fetch the non-flat view to get the next-thread link --- glowfic.py | 18 ++++++++++++++++-- glowfic_test.py | 14 +++++++++++++- paperdoorknob_test.py | 3 ++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/glowfic.py b/glowfic.py index 5ffaac3..81a5342 100644 --- a/glowfic.py +++ b/glowfic.py @@ -58,10 +58,21 @@ class Chunk: class Thread: def __init__(self, spec: Spec) -> None: + def find_next_thread(dom: BeautifulSoup) -> str | None: + for c in dom.findChildren('div', class_='post-navheader'): + for a in c.findChildren('a'): + if 'Next Post' in a.text and 'href' in a.attrs and isinstance( + a.attrs['href'], str): + return a.attrs['href'] + return None + spec.log('Fetching HTML...\r') - html = spec.fetcher.fetch(flatURL(spec.url)) + html = spec.fetcher.fetch(spec.url) + flat_html = spec.fetcher.fetch(flatURL(spec.url)) spec.log('Parsing HTML...\r') - self._dom = BeautifulSoup(spec.htmlfilter(html), 'html.parser') + self._next_thread = find_next_thread( + BeautifulSoup(spec.htmlfilter(html), 'html.parser')) + self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser') def title(self) -> str | None: span = self._dom.findChild("span", id="post-title") @@ -69,6 +80,9 @@ class Thread: return None return span.text.strip() + def next_thread(self) -> str | None: + return self._next_thread + def chunkDOMs(self) -> Iterable[Tag]: def text() -> Tag: body = self._dom.body diff --git a/glowfic_test.py b/glowfic_test.py index 52d6f86..68debf9 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -17,7 +17,7 @@ from texify import PandocTexifier def spec_for_testing(html: bytes) -> Spec: return Spec('test', - FakeFetcher({'test?view=flat': html}), + FakeFetcher({'test': html, 'test?view=flat': html}), FakeImageStore(), lambda 
x: x, lambda x: None, @@ -102,6 +102,18 @@ class TestThread(unittest.TestCase): ''')) self.assertEqual(t.title(), 'Teh Story!') + def testNextThread(self) -> None: + t = Thread(spec_for_testing(b''' + +
+
Next Post »
+
« Previous Post
+
+
+
The "post"
+ ''')) + self.assertEqual(t.next_thread(), '/posts/4567') + if __name__ == '__main__': unittest.main() diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index f2cd5f1..571dda6 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -121,7 +121,8 @@ class TestProcessFromFakeFetcher(BaseTestProcess, unittest.TestCase): def fetcher(self) -> Fetcher: with open('testdata/this-is-glowfic.html', 'rb') as f: - return FakeFetcher({'fic?view=flat': f.read(9999)}) + html = f.read(9999) + return FakeFetcher({'fic': html, 'fic?view=flat': html}) if __name__ == '__main__': -- 2.44.1 From 075f2f08f0f67d449e04e64f5a85742e52e50d30 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 25 Jan 2024 18:06:43 -0800 Subject: [PATCH 098/100] Use a more URL-looking url in tests --- glowfic_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/glowfic_test.py b/glowfic_test.py index 68debf9..22eea25 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -16,8 +16,9 @@ from texify import PandocTexifier def spec_for_testing(html: bytes) -> Spec: - return Spec('test', - FakeFetcher({'test': html, 'test?view=flat': html}), + return Spec('https://fake/test', + FakeFetcher({'https://fake/test': html, + 'https://fake/test?view=flat': html}), FakeImageStore(), lambda x: x, lambda x: None, -- 2.44.1 From fc2aaa75baff15d201153efeeda0c4c5898e4822 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 25 Jan 2024 18:09:18 -0800 Subject: [PATCH 099/100] next_thread should be an absolute URL --- glowfic.py | 4 ++-- glowfic_test.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/glowfic.py b/glowfic.py index 81a5342..9d863c8 100644 --- a/glowfic.py +++ b/glowfic.py @@ -7,7 +7,7 @@ from dataclasses import dataclass import itertools -from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse +from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse from typing import Iterable @@ -63,7 +63,7 @@ 
class Thread: for a in c.findChildren('a'): if 'Next Post' in a.text and 'href' in a.attrs and isinstance( a.attrs['href'], str): - return a.attrs['href'] + return urljoin(spec.url, a.attrs['href']) return None spec.log('Fetching HTML...\r') diff --git a/glowfic_test.py b/glowfic_test.py index 22eea25..bd26e3c 100644 --- a/glowfic_test.py +++ b/glowfic_test.py @@ -103,7 +103,7 @@ class TestThread(unittest.TestCase): ''')) self.assertEqual(t.title(), 'Teh Story!') - def testNextThread(self) -> None: + def testNextThreadRelative(self) -> None: t = Thread(spec_for_testing(b'''
@@ -113,7 +113,19 @@ class TestThread(unittest.TestCase):
The "post"
''')) - self.assertEqual(t.next_thread(), '/posts/4567') + self.assertEqual(t.next_thread(), 'https://fake/posts/4567') + + def testNextThreadAbsolute(self) -> None: + t = Thread(spec_for_testing(b''' + +
+
Next Post »
+
« Previous Post
+
+
+
The "post"
+ ''')) + self.assertEqual(t.next_thread(), 'https://elsewhere/posts/4567') if __name__ == '__main__': -- 2.44.1 From 24b87badcef35a53477aec067ee61cc9dd97b2a5 Mon Sep 17 00:00:00 2001 From: Scott Worley Date: Thu, 25 Jan 2024 18:20:46 -0800 Subject: [PATCH 100/100] Move per-thread processing stuff into Thread --- glowfic.py | 21 ++++++++++++++++++++- paperdoorknob.py | 20 ++------------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/glowfic.py b/glowfic.py index 9d863c8..ef71eef 100644 --- a/glowfic.py +++ b/glowfic.py @@ -9,7 +9,7 @@ from dataclasses import dataclass import itertools from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse -from typing import Iterable +from typing import Any, Iterable from bs4 import BeautifulSoup from bs4.element import Tag @@ -19,6 +19,10 @@ from spec import Spec from texify import Texifier +def ilen(it: Iterable[Any]) -> int: + return sum(1 for _ in it) + + def _removeViewFromURL(url: str) -> str: u = urlparse(url) old_qs = parse_qsl(u.query) @@ -73,6 +77,7 @@ class Thread: self._next_thread = find_next_thread( BeautifulSoup(spec.htmlfilter(html), 'html.parser')) self._dom = BeautifulSoup(spec.htmlfilter(flat_html), 'html.parser') + self._spec = spec def title(self) -> str | None: span = self._dom.findChild("span", id="post-title") @@ -98,6 +103,20 @@ class Thread: return itertools.chain([text()], the_replies()) + def emit(self) -> None: + self._spec.log('Counting chunks...\r') + num_chunks = ilen(self.chunkDOMs()) + title = self.title() or "chunk" + for i, r in enumerate(self.chunkDOMs()): + percent = 100.0 * i / num_chunks + self._spec.log( + f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') + self._spec.domfilter(r) + chunk = makeChunk(r, self._spec.images) + self._spec.texout.write( + self._spec.texfilter(renderChunk(self._spec.texifier, chunk))) + self._spec.log('') + def makeChunk(chunk_dom: Tag, image_store: ImageStore) -> Chunk: diff --git a/paperdoorknob.py 
b/paperdoorknob.py index d545f61..ed7813b 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -4,17 +4,11 @@ # under the terms of the GNU General Public License as published by the # Free Software Foundation, version 3. -from typing import Any, Iterable - from args import spec_from_commandline_args -from glowfic import makeChunk, renderChunk, Thread +from glowfic import Thread from spec import Spec -def ilen(it: Iterable[Any]) -> int: - return sum(1 for _ in it) - - def process(spec: Spec) -> None: spec.texout.write(br'''\documentclass{article} \usepackage{booktabs} @@ -38,17 +32,7 @@ def process(spec: Spec) -> None: \detokenize{#1}\relax\else #2\fi} %s ''' % (spec.icon_size, spec.layout)) - thread = Thread(spec) - spec.log('Counting chunks...\r') - num_chunks = ilen(thread.chunkDOMs()) - title = thread.title() or "chunk" - for i, r in enumerate(thread.chunkDOMs()): - percent = 100.0 * i / num_chunks - spec.log(f'Processing {title} {i} of {num_chunks} ({percent:.1f}%)\r') - spec.domfilter(r) - chunk = makeChunk(r, spec.images) - spec.texout.write(spec.texfilter(renderChunk(spec.texifier, chunk))) - spec.log('') + Thread(spec).emit() spec.texout.write(b'\\end{document}\n') -- 2.44.1