From 929db57622c6b8756ed341d7ae2862126a7c638f Mon Sep 17 00:00:00 2001
From: Scott Worley <scottworley@scottworley.com>
Date: Tue, 19 Dec 2023 19:39:16 -0800
Subject: [PATCH] Strip all &nbsp;

Project Lawful does not use &nbsp; carefully/thoughtfully.  They are all
over the place, at the end of every sentence, randomly around <em>s,
etc.  Just replace them all with ordinary spaces.

But allow this behavior to be flag-controlled, in case this step is not
desired when processing other works.
---
 args.py               | 12 +++++++++++-
 htmlfilter.py         | 31 +++++++++++++++++++++++++++++++
 htmlfilter_test.py    | 23 +++++++++++++++++++++++
 paperdoorknob.py      |  2 +-
 paperdoorknob_test.py |  4 +++-
 setup.py              |  1 +
 spec.py               |  3 ++-
 7 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 htmlfilter.py
 create mode 100644 htmlfilter_test.py
diff --git a/args.py b/args.py
index 2a03836..1931d42 100644
--- a/args.py
+++ b/args.py
@@ -14,6 +14,7 @@ from typing import Iterator
 from xdg_base_dirs import xdg_cache_home
 
 from fetch import CachingFetcher
+from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
 from texify import PandocTexifier
 
@@ -25,6 +26,10 @@ def _command_line_parser() -> ArgumentParser:
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+    parser.add_argument(
+        '--htmlfilters',
+        help='Which HTML filters to use (default: %(default)s)',
+        default=','.join(f[0] for f in HTMLFilters))
     parser.add_argument(
         '--out',
         help='The filename stem at which to write output ' +
@@ -44,4 +49,9 @@ def spec_from_commandline_args() -> Iterator[Spec]:
     args = _command_line_parser().parse_args()
     with CachingFetcher(args.cache_path, args.timeout) as fetcher:
         with open(args.out + '.tex', 'wb') as texout:
-            yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout)
+            yield Spec(
+                args.url,
+                fetcher,
+                lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+                PandocTexifier(args.pandoc or 'pandoc'),
+                texout)
diff --git a/htmlfilter.py b/htmlfilter.py
new file mode 100644
index 0000000..9bc9b6f
--- /dev/null
+++ b/htmlfilter.py
@@ -0,0 +1,49 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+"""Named byte-level filters applied to fetched HTML before it is parsed."""
+
+import re
+
+from typing import Callable, List, Tuple
+
+
+# Registry of (name, filter) pairs.  The names are the values accepted by
+# the --htmlfilters flag; each filter maps HTML bytes to HTML bytes.
+HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [
+    # Replace both the &nbsp; entity and the UTF-8 bytes of U+00A0 with an
+    # ordinary space.
+    ("NoNBSP", lambda x: re.sub(b"(&nbsp;|\xc2\xa0)", b" ", x)),
+]
+
+
+class HTMLFilterError(Exception):
+    """Raised when a filter name does not resolve to exactly one filter."""
+
+
+def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes:
+    """Run the named filters over data, in the order listed.
+
+    filter_list is a comma-separated list of names from HTMLFilters.
+    Whitespace around a name is ignored and empty names are skipped, so an
+    empty filter_list applies no filters and returns data unchanged.
+
+    Raises HTMLFilterError if a name matches no registered filter, or
+    matches more than one.
+    """
+    for filter_name in (part.strip() for part in filter_list.split(',')):
+        if not filter_name:
+            # ''.split(',') is [''], so without this check there would be
+            # no way to ask for "no filters" at all.
+            continue
+        filters = [f for (name, f) in HTMLFilters if name == filter_name]
+        if not filters:
+            raise HTMLFilterError(f"Unknown HTML filter: {filter_name}")
+        if len(filters) > 1:
+            raise HTMLFilterError(
+                f"Multiple HTML filters with the same name!?: {filter_name}")
+        data = filters[0](data)
+    return data
diff --git a/htmlfilter_test.py b/htmlfilter_test.py
new file mode 100644
index 0000000..6b5584c
--- /dev/null
+++ b/htmlfilter_test.py
@@ -0,0 +1,23 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+from htmlfilter import ApplyHTMLFilters
+
+
+class TestHTMLFilters(unittest.TestCase):
+    """Checks for the named filters in htmlfilter."""
+
+    def testStripNBSP(self) -> None:
+        # NoNBSP turns every &nbsp; entity into an ordinary space.
+        html = b"<p>It's&nbsp;<em>really&nbsp;</em>cold.</p>"
+        want = b"<p>It's <em>really </em>cold.</p>"
+        self.assertEqual(ApplyHTMLFilters("NoNBSP", html), want)
+
+
+if __name__ == '__main__':  # run the tests when executed directly
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py
index 653d0c6..34223e4 100644
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -46,7 +46,7 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]:
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
-    html = clean(parse(spec.fetcher.fetch(spec.url)))
+    html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
     for r in replies(html):
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py
index 425d5e3..14f6dd9 100644
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py
@@ -46,6 +46,7 @@ class BaseTestProcess(ABC):
         spec = Spec(
             self.url(),
             self.fetcher(),
+            lambda x: x,
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
@@ -61,7 +62,7 @@ Pretty sure.
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
-        spec = Spec(self.url(), self.fetcher(), texifier, buf)
+        spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
@@ -69,6 +70,7 @@ Pretty sure.
             spec = Spec(
                 self.url(),
                 self.fetcher(),
+                lambda x: x,
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
diff --git a/setup.py b/setup.py
index edaa429..e83ec01 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ setup(
     py_modules=[
         'args',
         'fetch',
+        'htmlfilter',
         'paperdoorknob',
         'spec',
         'texify',
diff --git a/spec.py b/spec.py
index 1322897..98832bf 100644
--- a/spec.py
+++ b/spec.py
@@ -7,7 +7,7 @@
 
 from dataclasses import dataclass
 
-from typing import IO
+from typing import Callable, IO
 
 from fetch import Fetcher
 from texify import Texifier
@@ -17,5 +17,6 @@ from texify import Texifier
 class Spec:
     url: str
     fetcher: Fetcher
+    htmlfilter: Callable[[bytes], bytes]
     texifier: Texifier
     texout: IO[bytes]
-- 
2.44.1