git.scottworley.com Git - paperdoorknob/commitdiff
Strip all &nbsp;
author Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 03:39:16 +0000 (19:39 -0800)
committer Scott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 05:12:11 +0000 (21:12 -0800)
Project Lawful does not use &nbsp; carefully/thoughtfully.  They are all
over the place: at the end of every sentence, randomly around <em>s,
etc.  Just replace them all with ordinary spaces.

But allow this behavior to be flag-controlled, in case this step is not
desired when processing other works.

args.py
htmlfilter.py [new file with mode: 0644]
htmlfilter_test.py [new file with mode: 0644]
paperdoorknob.py
paperdoorknob_test.py
setup.py
spec.py

diff --git a/args.py b/args.py
index 2a03836854777b6ce3a6b7b1d4171923508b7e0e..1931d4271f242dc918287d28a7bedf2e4ee71f81 100644 (file)
--- a/args.py
+++ b/args.py
@@ -14,6 +14,7 @@ from typing import Iterator
 from xdg_base_dirs import xdg_cache_home
 
 from fetch import CachingFetcher
 from xdg_base_dirs import xdg_cache_home
 
 from fetch import CachingFetcher
+from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
 from texify import PandocTexifier
 
 from spec import Spec
 from texify import PandocTexifier
 
@@ -25,6 +26,10 @@ def _command_line_parser() -> ArgumentParser:
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+    parser.add_argument(
+        '--htmlfilters',
+        help='Which HTML filters to use (default: %(default)s)',
+        default=','.join(f[0] for f in HTMLFilters))
     parser.add_argument(
         '--out',
         help='The filename stem at which to write output ' +
     parser.add_argument(
         '--out',
         help='The filename stem at which to write output ' +
@@ -44,4 +49,9 @@ def spec_from_commandline_args() -> Iterator[Spec]:
     args = _command_line_parser().parse_args()
     with CachingFetcher(args.cache_path, args.timeout) as fetcher:
         with open(args.out + '.tex', 'wb') as texout:
     args = _command_line_parser().parse_args()
     with CachingFetcher(args.cache_path, args.timeout) as fetcher:
         with open(args.out + '.tex', 'wb') as texout:
-            yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout)
+            yield Spec(
+                args.url,
+                fetcher,
+                lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+                PandocTexifier(args.pandoc or 'pandoc'),
+                texout)
diff --git a/htmlfilter.py b/htmlfilter.py
new file mode 100644 (file)
index 0000000..9bc9b6f
--- /dev/null
+++ b/htmlfilter.py
@@ -0,0 +1,31 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import re
+
+from typing import Callable, List, Tuple
+
+
+HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [
+    ("NoNBSP", lambda x: re.sub(b"(&nbsp;|\xc2\xa0)", b" ", x)),
+]
+
+
+class HTMLFilterError(Exception):
+    pass
+
+
+def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes:
+    for filter_name in filter_list.split(','):
+        filters = [f for (name, f) in HTMLFilters if name == filter_name]
+        if len(filters) == 0:
+            raise HTMLFilterError(f"Unknown HTML filter: {filter_name}")
+        if len(filters) > 1:
+            raise HTMLFilterError(
+                f"Multiple HTML filters with the same name!?: {filter_name}")
+        data = filters[0](data)
+    return data
diff --git a/htmlfilter_test.py b/htmlfilter_test.py
new file mode 100644 (file)
index 0000000..6b5584c
--- /dev/null
+++ b/htmlfilter_test.py
@@ -0,0 +1,23 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+from htmlfilter import ApplyHTMLFilters
+
+
+class TestHTMLFilters(unittest.TestCase):
+
+    def testStripNBSP(self) -> None:
+        self.assertEqual(
+            ApplyHTMLFilters(
+                "NoNBSP",
+                b"<p>It's&nbsp;<em>really&nbsp;</em>cold.</p>"),
+            b"<p>It's <em>really </em>cold.</p>")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py
index 653d0c65d8d781790aceaff6d59f91a742a277ca..34223e4c35eb7636370c1032764de6455c075933 100644 (file)
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -46,7 +46,7 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]:
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
-    html = clean(parse(spec.fetcher.fetch(spec.url)))
+    html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
     for r in replies(html):
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
     for r in replies(html):
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
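diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py
index 425d5e3f21c6c8cb827ba8694dae0a7435608f89..14f6dd9cde0057e459c3fa7bd961c1fdf02fc436 100644 (file)
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py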
@@ -46,6 +46,7 @@ class BaseTestProcess(ABC):
         spec = Spec(
             self.url(),
             self.fetcher(),
         spec = Spec(
             self.url(),
             self.fetcher(),
+            lambda x: x,
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
@@ -61,7 +62,7 @@ Pretty sure.
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
-        spec = Spec(self.url(), self.fetcher(), texifier, buf)
+        spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
@@ -69,6 +70,7 @@ Pretty sure.
             spec = Spec(
                 self.url(),
                 self.fetcher(),
             spec = Spec(
                 self.url(),
                 self.fetcher(),
+                lambda x: x,
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
diff --git a/setup.py b/setup.py
index edaa429bc0fd6504f2647b9618074ec381534a81..e83ec01dfe86a4e1c8aca6e6fd3d0d020a245736 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ setup(
     py_modules=[
         'args',
         'fetch',
     py_modules=[
         'args',
         'fetch',
+        'htmlfilter',
         'paperdoorknob',
         'spec',
         'texify',
         'paperdoorknob',
         'spec',
         'texify',
diff --git a/spec.py b/spec.py
index 132289791a0f8ad45544995a228d8d175f156928..98832bfddfed9978ea67e14edca2b5900d5bc56d 100644 (file)
--- a/spec.py
+++ b/spec.py
@@ -7,7 +7,7 @@
 
 from dataclasses import dataclass
 
 
 from dataclasses import dataclass
 
-from typing import IO
+from typing import Callable, IO
 
 from fetch import Fetcher
 from texify import Texifier
 
 from fetch import Fetcher
 from texify import Texifier
@@ -17,5 +17,6 @@ from texify import Texifier
 class Spec:
     url: str
     fetcher: Fetcher
 class Spec:
     url: str
     fetcher: Fetcher
+    htmlfilter: Callable[[bytes], bytes]
     texifier: Texifier
     texout: IO[bytes]
     texifier: Texifier
     texout: IO[bytes]