git.scottworley.com Git - paperdoorknob/commitdiff
Strip all &nbsp;
authorScott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 03:39:16 +0000 (19:39 -0800)
committerScott Worley <scottworley@scottworley.com>
Wed, 20 Dec 2023 05:12:11 +0000 (21:12 -0800)
Project Lawful does not use &nbsp; carefully/thoughtfully.  They are all
over the place, at the end of every sentence, randomly around <em>s,
etc.  Just remove them all.

But allow this behavior to be flag-controlled, in case this step is not
desired when processing other works.

args.py
htmlfilter.py [new file with mode: 0644]
htmlfilter_test.py [new file with mode: 0644]
paperdoorknob.py
paperdoorknob_test.py
setup.py
spec.py

diff --git a/args.py b/args.py
index 2a03836854777b6ce3a6b7b1d4171923508b7e0e..1931d4271f242dc918287d28a7bedf2e4ee71f81 100644 (file)
--- a/args.py
+++ b/args.py
@@ -14,6 +14,7 @@ from typing import Iterator
 from xdg_base_dirs import xdg_cache_home
 
 from fetch import CachingFetcher
+from htmlfilter import ApplyHTMLFilters, HTMLFilters
 from spec import Spec
 from texify import PandocTexifier
 
@@ -25,6 +26,10 @@ def _command_line_parser() -> ArgumentParser:
         metavar='PATH',
         help='Where to keep the http cache (instead of %(default)s)',
         default=os.path.join(xdg_cache_home(), "paperdoorknob"))
+    parser.add_argument(
+        '--htmlfilters',
+        help='Which HTML filters to use (default: %(default)s)',
+        default=','.join(f[0] for f in HTMLFilters))
     parser.add_argument(
         '--out',
         help='The filename stem at which to write output ' +
@@ -44,4 +49,9 @@ def spec_from_commandline_args() -> Iterator[Spec]:
     args = _command_line_parser().parse_args()
     with CachingFetcher(args.cache_path, args.timeout) as fetcher:
         with open(args.out + '.tex', 'wb') as texout:
-            yield Spec(args.url, fetcher, PandocTexifier(args.pandoc or 'pandoc'), texout)
+            yield Spec(
+                args.url,
+                fetcher,
+                lambda x: ApplyHTMLFilters(args.htmlfilters, x),
+                PandocTexifier(args.pandoc or 'pandoc'),
+                texout)
diff --git a/htmlfilter.py b/htmlfilter.py
new file mode 100644 (file)
index 0000000..9bc9b6f
--- /dev/null
+++ b/htmlfilter.py
@@ -0,0 +1,31 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import re
+
+from typing import Callable, List, Tuple
+
+
+HTMLFilters: List[Tuple[str, Callable[[bytes], bytes]]] = [
+    ("NoNBSP", lambda x: re.sub(b"(&nbsp;|\xc2\xa0)", b" ", x)),
+]
+
+
+class HTMLFilterError(Exception):
+    pass
+
+
+def ApplyHTMLFilters(filter_list: str, data: bytes) -> bytes:
+    for filter_name in filter_list.split(','):
+        filters = [f for (name, f) in HTMLFilters if name == filter_name]
+        if len(filters) == 0:
+            raise HTMLFilterError(f"Unknown HTML filter: {filter_name}")
+        if len(filters) > 1:
+            raise HTMLFilterError(
+                f"Multiple HTML filters with the same name!?: {filter_name}")
+        data = filters[0](data)
+    return data
diff --git a/htmlfilter_test.py b/htmlfilter_test.py
new file mode 100644 (file)
index 0000000..6b5584c
--- /dev/null
+++ b/htmlfilter_test.py
@@ -0,0 +1,23 @@
+# paperdoorknob: Print glowfic
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, version 3.
+
+
+import unittest
+from htmlfilter import ApplyHTMLFilters
+
+
+class TestHTMLFilters(unittest.TestCase):
+
+    def testStripNBSP(self) -> None:
+        self.assertEqual(
+            ApplyHTMLFilters(
+                "NoNBSP",
+                b"<p>It's&nbsp;<em>really&nbsp;</em>cold.</p>"),
+            b"<p>It's <em>really </em>cold.</p>")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paperdoorknob.py b/paperdoorknob.py
index 653d0c65d8d781790aceaff6d59f91a742a277ca..34223e4c35eb7636370c1032764de6455c075933 100644 (file)
--- a/paperdoorknob.py
+++ b/paperdoorknob.py
@@ -46,7 +46,7 @@ def replies(html: BeautifulSoup) -> Iterable[Tag]:
 
 def process(spec: Spec) -> None:
     spec.texout.write(b'\\documentclass{article}\n\\begin{document}\n')
-    html = clean(parse(spec.fetcher.fetch(spec.url)))
+    html = clean(parse(spec.htmlfilter(spec.fetcher.fetch(spec.url))))
     for r in replies(html):
         spec.texout.write(spec.texifier.texify(r))
     spec.texout.write(b'\\end{document}\n')
diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py
index 425d5e3f21c6c8cb827ba8694dae0a7435608f89..14f6dd9cde0057e459c3fa7bd961c1fdf02fc436 100644 (file)
--- a/paperdoorknob_test.py
+++ b/paperdoorknob_test.py
@@ -46,6 +46,7 @@ class BaseTestProcess(ABC):
         spec = Spec(
             self.url(),
             self.fetcher(),
+            lambda x: x,
             PandocTexifier('pandoc'),
             buf)
         paperdoorknob.process(spec)
@@ -61,7 +62,7 @@ Pretty sure.
         texifier = VerifyingTexifier(
             PandocTexifier('pandoc'), DirectTexifier())
         buf = io.BytesIO()
-        spec = Spec(self.url(), self.fetcher(), texifier, buf)
+        spec = Spec(self.url(), self.fetcher(), lambda x: x, texifier, buf)
         paperdoorknob.process(spec)
 
     def testPDF(self) -> None:
@@ -69,6 +70,7 @@ Pretty sure.
             spec = Spec(
                 self.url(),
                 self.fetcher(),
+                lambda x: x,
                 PandocTexifier('pandoc'),
                 out)
             paperdoorknob.process(spec)
index edaa429bc0fd6504f2647b9618074ec381534a81..e83ec01dfe86a4e1c8aca6e6fd3d0d020a245736 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ setup(
     py_modules=[
         'args',
         'fetch',
+        'htmlfilter',
         'paperdoorknob',
         'spec',
         'texify',
diff --git a/spec.py b/spec.py
index 132289791a0f8ad45544995a228d8d175f156928..98832bfddfed9978ea67e14edca2b5900d5bc56d 100644 (file)
--- a/spec.py
+++ b/spec.py
@@ -7,7 +7,7 @@
 
 from dataclasses import dataclass
 
-from typing import IO
+from typing import Callable, IO
 
 from fetch import Fetcher
 from texify import Texifier
@@ -17,5 +17,6 @@ from texify import Texifier
 class Spec:
     url: str
     fetcher: Fetcher
+    htmlfilter: Callable[[bytes], bytes]
     texifier: Texifier
     texout: IO[bytes]