From: Scott Worley Date: Fri, 24 Nov 2023 02:34:52 +0000 (-0800) Subject: Parse HTML X-Git-Url: http://git.scottworley.com/paperdoorknob/commitdiff_plain/136277e30143cd1219c896bb4980027ac3c6dbe1?ds=inline Parse HTML --- diff --git a/default.nix b/default.nix index 69a7149..bf96eb8 100644 --- a/default.nix +++ b/default.nix @@ -1,12 +1,14 @@ { pkgs ? import <nixpkgs> { }, lint ? false }: pkgs.python3Packages.callPackage ({ lib, buildPythonPackage, autopep8, mypy - , pylint, requests, requests-cache, types-requests, xdg-base-dirs }: + , pylint, beautifulsoup4, requests, requests-cache, types-beautifulsoup4 + , types-requests, xdg-base-dirs }: buildPythonPackage rec { pname = "paperdoorknob"; version = "0.0.1"; src = lib.cleanSource ./.; - propagatedBuildInputs = [ requests requests-cache xdg-base-dirs ]; - nativeCheckInputs = [ mypy types-requests ] + propagatedBuildInputs = + [ beautifulsoup4 requests requests-cache xdg-base-dirs ]; + nativeCheckInputs = [ mypy types-beautifulsoup4 types-requests ] ++ lib.optionals lint [ autopep8 pylint ]; doCheck = true; checkPhase = "./test.sh"; diff --git a/paperdoorknob.py b/paperdoorknob.py index b7e4349..bb8bdd1 100644 --- a/paperdoorknob.py +++ b/paperdoorknob.py @@ -7,6 +7,7 @@ from argparse import ArgumentParser import os.path +from bs4 import BeautifulSoup import requests import requests_cache from xdg_base_dirs import xdg_cache_home @@ -27,9 +28,10 @@ def command_line_parser() -> ArgumentParser: return parser -def fetch(url: str, session: requests.Session, timeout: int) -> None: +def fetch(url: str, session: requests.Session, timeout: int) -> BeautifulSoup: with session.get(url, timeout=timeout) as r: r.raise_for_status() + return BeautifulSoup(r.text, 'html.parser') def main() -> None: diff --git a/paperdoorknob_test.py b/paperdoorknob_test.py index cb70ac5..ab13eed 100644 --- a/paperdoorknob_test.py +++ b/paperdoorknob_test.py @@ -81,6 +81,14 @@ class TestFetch(unittest.TestCase): 
paperdoorknob.fetch(f"http://localhost:{self._port()}", s, TIMEOUT) self.assertEqual(self._request_counter, 1) + def testFetchConents(self) -> None: + with requests.session() as s: + doc = paperdoorknob.fetch( + f"http://localhost:{self._port()}", s, TIMEOUT) + body = doc.body + assert body + self.assertEqual(body.text, "This is glowfic") + def testFetchErrors(self) -> None: with requests.session() as s: with self.assertRaises(requests.HTTPError):