X-Git-Url: http://git.scottworley.com/paperdoorknob/blobdiff_plain/705973e749e5a9da0453508dbd23de434a0fdc65..24b87badcef35a53477aec067ee61cc9dd97b2a5:/fetch.py?ds=inline

diff --git a/fetch.py b/fetch.py
index 3c0c6a3..eb904a4 100644
--- a/fetch.py
+++ b/fetch.py
@@ -7,11 +7,18 @@
 
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Iterator
+from sys import stderr
+from typing import Callable, Iterator
 
 import requests
 import requests_cache
 
+from version import paperdoorknob_version
+
+
+_headers = {'User-Agent': f'paperdoorknob/{paperdoorknob_version} ' +
+            '(https://git.scottworley.com/paperdoorknob/)'}
+
 
 class Fetcher(ABC):
     @abstractmethod
@@ -26,11 +33,38 @@ class _SessionFetcher(Fetcher):
         self._timeout = timeout
 
     def fetch(self, url: str) -> bytes:
-        with self._session.get(url, timeout=self._timeout) as r:
+        with self._session.get(url, timeout=self._timeout, headers=_headers) as r:
             r.raise_for_status()
             return r.content
 
 
+class _CachingFetcher(Fetcher):
+
+    def __init__(
+            self,
+            session: requests_cache.CachedSession,
+            timeout: int) -> None:
+        self._session = session
+        self._timeout = timeout
+        self._request_count = 0
+        self._cache_hit_count = 0
+
+    def fetch(self, url: str) -> bytes:
+        with self._session.get(url, timeout=self._timeout, headers=_headers) as r:
+            r.raise_for_status()
+            self._request_count += 1
+            self._cache_hit_count += int(r.from_cache)
+            return r.content
+
+    @property
+    def request_count(self) -> int:
+        return self._request_count
+
+    @property
+    def cache_hit_count(self) -> int:
+        return self._cache_hit_count
+
+
 @contextmanager
 def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]:
     with requests.session() as session:
@@ -38,6 +72,30 @@ def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]:
 
 
 @contextmanager
-def CachingFetcher(cache_path: str, timeout: int) -> Iterator[_SessionFetcher]:
+def CachingFetcher(
+    cache_path: str,
+    timeout: int,
+    log: Callable[[str], None] = lambda x: print(x, file=stderr),
+) -> Iterator[_CachingFetcher]:
     with requests_cache.CachedSession(cache_path, cache_control=True) as session:
-        yield _SessionFetcher(session, timeout)
+        fetcher = _CachingFetcher(session, timeout)
+        yield fetcher
+        if fetcher.request_count > 0:
+            percent = 100.0 * fetcher.cache_hit_count / fetcher.request_count
+            log(f"Fetch cache hits: {fetcher.cache_hit_count} ({percent:.1f}%)")
+
+
+class FakeFetcher(Fetcher):
+
+    def __init__(self, resources: dict[str, bytes]) -> None:
+        self._resources = resources
+        self._fetch_count = 0
+
+    def fetch(self, url: str) -> bytes:
+        self._fetch_count += 1
+        if url not in self._resources:
+            raise requests.HTTPError("URL not found", url)
+        return self._resources[url]
+
+    def request_count(self) -> int:
+        return self._fetch_count