git.scottworley.com Git - paperdoorknob/blobdiff - fetch.py
Progress indicator
[paperdoorknob] / fetch.py
index 7776f93085943ef8f04948858eeadb0d69323d86..e998394445196fcf677766a78cd3805d6b241bd1 100644 (file)
--- a/fetch.py
+++ b/fetch.py
@@ -7,11 +7,18 @@
 
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Iterator
+from sys import stderr
+from typing import IO, Iterator
 
 import requests
 import requests_cache
 
+from version import paperdoorknob_version
+
+
+_headers = {'User-Agent': f'paperdoorknob/{paperdoorknob_version} ' +
+            '(https://git.scottworley.com/paperdoorknob/)'}
+
 
 class Fetcher(ABC):
     @abstractmethod
@@ -26,10 +33,37 @@ class _SessionFetcher(Fetcher):
         self._timeout = timeout
 
     def fetch(self, url: str) -> bytes:
-        with self._session.get(url, timeout=self._timeout) as r:
+        with self._session.get(url, timeout=self._timeout, headers=_headers) as r:
+            r.raise_for_status()
+            return r.content
+
+
+class _CachingFetcher(Fetcher):
+
+    def __init__(
+            self,
+            session: requests_cache.CachedSession,
+            timeout: int) -> None:
+        self._session = session
+        self._timeout = timeout
+        self._request_count = 0
+        self._cache_hit_count = 0
+
+    def fetch(self, url: str) -> bytes:
+        with self._session.get(url, timeout=self._timeout, headers=_headers) as r:
             r.raise_for_status()
+            self._request_count += 1
+            self._cache_hit_count += int(r.from_cache)
             return r.content
 
+    @property
+    def request_count(self) -> int:
+        return self._request_count
+
+    @property
+    def cache_hit_count(self) -> int:
+        return self._cache_hit_count
+
 
 @contextmanager
 def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]:
@@ -38,17 +72,31 @@ def DirectFetcher(timeout: int) -> Iterator[_SessionFetcher]:
 
 
 @contextmanager
-def CachingFetcher(cache_path: str, timeout: int) -> Iterator[_SessionFetcher]:
+def CachingFetcher(
+        cache_path: str,
+        timeout: int,
+        report_stream: IO[str] = stderr) -> Iterator[_CachingFetcher]:
     with requests_cache.CachedSession(cache_path, cache_control=True) as session:
-        yield _SessionFetcher(session, timeout)
+        fetcher = _CachingFetcher(session, timeout)
+        yield fetcher
+        if fetcher.request_count > 0:
+            percent = 100.0 * fetcher.cache_hit_count / fetcher.request_count
+            print(
+                f"Fetch cache hits: {fetcher.cache_hit_count} ({percent:.1f}%)",
+                file=report_stream)
 
 
 class FakeFetcher(Fetcher):
 
     def __init__(self, resources: dict[str, bytes]) -> None:
         self._resources = resources
+        self._fetch_count = 0
 
     def fetch(self, url: str) -> bytes:
+        self._fetch_count += 1
         if url not in self._resources:
-            raise requests.HTTPError("URL not found")
+            raise requests.HTTPError("URL not found", url)
         return self._resources[url]
+
+    def request_count(self) -> int:
+        return self._fetch_count