@@ -0,0 +1,337 @@
+"""Abstract spider interface for Databank (no concrete spiders here).
+
+Contract (must implement):
+- build_payload(url): pure builder for request parameters/headers/body.
+- fetch(url, payload): perform retrieval and return raw textual content.
+- parse(url, content, payload): transform raw content into a sequence of Documents.
+
+Extension points (optional to override):
+- on_run_start / on_run_end: hooks at the boundaries of a run.
+- should_fetch: per-URL gate to skip fetching (e.g., dedup or robots checks).
+- before_fetch / after_fetch: lifecycle hooks around network retrieval.
+- transform: post-parse normalization/filtering step over parsed documents.
+- handle_error: customize error handling per-URL (default: re-raise).
+- configure/metadata: provide generic configuration and metadata.
+- close: release resources (sessions, files, etc.); context-manager friendly.
+
+Notes:
+- This module intentionally contains no concrete spider implementation.
+- The provided class attributes (timeouts, retries, etc.) are advisory hints;
+ concrete implementations may honor them but this base does not enforce behavior.
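+
+Example (a minimal sketch, not part of this module): a concrete spider built
+on the third-party ``requests`` library, which is assumed to be available and
+is not implied as a dependency of this package. ``Document``'s constructor
+fields live in ``databank.core.models`` and are elided here::
+
+    import requests
+
+    class HttpJsonSpider(BaseSpider):
+        name = "http-json"
+
+        def build_payload(self, url: URL) -> Payload:
+            return {"headers": {"Accept": "application/json"}}
+
+        def fetch(self, url: URL, payload: Payload) -> str:
+            try:
+                resp = requests.get(url, headers=payload["headers"], timeout=10)
+                resp.raise_for_status()
+            except requests.RequestException as exc:
+                raise FetchError(f"GET {url} failed") from exc
+            return resp.text
+
+        def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+            return [Document(...)]  # construct Document(s) from ``content``
+
+Intended usage (with the hypothetical subclass above)::
+
+    with HttpJsonSpider() as spider:
+        docs = spider.run(["https://example.org/a", "https://example.org/b"])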
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from types import TracebackType
+from typing import Any, Iterable, Mapping, Optional, Sequence
+
+from databank.core.models import Document
+
+# Type aliases for clarity
+URL = str
+Payload = Mapping[str, Any]
+Documents = Sequence[Document]
+
+
+# Error hierarchy (implementations may raise these for clarity)
+class SpiderError(Exception):
+    """Base error for spider operations."""
+
+
+class BuildPayloadError(SpiderError):
+    """Raised when building the request payload fails for a URL."""
+
+
+class FetchError(SpiderError):
+    """Raised when fetching raw content for a URL fails."""
+
+
+class ParseError(SpiderError):
+    """Raised when parsing raw content into documents fails."""
+
+
+class BaseSpider(ABC):
+    """Abstract spider definition.
+
+    Attributes (advisory; implementations may choose to honor these):
+    - name: Identifier for this spider.
+    - max_retries: Suggested maximum retry attempts per URL (default 0).
+    - request_timeout_s: Suggested request timeout in seconds (None = no limit).
+    - rate_limit_per_sec: Suggested maximum requests per second (None = unlimited).
+    - default_headers: Suggested default HTTP headers mapping.
+    - logger: Optional logger for diagnostics.
+    - options: Arbitrary configuration bag provided at construction.
+    - metadata: Arbitrary, read-mostly metadata about this spider instance.
+    """
+
+    name: str = "base"
+    max_retries: int = 0
+    request_timeout_s: Optional[float] = None
+    rate_limit_per_sec: Optional[float] = None
+    default_headers: Optional[Mapping[str, str]] = None
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: Any
+    ) -> None:
+        """Initialize spider with optional logger and configuration options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary configuration bag for concrete implementation.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, Any] = dict(options)
+        self._metadata: dict[str, Any] = {}
+
+    @abstractmethod
+    def build_payload(self, url: URL) -> Payload:
+ """Build request params/body or headers from a URL.
|
|
|
|
|
+
+        Args:
+            url: Target URL to be fetched.
+
+        Returns:
+            A mapping of request parameters/headers/body to be used by ``fetch``.
+
+        Raises:
+            BuildPayloadError: If payload construction fails for ``url``.
+        """
+
+    @abstractmethod
+    def fetch(self, url: URL, payload: Payload) -> str:
+        """Fetch raw textual content for a URL using the given payload.
+
+        Args:
+            url: Target URL to fetch.
+            payload: Request parameters prepared by ``build_payload``.
+
+        Returns:
+            Raw textual content.
+
+        Raises:
+            FetchError: If retrieval fails for ``url``.
+        """
+
+    @abstractmethod
+    def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+        """Parse raw content into a sequence of Documents.
+
+        Args:
+            url: URL associated with the content.
+            content: Raw textual content fetched by ``fetch``.
+            payload: The payload used to fetch, for context if needed.
+
+        Returns:
+            A sequence of :class:`~databank.core.models.Document` instances.
+
+        Raises:
+            ParseError: If parsing fails for ``url``.
+        """
+
+    # ---- Optional lifecycle hooks (no-op by default) ----
+    def on_run_start(self, urls: Iterable[URL]) -> None:  # pragma: no cover
+        """Hook invoked once before processing a batch of URLs.
+
+        Args:
+            urls: Collection of URLs to be processed in this run.
+        """
+
+    def on_run_end(
+        self, urls: Iterable[URL], results: Sequence[Document], error_count: int
+    ) -> None:  # pragma: no cover
+        """Hook invoked once after processing a batch of URLs.
+
+        Args:
+            urls: The same collection passed to ``on_run_start``.
+            results: All successfully parsed documents.
+            error_count: Number of URLs that raised errors.
+        """
+
+    def should_fetch(self, _url: URL, _payload: Payload) -> bool:  # pragma: no cover
+        """Return False to skip fetching this URL (e.g., dedup, robots, filters).
+
+        Returns:
+ True to proceed with fetching; False to skip this URL.
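+
+        Example (sketch) of a simple in-memory dedup gate that stores seen
+        URLs in ``self.options`` (the key name is illustrative)::
+
+            def should_fetch(self, _url: URL, _payload: Payload) -> bool:
+                seen = self.options.setdefault("_seen_urls", set())
+                if _url in seen:
+                    return False
+                seen.add(_url)
+                return True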
+        """
+        return True
+
+    def before_fetch(self, url: URL, payload: Payload) -> None:  # pragma: no cover
+        """Hook invoked before fetch; override for logging/metrics/rate-limit.
+
+        Args:
+            url: URL to fetch.
+ payload: Request parameters prepared by ``build_payload``.
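+
+        Example (sketch) of naive client-side rate limiting driven by the
+        advisory ``rate_limit_per_sec`` attribute (assumes ``import time``)::
+
+            def before_fetch(self, url: URL, payload: Payload) -> None:
+                if self.rate_limit_per_sec:
+                    time.sleep(1.0 / self.rate_limit_per_sec)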
+        """
+
+    def after_fetch(
+        self, url: URL, payload: Payload, content: str
+    ) -> None:  # pragma: no cover
+        """Hook invoked after fetch; override for logging/metrics/tracing.
+
+        Args:
+            url: URL fetched.
+            payload: Request parameters used to fetch.
+            content: Raw textual content returned by ``fetch``.
+        """
+
+    def handle_error(
+        self, url: URL, payload: Payload, exc: Exception
+    ) -> None:  # pragma: no cover
+        """Handle per-URL errors; default behavior re-raises the exception.
+
+        Implementations may log, collect metrics, or convert exceptions.
+
+        Args:
+            url: URL whose processing failed.
+            payload: Payload built for this URL.
+ exc: Original exception raised.
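+
+        Example (sketch) of a log-and-continue override using the instance
+        ``logger``::
+
+            def handle_error(self, url: URL, payload: Payload, exc: Exception) -> None:
+                if self.logger:
+                    self.logger.warning("skipping %s: %s", url, exc)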
+        """
+        raise exc
+
+    def transform(self, _url: URL, docs: Documents) -> Documents:  # pragma: no cover
+        """Post-parse transformation/normalization stage; default is identity.
+
+        Args:
+            _url: URL associated with ``docs``.
+            docs: Parsed documents.
+
+        Returns:
+ Possibly modified documents (default: unchanged).
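+
+        Example (sketch) that drops blank documents; it assumes ``Document``
+        exposes a ``content`` attribute, which may not match the real model::
+
+            def transform(self, _url: URL, docs: Documents) -> Documents:
+                return [doc for doc in docs if getattr(doc, "content", None)]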
+        """
+        return docs
+
+    def run(self, urls: Iterable[URL]) -> list[Document]:
+        """Reference orchestration: build -> fetch -> parse.
+
+        Steps:
+        1) ``build_payload`` per URL.
+        2) ``should_fetch`` gate.
+        3) ``before_fetch`` -> ``fetch`` -> ``after_fetch``.
+        4) ``parse`` -> ``transform``.
+
+        Per-URL exceptions are routed through ``handle_error``; with the
+        default re-raise behavior the first error aborts the run.
+
+        Implementations may override for concurrency, caching, or tracing.
+
+        Args:
+            urls: URLs to process.
+
+        Returns:
+            A list of parsed documents across all URLs.
+        """
+        results: list[Document] = []
+        urls_seq = tuple(urls)
+        self.on_run_start(urls_seq)
+        error_count = 0
+        for url in urls_seq:
+            payload: Payload = {}
+            try:
+                payload = self.build_payload(url)
+                if not self.should_fetch(url, payload):
+                    continue
+                self.before_fetch(url, payload)
+                raw = self.fetch(url, payload)
+                self.after_fetch(url, payload, raw)
+                docs = self.parse(url, raw, payload)
+                docs = self.transform(url, docs)
+            except Exception as exc:  # pylint: disable=broad-except
+                self.handle_error(url, payload, exc)
+                error_count += 1
+                continue
+            results.extend(docs)
+        self.on_run_end(urls_seq, results, error_count)
+        return results
+
+    # ---- Resource management ----
+    def close(self) -> None:  # pragma: no cover
+        """Release resources (e.g., network sessions)."""
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: Any) -> None:  # pragma: no cover
+        """Update generic configuration options for this spider instance."""
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, Any]:  # pragma: no cover
+        """Read-only snapshot (copy) of spider metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: Any) -> None:  # pragma: no cover
+        """Update spider metadata."""
+        self._metadata.update(meta)
+
+    # Context manager support
+    def __enter__(self) -> "BaseSpider":  # pragma: no cover
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ) -> bool:  # pragma: no cover
+        self.close()
+        # Do not suppress exceptions
+ return False