|
|
@@ -29,7 +29,7 @@ from typing import Any, Iterable, Mapping, Sequence, Optional
|
|
|
from databank.core.models import Document
|
|
|
|
|
|
# Type aliases for clarity
|
|
|
-URL = str
|
|
|
+Task = Any # A structured task object (e.g., a dataclass) provided by the runner
|
|
|
Payload = Mapping[str, Any]
|
|
|
Documents = Sequence[Document]
|
|
|
|
|
|
@@ -40,11 +40,11 @@ class SpiderError(Exception):
|
|
|
|
|
|
|
|
|
class BuildPayloadError(SpiderError):
|
|
|
- """Raised when building the request payload fails for a URL."""
|
|
|
+ """Raised when building the request payload fails for a task."""
|
|
|
|
|
|
|
|
|
class FetchError(SpiderError):
|
|
|
- """Raised when fetching raw content for a URL fails."""
|
|
|
+ """Raised when fetching raw content for a task fails."""
|
|
|
|
|
|
|
|
|
class ParseError(SpiderError):
|
|
|
@@ -85,40 +85,40 @@ class BaseSpider(ABC):
|
|
|
self._metadata: dict[str, Any] = {}
|
|
|
|
|
|
@abstractmethod
|
|
|
- def build_payload(self, url: URL) -> Payload:
|
|
|
- """Build request params/body or headers from a URL.
|
|
|
+ def build_payload(self, task: Task) -> Payload:
|
|
|
+ """Build request params/body or headers from a structured task.
|
|
|
|
|
|
Args:
|
|
|
- url: Target URL to be fetched.
|
|
|
+ task: A structured input (e.g., a dataclass) describing the job.
|
|
|
|
|
|
Returns:
|
|
|
A mapping of request parameters/headers/body to be used by ``fetch``.
|
|
|
|
|
|
Raises:
|
|
|
- BuildPayloadError: If payload construction fails for ``url``.
|
|
|
+ BuildPayloadError: If payload construction fails for ``task``.
|
|
|
"""
|
|
|
|
|
|
@abstractmethod
|
|
|
- def fetch(self, url: URL, payload: Payload) -> str:
|
|
|
- """Fetch raw textual content for a URL using the given payload.
|
|
|
+ def fetch(self, task: Task, payload: Payload) -> str:
|
|
|
+ """Fetch raw textual content for a task using the given payload.
|
|
|
|
|
|
Args:
|
|
|
- url: Target URL to fetch.
|
|
|
+ task: Structured task to fetch.
|
|
|
payload: Request parameters prepared by ``build_payload``.
|
|
|
|
|
|
Returns:
|
|
|
Raw textual content.
|
|
|
|
|
|
Raises:
|
|
|
- FetchError: If retrieval fails for ``url``.
|
|
|
+ FetchError: If retrieval fails for ``task``.
|
|
|
"""
|
|
|
|
|
|
@abstractmethod
|
|
|
- def parse(self, url: URL, content: str, payload: Payload) -> Documents:
|
|
|
+ def parse(self, task: Task, content: str, payload: Payload) -> Documents:
|
|
|
"""Parse raw content into a sequence of Documents.
|
|
|
|
|
|
Args:
|
|
|
- url: URL associated with the content.
|
|
|
+ task: Task associated with the content.
|
|
|
content: Raw textual content fetched by ``fetch``.
|
|
|
payload: The payload used to fetch, for context if needed.
|
|
|
|
|
|
@@ -126,29 +126,29 @@ class BaseSpider(ABC):
|
|
|
A sequence of :class:`~databank.core.models.Document` instances.
|
|
|
|
|
|
Raises:
|
|
|
- ParseError: If parsing fails for ``url``.
|
|
|
+ ParseError: If parsing fails for ``task``.
|
|
|
"""
|
|
|
|
|
|
# ---- Optional lifecycle hooks (no-op by default) ----
|
|
|
- def on_run_start(self, urls: Iterable[URL]) -> None: # pragma: no cover
|
|
|
+ def on_run_start(self, tasks: Iterable[Task]) -> None: # pragma: no cover
|
|
|
-        """Hook invoked once before processing a batch of URLs.
+        """Hook invoked once before processing a batch of tasks.
|
|
|
|
|
|
Args:
|
|
|
- urls: Collection of URLs to be processed in this run.
|
|
|
+ tasks: Collection of structured tasks to be processed in this run.
|
|
|
"""
|
|
|
|
|
|
def on_run_end(
|
|
|
- self, urls: Iterable[URL], results: Sequence[Document], error_count: int
|
|
|
+ self, tasks: Iterable[Task], results: Sequence[Document], error_count: int
|
|
|
) -> None: # pragma: no cover
|
|
|
-        """Hook invoked once after processing a batch of URLs.
+        """Hook invoked once after processing a batch of tasks.
|
|
|
|
|
|
Args:
|
|
|
- urls: The same collection passed to ``on_run_start``.
|
|
|
+ tasks: The same collection passed to ``on_run_start``.
|
|
|
results: All successfully parsed documents.
|
|
|
-            error_count: Number of URLs that raised errors.
+            error_count: Number of tasks that raised errors.
|
|
|
"""
|
|
|
|
|
|
- def should_fetch(self, _url: URL, _payload: Payload) -> bool: # pragma: no cover
|
|
|
+ def should_fetch(self, _task: Task, _payload: Payload) -> bool: # pragma: no cover
|
|
|
-        """Return False to skip fetching this URL (e.g., dedup, robots, filters).
+        """Return False to skip fetching this task (e.g., dedup, robots, filters).
|
|
|
|
|
|
Returns:
|
|
|
@@ -156,44 +156,44 @@ class BaseSpider(ABC):
|
|
|
"""
|
|
|
return True
|
|
|
|
|
|
- def before_fetch(self, url: URL, payload: Payload) -> None: # pragma: no cover
|
|
|
+ def before_fetch(self, task: Task, payload: Payload) -> None: # pragma: no cover
|
|
|
"""Hook invoked before fetch; override for logging/metrics/rate-limit.
|
|
|
|
|
|
Args:
|
|
|
- url: URL to fetch.
|
|
|
+ task: Task to fetch.
|
|
|
payload: Request parameters prepared by ``build_payload``.
|
|
|
"""
|
|
|
|
|
|
def after_fetch(
|
|
|
- self, url: URL, payload: Payload, content: str
|
|
|
+ self, task: Task, payload: Payload, content: str
|
|
|
) -> None: # pragma: no cover
|
|
|
"""Hook invoked after fetch; override for logging/metrics/tracing.
|
|
|
|
|
|
Args:
|
|
|
- url: URL fetched.
|
|
|
+ task: Task fetched.
|
|
|
payload: Request parameters used to fetch.
|
|
|
content: Raw textual content returned by ``fetch``.
|
|
|
"""
|
|
|
|
|
|
def handle_error(
|
|
|
- self, url: URL, payload: Payload, exc: Exception
|
|
|
+ self, task: Task, payload: Payload, exc: Exception
|
|
|
) -> None: # pragma: no cover
|
|
|
-        """Handle per-URL errors; default behavior re-raises the exception.
+        """Handle per-task errors; default behavior re-raises the exception.
|
|
|
|
|
|
Implementations may log, collect metrics, or convert exceptions.
|
|
|
|
|
|
Args:
|
|
|
- url: URL whose processing failed.
|
|
|
+ task: Task whose processing failed.
|
|
|
-            payload: Payload built for this URL.
+            payload: Payload built for this task.
|
|
|
exc: Original exception raised.
|
|
|
"""
|
|
|
raise exc
|
|
|
|
|
|
- def transform(self, _url: URL, docs: Documents) -> Documents: # pragma: no cover
|
|
|
+ def transform(self, _task: Task, docs: Documents) -> Documents: # pragma: no cover
|
|
|
"""Post-parse transformation/normalization stage; default is identity.
|
|
|
|
|
|
Args:
|
|
|
- _url: URL associated with ``docs``.
|
|
|
+ _task: Task associated with ``docs``.
|
|
|
docs: Parsed documents.
|
|
|
|
|
|
Returns:
|
|
|
@@ -201,11 +201,11 @@ class BaseSpider(ABC):
|
|
|
"""
|
|
|
return docs
|
|
|
|
|
|
- def run(self, urls: Iterable[URL]) -> list[Document]:
|
|
|
+ def run(self, tasks: Iterable[Task]) -> list[Document]:
|
|
|
"""Reference orchestration: build -> fetch -> parse.
|
|
|
|
|
|
Steps:
|
|
|
- 1) ``build_payload`` per URL.
|
|
|
+ 1) ``build_payload`` per task.
|
|
|
2) ``should_fetch`` gate.
|
|
|
3) ``before_fetch`` -> ``fetch`` -> ``after_fetch``.
|
|
|
4) ``parse`` -> ``transform``.
|
|
|
@@ -213,31 +213,31 @@ class BaseSpider(ABC):
|
|
|
Implementations may override for concurrency, caching, or tracing.
|
|
|
|
|
|
Args:
|
|
|
- urls: URLs to process.
|
|
|
+ tasks: Structured tasks to process.
|
|
|
|
|
|
Returns:
|
|
|
-            A list of parsed documents across all URLs.
+            A list of parsed documents across all tasks.
|
|
|
"""
|
|
|
results: list[Document] = []
|
|
|
- urls_seq = tuple(urls)
|
|
|
- self.on_run_start(urls_seq)
|
|
|
+ tasks_seq = tuple(tasks)
|
|
|
+ self.on_run_start(tasks_seq)
|
|
|
error_count = 0
|
|
|
- for url in urls_seq:
|
|
|
- payload = self.build_payload(url)
|
|
|
- if not self.should_fetch(url, payload):
|
|
|
+ for task in tasks_seq:
|
|
|
+ payload = self.build_payload(task)
|
|
|
+ if not self.should_fetch(task, payload):
|
|
|
continue
|
|
|
- self.before_fetch(url, payload)
|
|
|
+ self.before_fetch(task, payload)
|
|
|
try:
|
|
|
- raw = self.fetch(url, payload)
|
|
|
- self.after_fetch(url, payload, raw)
|
|
|
- docs = self.parse(url, raw, payload)
|
|
|
- docs = self.transform(url, docs)
|
|
|
+ raw = self.fetch(task, payload)
|
|
|
+ self.after_fetch(task, payload, raw)
|
|
|
+ docs = self.parse(task, raw, payload)
|
|
|
+ docs = self.transform(task, docs)
|
|
|
except Exception as exc: # pylint: disable=broad-except
|
|
|
- self.handle_error(url, payload, exc)
|
|
|
+ self.handle_error(task, payload, exc)
|
|
|
error_count += 1
|
|
|
continue
|
|
|
results.extend(docs)
|
|
|
- self.on_run_end(urls_seq, results, error_count)
|
|
|
+ self.on_run_end(tasks_seq, results, error_count)
|
|
|
return results
|
|
|
|
|
|
# ---- Resource management ----
|