@@ -0,0 +1,337 @@
+"""Abstract spider interface for Databank (no concrete spiders here).
+
+Contract (must implement):
+- build_payload(url): pure builder for request parameters/headers/body.
+- fetch(url, payload): perform retrieval and return raw textual content.
+- parse(url, content, payload): transform raw content into a sequence of Documents.
+
+Extension points (optional to override):
+- on_run_start / on_run_end: hooks at the boundaries of a run.
+- should_fetch: per-URL gate to skip fetching (e.g., dedup or robots checks).
+- before_fetch / after_fetch: lifecycle hooks around network retrieval.
+- transform: post-parse normalization/filtering step over parsed documents.
+- handle_error: customize error handling per-URL (default: re-raise).
+- configure/metadata: provide generic configuration and metadata.
+- close: release resources (sessions, files, etc.); context-manager friendly.
+
+Notes:
+- This module intentionally contains no concrete spider implementation.
+- The provided class attributes (timeouts, retries, etc.) are advisory hints;
+ concrete implementations may honor them but this base does not enforce behavior.
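+
+Example (a minimal sketch, not part of this module): a concrete spider built
+on the third-party ``requests`` library, which is assumed to be available and
+is not implied as a dependency of this package. ``Document``'s constructor
+fields live in ``databank.core.models`` and are elided here::
+
+    import requests
+
+    class HttpJsonSpider(BaseSpider):
+        name = "http-json"
+
+        def build_payload(self, url: URL) -> Payload:
+            return {"headers": {"Accept": "application/json"}}
+
+        def fetch(self, url: URL, payload: Payload) -> str:
+            try:
+                resp = requests.get(url, headers=payload["headers"], timeout=10)
+                resp.raise_for_status()
+            except requests.RequestException as exc:
+                raise FetchError(f"GET {url} failed") from exc
+            return resp.text
+
+        def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+            return [Document(...)]  # construct Document(s) from ``content``
+
+Intended usage (with the hypothetical subclass above)::
+
+    with HttpJsonSpider() as spider:
+        docs = spider.run(["https://example.org/a", "https://example.org/b"])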
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from types import TracebackType
+from typing import Any, Iterable, Mapping, Optional, Sequence
+
+from databank.core.models import Document
+
+# Type aliases for clarity
+URL = str
+Payload = Mapping[str, Any]
+Documents = Sequence[Document]
+
+
+# Error hierarchy (implementations may raise these for clarity)
+class SpiderError(Exception):
+    """Base error for spider operations."""
+
+
+class BuildPayloadError(SpiderError):
+    """Raised when building the request payload fails for a URL."""
+
+
+class FetchError(SpiderError):
+    """Raised when fetching raw content for a URL fails."""
+
+
+class ParseError(SpiderError):
+    """Raised when parsing raw content into documents fails."""
+
+
+class BaseSpider(ABC):
+    """Abstract spider definition.
+
+    Attributes (advisory; implementations may choose to honor these):
+    - name: Identifier for this spider.
+    - max_retries: Suggested maximum retry attempts per URL (default 0).
+    - request_timeout_s: Suggested request timeout in seconds (None = no limit).
+    - rate_limit_per_sec: Suggested maximum requests per second (None = unlimited).
+    - default_headers: Suggested default HTTP headers mapping.
+    - logger: Optional logger for diagnostics.
+    - options: Arbitrary configuration bag provided at construction.
+    - metadata: Arbitrary, read-mostly metadata about this spider instance.
+    """
+
+    name: str = "base"
+    max_retries: int = 0
+    request_timeout_s: Optional[float] = None
+    rate_limit_per_sec: Optional[float] = None
+    default_headers: Optional[Mapping[str, str]] = None
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: Any
+    ) -> None:
+        """Initialize spider with optional logger and configuration options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary configuration bag for concrete implementation.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, Any] = dict(options)
+        self._metadata: dict[str, Any] = {}
+
+    @abstractmethod
+    def build_payload(self, url: URL) -> Payload:
+ """Build request params/body or headers from a URL.
|
|
|
|
|
+
+        Args:
+            url: Target URL to be fetched.
+
+        Returns:
+            A mapping of request parameters/headers/body to be used by ``fetch``.
+
+        Raises:
+            BuildPayloadError: If payload construction fails for ``url``.
+        """
+
+    @abstractmethod
+    def fetch(self, url: URL, payload: Payload) -> str:
+        """Fetch raw textual content for a URL using the given payload.
+
+        Args:
+            url: Target URL to fetch.
+            payload: Request parameters prepared by ``build_payload``.
+
+        Returns:
+            Raw textual content.
+
+        Raises:
+            FetchError: If retrieval fails for ``url``.
+        """
+
+    @abstractmethod
+    def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+        """Parse raw content into a sequence of Documents.
+
+        Args:
+            url: URL associated with the content.
+            content: Raw textual content fetched by ``fetch``.
+            payload: The payload used to fetch, for context if needed.
+
+        Returns:
+            A sequence of :class:`~databank.core.models.Document` instances.
+
+        Raises:
+            ParseError: If parsing fails for ``url``.
+        """
+
+    # ---- Optional lifecycle hooks (no-op by default) ----
+    def on_run_start(self, urls: Iterable[URL]) -> None:  # pragma: no cover
+        """Hook invoked once before processing a batch of URLs.
+
+        Args:
+            urls: Collection of URLs to be processed in this run.
+        """
+
+    def on_run_end(
+        self, urls: Iterable[URL], results: Sequence[Document], error_count: int
+    ) -> None:  # pragma: no cover
+        """Hook invoked once after processing a batch of URLs.
+
+        Args:
+            urls: The same collection passed to ``on_run_start``.
+            results: All successfully parsed documents.
+            error_count: Number of URLs that raised errors.
+        """
+
+    def should_fetch(self, _url: URL, _payload: Payload) -> bool:  # pragma: no cover
+        """Return False to skip fetching this URL (e.g., dedup, robots, filters).
+
+        Returns:
+ True to proceed with fetching; False to skip this URL.
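+
+        Example (sketch) of a simple in-memory dedup gate that stores seen
+        URLs in ``self.options`` (the key name is illustrative)::
+
+            def should_fetch(self, _url: URL, _payload: Payload) -> bool:
+                seen = self.options.setdefault("_seen_urls", set())
+                if _url in seen:
+                    return False
+                seen.add(_url)
+                return True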
+        """
+        return True
+
+    def before_fetch(self, url: URL, payload: Payload) -> None:  # pragma: no cover
+        """Hook invoked before fetch; override for logging/metrics/rate-limit.
+
+        Args:
+            url: URL to fetch.
+ payload: Request parameters prepared by ``build_payload``.
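+
+        Example (sketch) of naive client-side rate limiting driven by the
+        advisory ``rate_limit_per_sec`` attribute (assumes ``import time``)::
+
+            def before_fetch(self, url: URL, payload: Payload) -> None:
+                if self.rate_limit_per_sec:
+                    time.sleep(1.0 / self.rate_limit_per_sec)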
+        """
+
+    def after_fetch(
+        self, url: URL, payload: Payload, content: str
+    ) -> None:  # pragma: no cover
+        """Hook invoked after fetch; override for logging/metrics/tracing.
+
+        Args:
+            url: URL fetched.
+            payload: Request parameters used to fetch.
+            content: Raw textual content returned by ``fetch``.
+        """
+
+    def handle_error(
+        self, url: URL, payload: Payload, exc: Exception
+    ) -> None:  # pragma: no cover
+        """Handle per-URL errors; default behavior re-raises the exception.
+
+        Implementations may log, collect metrics, or convert exceptions.
+
+        Args:
+            url: URL whose processing failed.
+            payload: Payload built for this URL.
+ exc: Original exception raised.
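+
+        Example (sketch) of a log-and-continue override using the instance
+        ``logger``::
+
+            def handle_error(self, url: URL, payload: Payload, exc: Exception) -> None:
+                if self.logger:
+                    self.logger.warning("skipping %s: %s", url, exc)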
+        """
+        raise exc
+
+    def transform(self, _url: URL, docs: Documents) -> Documents:  # pragma: no cover
+        """Post-parse transformation/normalization stage; default is identity.
+
+        Args:
+            _url: URL associated with ``docs``.
+            docs: Parsed documents.
+
+        Returns:
+ Possibly modified documents (default: unchanged).
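+
+        Example (sketch) that drops blank documents; it assumes ``Document``
+        exposes a ``content`` attribute, which may not match the real model::
+
+            def transform(self, _url: URL, docs: Documents) -> Documents:
+                return [doc for doc in docs if getattr(doc, "content", None)]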
+        """
+        return docs
+
+    def run(self, urls: Iterable[URL]) -> list[Document]:
+        """Reference orchestration: build -> fetch -> parse.
+
+        Steps:
+        1) ``build_payload`` per URL.
+        2) ``should_fetch`` gate.
+        3) ``before_fetch`` -> ``fetch`` -> ``after_fetch``.
+        4) ``parse`` -> ``transform``.
+
+        Per-URL exceptions are routed through ``handle_error``; with the
+        default re-raise behavior the first error aborts the run.
+
+        Implementations may override for concurrency, caching, or tracing.
+
+        Args:
+            urls: URLs to process.
+
+        Returns:
+            A list of parsed documents across all URLs.
+        """
+        results: list[Document] = []
+        urls_seq = tuple(urls)
+        self.on_run_start(urls_seq)
+        error_count = 0
+        for url in urls_seq:
+            payload: Payload = {}
+            try:
+                payload = self.build_payload(url)
+                if not self.should_fetch(url, payload):
+                    continue
+                self.before_fetch(url, payload)
+                raw = self.fetch(url, payload)
+                self.after_fetch(url, payload, raw)
+                docs = self.parse(url, raw, payload)
+                docs = self.transform(url, docs)
+            except Exception as exc:  # pylint: disable=broad-except
+                self.handle_error(url, payload, exc)
+                error_count += 1
+                continue
+            results.extend(docs)
+        self.on_run_end(urls_seq, results, error_count)
+        return results
+
+    # ---- Resource management ----
+    def close(self) -> None:  # pragma: no cover
+        """Release resources (e.g., network sessions)."""
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: Any) -> None:  # pragma: no cover
+        """Update generic configuration options for this spider instance."""
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, Any]:  # pragma: no cover
+        """Read-only snapshot (copy) of spider metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: Any) -> None:  # pragma: no cover
+        """Update spider metadata."""
+        self._metadata.update(meta)
+
+    # Context manager support
+    def __enter__(self) -> "BaseSpider":  # pragma: no cover
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ) -> bool:  # pragma: no cover
+        self.close()
+        # Do not suppress exceptions
+ return False