
Add the initial structure for the Databank project, including abstract classes and core models; configure the systemd service and timer; update project metadata and documentation.

admin 3 months ago
parent commit db7d006eb1

+ 61 - 2
README.md

@@ -1,3 +1,62 @@
-# databank
+# Databank (Abstract Skeleton)
 
-Premier League football data warehouse
+This repository is a pure abstract skeleton intended to define stable contracts
+for a multi-spider data pipeline. It deliberately contains only abstract/base
+classes and core models, without any concrete implementations.
+
+Key modules (all abstract-only):
+- spiders: `BaseSpider` with clear lifecycle hooks and advisory attributes.
+- db: `BaseDB` defining minimal persistence operations and hooks.
+- reporter: `BaseReporter` defining reporting lifecycle.
+- scheduler: `RunnerBase` and `SchedulerBase` for coordination/scheduling.
+- analytics: `AnalyticsBase` for generic analytics pipelines.
+
+Guidelines for extending:
+- Implementations MUST live in your own packages/modules and import these bases.
+- Do NOT modify the base interfaces unless you intend a breaking change.
+- Prefer composition and dependency injection over hard-coding dependencies.
+
+Implementing a spider (outline only):
+1. Subclass `BaseSpider` and implement:
+	- `build_payload(url) -> Payload`
+	- `fetch(url, payload) -> str`
+	- `parse(url, content, payload) -> Documents`
+2. Optionally override hooks:
+	- `on_run_start/on_run_end`, `should_fetch`, `before_fetch/after_fetch`, `transform`, `handle_error`, `close`.
+3. Optionally honor advisory attributes like `max_retries`, `request_timeout_s`.
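As a non-normative sketch of the steps above: a hypothetical `DemoSpider` implementing the three required methods. The base class is inlined here in trimmed form so the snippet stands alone (the real contract lives in `src/databank/spiders/base.py`; all names and the URL are illustrative).

```python
from abc import ABC, abstractmethod
from typing import Any, Mapping, Sequence


class BaseSpider(ABC):
    """Trimmed stand-in for databank.spiders.base.BaseSpider (illustration only)."""

    name: str = "base"
    max_retries: int = 0  # advisory attribute

    @abstractmethod
    def build_payload(self, url: str) -> Mapping[str, Any]: ...

    @abstractmethod
    def fetch(self, url: str, payload: Mapping[str, Any]) -> str: ...

    @abstractmethod
    def parse(self, url: str, content: str, payload: Mapping[str, Any]) -> Sequence[dict]: ...


class DemoSpider(BaseSpider):
    """Hypothetical spider: returns canned content instead of real HTTP."""

    name = "demo"
    max_retries = 2  # honoring the advisory attribute is up to the implementation

    def build_payload(self, url: str) -> Mapping[str, Any]:
        return {"headers": {"User-Agent": "databank-demo"}}

    def fetch(self, url: str, payload: Mapping[str, Any]) -> str:
        # A real implementation would issue an HTTP request here.
        return f"content-for:{url}"

    def parse(self, url: str, content: str, payload: Mapping[str, Any]) -> Sequence[dict]:
        # A real implementation would build databank.core.models.Document objects.
        return [{"kind": "page", "url": url, "raw": content}]


spider = DemoSpider()
payload = spider.build_payload("https://example.com/a")
docs = spider.parse("https://example.com/a",
                    spider.fetch("https://example.com/a", payload), payload)
```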
+
+Implementing a DB backend (outline only):
+1. Subclass `BaseDB` and implement: `connect`, `ensure_indexes`, `insert_many`, `close`.
+2. Optionally override hooks: `on_connect/on_close`, `before_insert/after_insert`.
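A minimal sketch of a backend, with a trimmed stand-in for `BaseDB` inlined so the snippet is self-contained. `MemoryDB` is an invented illustration; a real backend would wrap a driver or client.

```python
from abc import ABC, abstractmethod
from typing import Iterable


class BaseDB(ABC):
    """Trimmed stand-in for databank.db.base.BaseDB (illustration only)."""

    @abstractmethod
    def connect(self) -> None: ...

    @abstractmethod
    def ensure_indexes(self) -> None: ...

    @abstractmethod
    def insert_many(self, docs: Iterable[dict]) -> int: ...

    @abstractmethod
    def close(self) -> None: ...


class MemoryDB(BaseDB):
    """Hypothetical in-memory backend, just to show the shape of the contract."""

    def __init__(self) -> None:
        self.rows: list = []
        self.connected = False

    def connect(self) -> None:
        self.connected = True

    def ensure_indexes(self) -> None:
        pass  # idempotent no-op: nothing to index in a plain list

    def insert_many(self, docs: Iterable[dict]) -> int:
        batch = list(docs)
        self.rows.extend(batch)
        return len(batch)  # count of persisted documents, per the contract

    def close(self) -> None:
        self.connected = False


db = MemoryDB()
db.connect()
db.ensure_indexes()
inserted = db.insert_many([{"id": "1"}, {"id": "2"}])
db.close()
```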
+
+Implementing a reporter (outline only):
+1. Subclass `BaseReporter` and implement: `notify_start`, `notify_success`, `notify_error`, `notify_summary`.
+2. Optionally override `on_session_start/on_session_end`.
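A sketch of a reporter, again with the base inlined in trimmed form. `ListReporter` merely records events; a real reporter might send email or POST to a webhook.

```python
from abc import ABC, abstractmethod
from typing import Iterable


class BaseReporter(ABC):
    """Trimmed stand-in for databank.reporter.base.BaseReporter (illustration only)."""

    @abstractmethod
    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None: ...

    @abstractmethod
    def notify_success(self, spider_name: str, count: int) -> None: ...

    @abstractmethod
    def notify_error(self, spider_name: str, error: str) -> None: ...

    @abstractmethod
    def notify_summary(self, summary: object) -> None: ...


class ListReporter(BaseReporter):
    """Hypothetical reporter that records events in memory."""

    def __init__(self) -> None:
        self.events: list = []

    def notify_start(self, spider_name, urls):
        self.events.append(f"start:{spider_name}:{len(list(urls))}")

    def notify_success(self, spider_name, count):
        self.events.append(f"ok:{spider_name}:{count}")

    def notify_error(self, spider_name, error):
        self.events.append(f"err:{spider_name}:{error}")

    def notify_summary(self, summary):
        self.events.append("summary")


rep = ListReporter()
rep.notify_start("demo", ["https://example.com/a"])
rep.notify_success("demo", 3)
rep.notify_summary(None)
```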
+
+Implementing a runner/scheduler (outline only):
+1. Subclass `RunnerBase` to coordinate spiders -> DB -> reporters.
+2. Subclass `SchedulerBase` to install/trigger schedules (e.g., via systemd/cron in your own code).
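A sketch of the coordination flow in step 1, with a trimmed stand-in for `RunnerBase` and tiny duck-typed stubs for spider, DB, and reporter (all names starting with `_` are invented just to exercise the loop).

```python
from abc import ABC, abstractmethod


class RunnerBase(ABC):
    """Trimmed stand-in: coordinate spiders -> DB -> reporters."""

    def __init__(self, db, reporters):
        self._db = db
        self._reporters = list(reporters)

    @abstractmethod
    def run(self, spiders: dict) -> dict: ...


class SequentialRunner(RunnerBase):
    """Hypothetical runner: processes each spider's URLs in order."""

    def run(self, spiders):
        summary = {"total_docs": 0, "per_spider": {}}
        self._db.connect()
        try:
            for spider, urls in spiders.items():
                for r in self._reporters:
                    r.notify_start(spider.name, urls)
                docs = spider.run(urls)
                count = self._db.insert_many(docs)
                summary["per_spider"][spider.name] = count
                summary["total_docs"] += count
                for r in self._reporters:
                    r.notify_success(spider.name, count)
        finally:
            self._db.close()  # release resources even on failure
        for r in self._reporters:
            r.notify_summary(summary)
        return summary


class _Spider:  # tiny duck-typed stubs, just to exercise the flow
    name = "demo"
    def run(self, urls): return [{"url": u} for u in urls]

class _DB:
    def __init__(self): self.rows = []
    def connect(self): pass
    def insert_many(self, docs): self.rows.extend(docs); return len(docs)
    def close(self): pass

class _Reporter:
    def __init__(self): self.events = []
    def notify_start(self, name, urls): self.events.append("start")
    def notify_success(self, name, count): self.events.append("ok")
    def notify_summary(self, summary): self.events.append("summary")

runner = SequentialRunner(_DB(), [_Reporter()])
result = runner.run({_Spider(): ["https://example.com/a", "https://example.com/b"]})
```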
+
+Implementing analytics (outline only):
+1. Subclass `AnalyticsBase` and implement `compute(data, **kwargs)`.
+2. Optional staged hooks: `prepare`, `validate`, `transform`, `finalize`.
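A sketch of the staged pipeline, with a trimmed stand-in for `AnalyticsBase` inlined. `KindCounts` is an invented example that tallies documents per `kind` field.

```python
from abc import ABC, abstractmethod
from collections import Counter
from typing import Any


class AnalyticsBase(ABC):
    """Trimmed stand-in for databank.analytics.base.AnalyticsBase (illustration only)."""

    @abstractmethod
    def compute(self, data: Any, **kwargs: Any) -> Any: ...

    # Optional staged hooks: passthrough / no-op by default.
    def prepare(self, data, **kwargs): return data
    def validate(self, data, **kwargs): return None
    def transform(self, data, **kwargs): return data
    def finalize(self, result, **kwargs): return result


class KindCounts(AnalyticsBase):
    """Hypothetical analytics: count documents per 'kind' field."""

    def compute(self, data, **kwargs):
        data = self.transform(self.prepare(data))
        self.validate(data)
        return self.finalize(dict(Counter(d["kind"] for d in data)))


counts = KindCounts().compute(
    [{"kind": "match"}, {"kind": "match"}, {"kind": "team"}]
)
```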
+
+Operations: systemd templates
+- See `ops/systemd/databank.service` and `ops/systemd/databank.timer`.
+- Customize `User`, `WorkingDirectory`, and `ExecStart` for your environment.
+
+Optional linting (no dependencies enforced)
+- A minimal Pylint config is included in `pyproject.toml` under `[tool.pylint.*]`.
+- You can run Pylint in your environment if desired, for example:
+	- `pylint src/databank` (assuming Pylint is installed in your environment)
+	- The config disables ABC-related false positives while keeping docstring checks.
+
+Optional typing and linting (ruff/mypy)
+- Minimal configs for Ruff and mypy are also included in `pyproject.toml`.
+- If you have them installed locally, example commands:
+	- `ruff check src/databank`
+	- `mypy src/databank`
+	- Both are optional and will not run unless you invoke them.
+
+License
+- See `LICENSE` for details.

+ 15 - 0
ops/systemd/databank.service

@@ -0,0 +1,15 @@
+[Unit]
+Description=Databank football data pipeline (abstract layer)
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=databank
+WorkingDirectory=/opt/databank
+ExecStart=/usr/bin/python3 -m databank
+Restart=on-failure
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target

+ 9 - 0
ops/systemd/databank.timer

@@ -0,0 +1,9 @@
+[Unit]
+Description=Run Databank pipeline periodically
+
+[Timer]
+OnCalendar=hourly
+Persistent=true
+
+[Install]
+WantedBy=timers.target

+ 69 - 0
pyproject.toml

@@ -0,0 +1,69 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "databank"
+version = "0.0.1"
+description = "Databank - abstract-only initialization"
+readme = "README.md"
+requires-python = ">=3.9"
+license = { file = "LICENSE" }
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["databank*"]
+
+[tool.pytest.ini_options]
+addopts = "-q"
+pythonpath = ["src"]
+
+# Optional: Lint settings (no dependency changes here; for reference only)
+[tool.pylint.main]
+py-version = "3.9"
+jobs = 0
+recursive = true
+
+[tool.pylint.messages_control]
+disable = [
+		"too-few-public-methods",    # common for ABCs/interfaces
+		"too-many-arguments",        # allowed in abstract signatures
+		"too-many-instance-attributes",
+]
+enable = [
+		"C0116",  # missing-function-docstring (kept enabled, already addressed)
+]
+
+[tool.pylint.format]
+max-line-length = 100
+
+[tool.pylint.basic]
+good-names = [
+	"db", "id", "url", "raw", "exc", "tb"
+]
+
+# Optional: Ruff configuration (only active if ruff is installed in your env)
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+
+# In current Ruff releases, rule selection lives under [tool.ruff.lint]
+[tool.ruff.lint]
+extend-select = [
+	"E",  # pycodestyle errors
+	"F",  # pyflakes
+	"N",  # pep8-naming
+]
+
+# Optional: mypy configuration (only active if mypy is installed in your env)
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_ignores = true
+ignore_missing_imports = true
+no_implicit_optional = false
+strict_optional = false
+disallow_untyped_defs = false

+ 6 - 0
src/databank/__init__.py

@@ -0,0 +1,6 @@
+"""Databank package root (abstract-only initialization)."""
+
+from __future__ import annotations
+
+__all__ = ["__version__"]
+__version__ = "0.0.1"

+ 1 - 0
src/databank/analytics/__init__.py

@@ -0,0 +1 @@
+"""Analytics abstractions."""

+ 115 - 0
src/databank/analytics/base.py

@@ -0,0 +1,115 @@
+"""Abstract analytics interface (no algorithms, no specific modules).
+
+Contract (must implement):
+- compute(data, **kwargs) -> Any
+
+Extension points (optional to override):
+- prepare/validate/transform/finalize hooks to shape analytics pipeline.
+- configure/metadata: generic configuration and metadata.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Any, Mapping, Optional
+
+
+class AnalyticsError(Exception):
+    """Base error for analytics operations."""
+
+
+class AnalyticsBase(ABC):
+    """A minimal, generic analytics interface.
+
+    Concrete analytics should implement compute(), while optional hooks provide
+    a simple staged pipeline interface.
+    """
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: object
+    ) -> None:
+        """Initialize analytics instance with optional logger and options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary key/value options.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, object] = dict(options)
+        self._metadata: dict[str, object] = {}
+
+    @abstractmethod
+    def compute(self, data: Any, **kwargs: Any) -> Any:
+        """Compute analytics result from input data.
+
+        Args:
+            data: Input data for the analytics computation.
+            **kwargs: Optional parameters to influence computation.
+
+        Returns:
+            An analytics result, type defined by the concrete implementation.
+        """
+
+    # ---- Optional hooks (no-op by default) ----
+    def prepare(self, data: Any, **_kwargs: Any) -> Any:  # pragma: no cover
+        """Pre-compute preparation (e.g., enrich context). Default: passthrough.
+
+        Args:
+            data: Input to be prepared.
+
+        Returns:
+            Prepared data (defaults to input unchanged).
+        """
+        return data
+
+    def validate(self, data: Any, **_kwargs: Any) -> None:  # pragma: no cover
+        """Validate data preconditions. Default: no validation.
+
+        Args:
+            data: Prepared input data.
+        """
+
+    def transform(self, data: Any, **_kwargs: Any) -> Any:  # pragma: no cover
+        """Transform data before compute. Default: passthrough.
+
+        Args:
+            data: Validated input data.
+
+        Returns:
+            Transformed data (defaults to input unchanged).
+        """
+        return data
+
+    def finalize(self, result: Any, **_kwargs: Any) -> Any:  # pragma: no cover
+        """Finalize result post-compute (e.g., rounding/formatting). Default: identity.
+
+        Args:
+            result: Raw result returned by ``compute``.
+
+        Returns:
+            Final result (defaults to input unchanged).
+        """
+        return result
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: object) -> None:  # pragma: no cover
+        """Update generic configuration options for this analytics instance.
+
+        Args:
+            **options: Arbitrary key/value options.
+        """
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, object]:  # pragma: no cover
+        """Read-only view of analytics metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: object) -> None:  # pragma: no cover
+        """Update analytics metadata.
+
+        Args:
+            **meta: Arbitrary key/value metadata.
+        """
+        self._metadata.update(meta)

+ 1 - 0
src/databank/core/__init__.py

@@ -0,0 +1 @@
+"""Core models and types."""

+ 31 - 0
src/databank/core/models.py

@@ -0,0 +1,31 @@
+"""Core models (data-only).
+
+No concrete behavior here, just containers shared by interfaces.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, MutableMapping, Optional
+
+
+@dataclass
+class Document:
+    """Generic document emitted by spiders and stored by DB backends."""
+
+    id: Optional[str]
+    kind: str
+    data: MutableMapping[str, Any] = field(default_factory=dict)
+    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+@dataclass
+class RunSummary:
+    """Summary object passed to reporters after a run."""
+
+    started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    finished_at: Optional[datetime] = None
+    total_docs: int = 0
+    per_spider: MutableMapping[str, int] = field(default_factory=dict)
+    errors: list[str] = field(default_factory=list)
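A brief usage sketch of the `Document` container (a local copy is inlined here for illustration; the field values are invented). `timezone.utc` keeps the default factory compatible with the declared `requires-python = ">=3.9"`, since `datetime.UTC` only exists from Python 3.11.

```python
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, MutableMapping, Optional


# Local copy of the Document container from core/models.py, for illustration.
@dataclass
class Document:
    id: Optional[str]
    kind: str
    data: MutableMapping[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))


# created_at defaults to a timezone-aware UTC timestamp.
doc = Document(id=None, kind="match", data={"home": "ARS", "away": "CHE"})
```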

+ 1 - 0
src/databank/db/__init__.py

@@ -0,0 +1 @@
+"""Database abstractions."""

+ 150 - 0
src/databank/db/base.py

@@ -0,0 +1,150 @@
+"""Abstract database backend interface (no concrete implementations).
+
+Contract (must implement):
+- connect(): establish connection/clients.
+- ensure_indexes(): create indexes if needed.
+- insert_many(docs): persist documents and return count.
+- close(): release resources.
+
+Extension points (optional to override):
+- on_connect/on_close: lifecycle boundaries.
+- before_insert/after_insert: hooks around bulk insert.
+- configure/metadata: generic configuration and metadata.
+
+Notes:
+- This module intentionally contains no concrete DB backend.
+- Attributes like timeouts and retries are advisory; concrete backends may honor them.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Iterable, Mapping, Optional
+
+from databank.core.models import Document
+
+
+# Error hierarchy
+class DBError(Exception):
+    """Base error for DB operations."""
+
+
+class ConnectError(DBError):
+    """Raised when connecting to the backend fails."""
+
+
+class InsertError(DBError):
+    """Raised when inserting documents fails."""
+
+
+class BaseDB(ABC):
+    """Abstract DB defining minimal persistence operations.
+
+    Attributes (advisory hints):
+    - request_timeout_s: Suggested timeout for DB operations (seconds).
+    - max_retries: Suggested retries for transient failures.
+    - logger: Optional logger.
+    - options: Arbitrary configuration bag.
+    - metadata: Arbitrary metadata for this DB instance.
+    """
+
+    request_timeout_s: Optional[float] = None
+    max_retries: int = 0
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: object
+    ) -> None:  # noqa: D401
+        """Initialize DB backend with optional logger and options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary key/value options.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, object] = dict(options)
+        self._metadata: dict[str, object] = {}
+
+    @abstractmethod
+    def connect(self) -> None:
+        """Establish connection to backend.
+
+        Raises:
+            ConnectError: If the backend connection cannot be established.
+        """
+
+    @abstractmethod
+    def ensure_indexes(self) -> None:
+        """Create indexes if needed.
+
+        Notes:
+            Implementations should be idempotent and safe to call multiple times.
+        """
+
+    @abstractmethod
+    def insert_many(self, docs: Iterable[Document]) -> int:
+        """Insert documents and return count inserted/upserted.
+
+        Args:
+            docs: Iterable of Document instances to persist.
+
+        Returns:
+            Number of successfully inserted (or upserted) documents.
+
+        Raises:
+            InsertError: If the insert operation fails.
+        """
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close connections and release resources.
+
+        Notes:
+            This method should be safe to call multiple times.
+        """
+
+    # ---- Optional lifecycle hooks ----
+    def on_connect(self) -> None:  # pragma: no cover
+        """Hook invoked after successful connect()."""
+
+    def on_close(self) -> None:  # pragma: no cover
+        """Hook invoked after successful close()."""
+
+    def before_insert(self, _docs: Iterable[Document]) -> None:  # pragma: no cover
+        """Hook invoked before insert_many().
+
+        Args:
+            _docs: Documents intended for insertion (read-only).
+        """
+
+    def after_insert(
+        self, _docs: Iterable[Document], _count: int
+    ) -> None:  # pragma: no cover
+        """Hook invoked after insert_many().
+
+        Args:
+            _docs: Documents that were attempted to be inserted.
+            _count: Number of documents successfully inserted/upserted.
+        """
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: object) -> None:  # pragma: no cover
+        """Update generic configuration options for this DB instance.
+
+        Args:
+            **options: Arbitrary key/value options.
+        """
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, object]:  # pragma: no cover
+        """Read-only view of DB metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: object) -> None:  # pragma: no cover
+        """Update DB metadata.
+
+        Args:
+            **meta: Arbitrary key/value metadata.
+        """
+        self._metadata.update(meta)

+ 1 - 0
src/databank/reporter/__init__.py

@@ -0,0 +1 @@
+"""Reporter abstractions."""

+ 110 - 0
src/databank/reporter/base.py

@@ -0,0 +1,110 @@
+"""Abstract reporting interfaces (no concrete implementations).
+
+Contract (must implement):
+- notify_start(spider_name, urls)
+- notify_success(spider_name, count)
+- notify_error(spider_name, error)
+- notify_summary(summary)
+
+Extension points (optional to override):
+- on_session_start/on_session_end: reporter session boundaries.
+- configure/metadata: generic configuration and metadata.
+
+Notes:
+- This module intentionally contains no concrete delivery channel (e.g., email/webhook).
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Iterable, Mapping, Optional
+
+from databank.core.models import RunSummary
+
+
+class ReporterError(Exception):
+    """Base error for reporter operations."""
+
+
+class BaseReporter(ABC):
+    """Minimal reporter lifecycle hooks with optional configuration."""
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: object
+    ) -> None:
+        """Initialize reporter with optional logger and options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary key/value options.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, object] = dict(options)
+        self._metadata: dict[str, object] = {}
+
+    @abstractmethod
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:
+        """Called before a spider starts.
+
+        Args:
+            spider_name: The spider identifier.
+            urls: The list (or iterable) of URLs to be processed.
+        """
+
+    @abstractmethod
+    def notify_success(self, spider_name: str, count: int) -> None:
+        """Called after a spider successfully persists docs.
+
+        Args:
+            spider_name: The spider identifier.
+            count: Number of documents persisted by this run.
+        """
+
+    @abstractmethod
+    def notify_error(self, spider_name: str, error: str) -> None:
+        """Called when a spider run fails.
+
+        Args:
+            spider_name: The spider identifier.
+            error: Error message (or formatted summary) of the failure.
+        """
+
+    @abstractmethod
+    def notify_summary(self, summary: RunSummary) -> None:
+        """Called once after all spiders finished.
+
+        Args:
+            summary: Aggregated run summary across all spiders.
+        """
+
+    # Intentionally no Email/Webhook specific bases at initialization stage
+
+    # ---- Optional lifecycle hooks ----
+    def on_session_start(self) -> None:  # pragma: no cover
+        """Hook invoked when a reporting session begins."""
+
+    def on_session_end(self) -> None:  # pragma: no cover
+        """Hook invoked when a reporting session ends."""
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: object) -> None:  # pragma: no cover
+        """Update generic configuration options for this reporter instance.
+
+        Args:
+            **options: Arbitrary key/value options.
+        """
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, object]:  # pragma: no cover
+        """Read-only view of reporter metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: object) -> None:  # pragma: no cover
+        """Update reporter metadata.
+
+        Args:
+            **meta: Arbitrary key/value metadata.
+        """
+        self._metadata.update(meta)

+ 1 - 0
src/databank/scheduler/__init__.py

@@ -0,0 +1 @@
+"""Scheduling abstractions."""

+ 159 - 0
src/databank/scheduler/base.py

@@ -0,0 +1,159 @@
+"""Abstract scheduler/runner interfaces (no concrete implementations).
+
+Contract (must implement):
+- RunnerBase.run(spiders): coordinate spiders and return RunSummary.
+- SchedulerBase.schedule(): install/trigger schedules.
+
+Extension points (optional to override):
+- RunnerBase: on_run_start/on_run_end hooks, configure/metadata.
+- SchedulerBase: on_schedule/on_unschedule hooks, configure/metadata.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Dict, Iterable, List, Mapping, Optional
+
+from databank.core.models import RunSummary
+from databank.db.base import BaseDB
+from databank.reporter.base import BaseReporter
+from databank.spiders.base import BaseSpider
+
+
+class RunnerError(Exception):
+    """Base error for runner operations."""
+
+
+class SchedulerError(Exception):
+    """Base error for scheduler operations."""
+
+
+class RunnerBase(ABC):
+    """Coordinate spiders, DB and reporters (interface only)."""
+
+    def __init__(
+        self,
+        db: BaseDB,
+        reporters: Iterable[BaseReporter],
+        *,
+        logger: Optional[logging.Logger] = None,
+        **options: object,
+    ) -> None:
+        """Initialize the runner with DB and reporters.
+
+        Args:
+            db: Database backend instance.
+            reporters: Reporter instances to receive notifications.
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary key/value options.
+        """
+        self._db = db
+        self._reporters = list(reporters)
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, object] = dict(options)
+        self._metadata: dict[str, object] = {}
+
+    @abstractmethod
+    def run(self, spiders: Dict[BaseSpider, List[str]]) -> RunSummary:
+        """Run spiders over URLs and persist results; return summary.
+
+        Args:
+            spiders: A mapping of spider instances to their respective URL lists.
+
+        Returns:
+            Aggregated :class:`RunSummary` across all spiders.
+        """
+
+    # ---- Optional lifecycle hooks ----
+    def on_run_start(
+        self, _spiders: Dict[BaseSpider, List[str]]
+    ) -> None:  # pragma: no cover
+        """Hook invoked before a coordinated run begins.
+
+        Args:
+            _spiders: The mapping that will be processed during this run.
+        """
+
+    def on_run_end(self, _summary: RunSummary) -> None:  # pragma: no cover
+        """Hook invoked after a coordinated run ends.
+
+        Args:
+            _summary: The aggregated result of the run.
+        """
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: object) -> None:  # pragma: no cover
+        """Update generic configuration options for this runner instance.
+
+        Args:
+            **options: Arbitrary key/value options.
+        """
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, object]:  # pragma: no cover
+        """Read-only view of runner metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: object) -> None:  # pragma: no cover
+        """Update runner metadata.
+
+        Args:
+            **meta: Arbitrary key/value metadata.
+        """
+        self._metadata.update(meta)
+
+
+class SchedulerBase(ABC):
+    """An interface for scheduling runs (cron/systemd left to ops/impl)."""
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: object
+    ) -> None:
+        """Initialize the scheduler with optional logger and options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary key/value options.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, object] = dict(options)
+        self._metadata: dict[str, object] = {}
+
+    @abstractmethod
+    def schedule(self) -> None:  # pragma: no cover
+        """Install/trigger schedules (implementation dependent).
+
+        Notes:
+            Implementations may choose OS-native schedulers such as cron or systemd.
+        """
+
+    # ---- Optional lifecycle hooks ----
+    def on_schedule(self) -> None:  # pragma: no cover
+        """Hook invoked when schedules are registered."""
+
+    def on_unschedule(self) -> None:  # pragma: no cover
+        """Hook invoked when schedules are removed."""
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: object) -> None:  # pragma: no cover
+        """Update generic configuration options for this scheduler instance.
+
+        Args:
+            **options: Arbitrary key/value options.
+        """
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, object]:  # pragma: no cover
+        """Read-only view of scheduler metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: object) -> None:  # pragma: no cover
+        """Update scheduler metadata.
+
+        Args:
+            **meta: Arbitrary key/value metadata.
+        """
+        self._metadata.update(meta)

+ 1 - 0
src/databank/spiders/__init__.py

@@ -0,0 +1 @@
+"""Spider abstractions."""

+ 268 - 0
src/databank/spiders/base.py

@@ -0,0 +1,268 @@
+"""Abstract spider interface for Databank (no concrete spiders here).
+
+Contract (must implement):
+- build_payload(url): pure builder for request parameters/headers/body.
+- fetch(url, payload): perform retrieval and return raw textual content.
+- parse(url, content, payload): transform raw content into a sequence of Documents.
+
+Extension points (optional to override):
+- on_run_start / on_run_end: hooks at the boundaries of a run.
+- should_fetch: per-URL gate to skip fetching (e.g., dedup or robots checks).
+- before_fetch / after_fetch: lifecycle hooks around network retrieval.
+- transform: post-parse normalization/filtering step over parsed documents.
+- handle_error: customize error handling per-URL (default: re-raise).
+- configure/metadata: provide generic configuration and metadata.
+- close: release resources (sessions, files, etc.); context-manager friendly.
+
+Notes:
+- This module intentionally contains no concrete spider implementation.
+- The provided class attributes (timeouts, retries, etc.) are advisory hints;
+    concrete implementations may honor them but this base does not enforce behavior.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Any, Iterable, Mapping, Sequence, Optional
+
+from databank.core.models import Document
+
+# Type aliases for clarity
+URL = str
+Payload = Mapping[str, Any]
+Documents = Sequence[Document]
+
+
+# Error hierarchy (implementations may raise these for clarity)
+class SpiderError(Exception):
+    """Base error for spider operations."""
+
+
+class BuildPayloadError(SpiderError):
+    """Raised when building the request payload fails for a URL."""
+
+
+class FetchError(SpiderError):
+    """Raised when fetching raw content for a URL fails."""
+
+
+class ParseError(SpiderError):
+    """Raised when parsing raw content into documents fails."""
+
+
+class BaseSpider(ABC):
+    """Abstract spider definition.
+
+    Attributes (advisory; implementations may choose to honor these):
+    - name: Identifier for this spider.
+    - max_retries: Suggested maximum retry attempts per URL (default 0).
+    - request_timeout_s: Suggested request timeout in seconds (None = no limit).
+    - rate_limit_per_sec: Suggested maximum requests per second (None = unlimited).
+    - default_headers: Suggested default HTTP headers mapping.
+    - logger: Optional logger for diagnostics.
+    - options: Arbitrary configuration bag provided at construction.
+    - metadata: Arbitrary, read-mostly metadata about this spider instance.
+    """
+
+    name: str = "base"
+    max_retries: int = 0
+    request_timeout_s: Optional[float] = None
+    rate_limit_per_sec: Optional[float] = None
+    default_headers: Optional[Mapping[str, str]] = None
+
+    def __init__(
+        self, *, logger: Optional[logging.Logger] = None, **options: Any
+    ) -> None:
+        """Initialize spider with optional logger and configuration options.
+
+        Args:
+            logger: Optional logger for diagnostics.
+            **options: Arbitrary configuration bag for concrete implementation.
+        """
+        self.logger: Optional[logging.Logger] = logger
+        self.options: dict[str, Any] = dict(options)
+        self._metadata: dict[str, Any] = {}
+
+    @abstractmethod
+    def build_payload(self, url: URL) -> Payload:
+        """Build request params/body or headers from a URL.
+
+        Args:
+            url: Target URL to be fetched.
+
+        Returns:
+            A mapping of request parameters/headers/body to be used by ``fetch``.
+
+        Raises:
+            BuildPayloadError: If payload construction fails for ``url``.
+        """
+
+    @abstractmethod
+    def fetch(self, url: URL, payload: Payload) -> str:
+        """Fetch raw textual content for a URL using the given payload.
+
+        Args:
+            url: Target URL to fetch.
+            payload: Request parameters prepared by ``build_payload``.
+
+        Returns:
+            Raw textual content.
+
+        Raises:
+            FetchError: If retrieval fails for ``url``.
+        """
+
+    @abstractmethod
+    def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+        """Parse raw content into a sequence of Documents.
+
+        Args:
+            url: URL associated with the content.
+            content: Raw textual content fetched by ``fetch``.
+            payload: The payload used to fetch, for context if needed.
+
+        Returns:
+            A sequence of :class:`~databank.core.models.Document` instances.
+
+        Raises:
+            ParseError: If parsing fails for ``url``.
+        """
+
+    # ---- Optional lifecycle hooks (no-op by default) ----
+    def on_run_start(self, urls: Iterable[URL]) -> None:  # pragma: no cover
+        """Hook invoked once before processing a batch of URLs.
+
+        Args:
+            urls: Collection of URLs to be processed in this run.
+        """
+
+    def on_run_end(
+        self, urls: Iterable[URL], results: Sequence[Document], error_count: int
+    ) -> None:  # pragma: no cover
+        """Hook invoked once after processing a batch of URLs.
+
+        Args:
+            urls: The same collection passed to ``on_run_start``.
+            results: All successfully parsed documents.
+            error_count: Number of URLs that raised errors.
+        """
+
+    def should_fetch(self, _url: URL, _payload: Payload) -> bool:  # pragma: no cover
+        """Return False to skip fetching this URL (e.g., dedup, robots, filters).
+
+        Returns:
+            True to proceed with fetching; False to skip this URL.
+        """
+        return True
+
+    def before_fetch(self, url: URL, payload: Payload) -> None:  # pragma: no cover
+        """Hook invoked before fetch; override for logging/metrics/rate-limit.
+
+        Args:
+            url: URL to fetch.
+            payload: Request parameters prepared by ``build_payload``.
+        """
+
+    def after_fetch(
+        self, url: URL, payload: Payload, content: str
+    ) -> None:  # pragma: no cover
+        """Hook invoked after fetch; override for logging/metrics/tracing.
+
+        Args:
+            url: URL fetched.
+            payload: Request parameters used to fetch.
+            content: Raw textual content returned by ``fetch``.
+        """
+
+    def handle_error(
+        self, url: URL, payload: Payload, exc: Exception
+    ) -> None:  # pragma: no cover
+        """Handle per-URL errors; default behavior re-raises the exception.
+
+        Implementations may log, collect metrics, or convert exceptions.
+
+        Args:
+            url: URL whose processing failed.
+            payload: Payload built for this URL.
+            exc: Original exception raised.
+        """
+        raise exc
+
+    def transform(self, _url: URL, docs: Documents) -> Documents:  # pragma: no cover
+        """Post-parse transformation/normalization stage; default is identity.
+
+        Args:
+            _url: URL associated with ``docs``.
+            docs: Parsed documents.
+
+        Returns:
+            Possibly modified documents (default: unchanged).
+        """
+        return docs
+
+    def run(self, urls: Iterable[URL]) -> list[Document]:
+        """Reference orchestration: build -> fetch -> parse.
+
+        Steps:
+            1) ``build_payload`` per URL.
+            2) ``should_fetch`` gate.
+            3) ``before_fetch`` -> ``fetch`` -> ``after_fetch``.
+            4) ``parse`` -> ``transform``.
+
+        Implementations may override for concurrency, caching, or tracing.
+
+        Args:
+            urls: URLs to process.
+
+        Returns:
+            A list of parsed documents across all URLs.
+        """
+        results: list[Document] = []
+        urls_seq = tuple(urls)
+        self.on_run_start(urls_seq)
+        error_count = 0
+        for url in urls_seq:
+            payload = self.build_payload(url)
+            if not self.should_fetch(url, payload):
+                continue
+            self.before_fetch(url, payload)
+            try:
+                raw = self.fetch(url, payload)
+                self.after_fetch(url, payload, raw)
+                docs = self.parse(url, raw, payload)
+                docs = self.transform(url, docs)
+            except Exception as exc:  # pylint: disable=broad-except
+                self.handle_error(url, payload, exc)
+                error_count += 1
+                continue
+            results.extend(docs)
+        self.on_run_end(urls_seq, results, error_count)
+        return results
+
+    # ---- Resource management ----
+    def close(self) -> None:  # pragma: no cover
+        """Release resources (e.g., network sessions)."""
+
+    # ---- Generic configuration & metadata ----
+    def configure(self, **options: Any) -> None:  # pragma: no cover
+        """Update generic configuration options for this spider instance."""
+        self.options.update(options)
+
+    @property
+    def metadata(self) -> Mapping[str, Any]:  # pragma: no cover
+        """Read-only view of spider metadata."""
+        return dict(self._metadata)
+
+    def set_metadata(self, **meta: Any) -> None:  # pragma: no cover
+        """Update spider metadata."""
+        self._metadata.update(meta)
+
+    # Context manager support
+    def __enter__(self) -> "BaseSpider":  # pragma: no cover
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> bool:  # pragma: no cover
+        self.close()
+        # Do not suppress exceptions
+        return False
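The reference loop in `BaseSpider.run()` routes per-URL failures through `handle_error`, which re-raises by default; overriding it to swallow the exception makes `run()` count the error and move on to the next URL. A self-contained sketch of that behavior, using a trimmed local copy of the loop (all names here are illustrative):

```python
from abc import ABC, abstractmethod


class MiniSpider(ABC):
    """Trimmed local copy of BaseSpider's reference run loop, for illustration."""

    def build_payload(self, url): return {}
    def should_fetch(self, url, payload): return True
    def transform(self, url, docs): return docs

    def handle_error(self, url, payload, exc):
        raise exc  # default: re-raise, aborting the run

    @abstractmethod
    def fetch(self, url, payload): ...

    @abstractmethod
    def parse(self, url, content, payload): ...

    def run(self, urls):
        results, errors = [], 0
        for url in tuple(urls):
            payload = self.build_payload(url)
            if not self.should_fetch(url, payload):
                continue
            try:
                raw = self.fetch(url, payload)
                docs = self.transform(url, self.parse(url, raw, payload))
            except Exception as exc:
                self.handle_error(url, payload, exc)
                errors += 1  # reached only if handle_error did not re-raise
                continue
            results.extend(docs)
        return results


class TolerantSpider(MiniSpider):
    """Overrides handle_error to swallow failures instead of re-raising."""

    def fetch(self, url, payload):
        if "bad" in url:
            raise ValueError("boom")
        return url.upper()

    def parse(self, url, content, payload):
        return [{"url": url, "content": content}]

    def handle_error(self, url, payload, exc):
        pass  # swallow: run() counts the error and continues


docs = TolerantSpider().run(["https://a.example", "https://bad.example"])
```

With the default `handle_error`, the second URL's `ValueError` would propagate out of `run()`; with the tolerant override, only the failing URL is dropped.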