Prechádzať zdrojové kódy

添加 MongoDB 后端实现,配置和引导工具,示例脚本,以及更新文档和抽象类,增强项目结构和可用性。

admin 3 mesiacov pred
rodič
commit
0c27ee2645

+ 39 - 1
README.md

@@ -1,5 +1,13 @@
 # Databank (Abstract Skeleton)
 
+Policy: scripts are executable, examples are stubs
+- All runnable entry points live under `scripts/` and can be executed directly with `python`.
+- Files under `src/databank/examples/` are stubs for guidance/docs only and will print instructions.
+- Prefer running demos via scripts:
+	- `python scripts/seed_leagues_mongo.py`
+	- `python scripts/seed_seasons_mongo.py`
+	- `python scripts/test_get_league_match_list.py`
+
 This repository is a pure abstract skeleton intended to define stable contracts
 for a multi-spider data pipeline. It deliberately contains only abstract/base
 classes and core models, without any concrete implementations.
@@ -59,4 +67,34 @@ Optional typing and linting (ruff/mypy)
 	- Both are optional and will not run unless you invoke them.
 
 License
-- See `LICENSE` for details.
+- See `LICENSE` for details.
+
+Abstract-safe initialization helpers
+- `databank.config.settings`:
+	- `DBSettings`: 通用数据库设置容器(不绑定具体后端)。
+	- `load_db_settings(prefix="DATABANK_DB_")`: 从环境变量读取设置(如 `DATABANK_DB_URI`、`DATABANK_DB_NAME` 等)。
+	- `settings_to_options(settings)`: 将 `DBSettings` 转换为通用 `configure(**options)` 所需字典。
+	- `merge_options(base, extra)`: 合并两份 options(右侧覆盖)。
+- `databank.bootstrap.db`:
+	- `DBBootstrapOptions`: 启动选项(是否 `connect`、`ensure_indexes`,以及 `configure_options`)。
+	- `bootstrap_db(db, options)`: 以抽象方式调用 `configure`→`connect`→`ensure_indexes`。
+	- `db_session(db, options)`: 上下文管理器,产出连接后的 DB,并在退出时安全关闭。
+
+示例(仅展示编排,不包含具体后端实现):
+```python
+from databank import config, bootstrap
+
+# 假设你有一个自定义的 DB 实现 `MyDB(BaseDB)`,此处仅示意。
+from mypkg.db import MyDB  # 你的实现,不在本仓库内
+
+settings = config.load_db_settings()
+options = config.settings_to_options(settings)
+
+db = MyDB()
+boot = bootstrap.DBBootstrapOptions(configure_options=options, connect=True, ensure_indexes=True)
+
+with bootstrap.db_session(db, boot) as conn:
+    # 在此使用 conn.insert_many([...]) 等抽象方法
+    pass
+```
+以上编排层不引入任何具体驱动或后端,仅依赖于 `BaseDB` 约定,便于后续在你自己的实现中复用。

+ 2 - 0
pyproject.toml

@@ -55,6 +55,8 @@ extend-select = [
 	"N",  # pep8-naming
 ]
 extend-ignore = [
+	# E203: whitespace before ':' (PEP 8 recommendation conflicts with some formatters)
+	"E203",
 	# Allow short names in specific contexts (already handled via pylint good-names)
 ]
 

+ 74 - 0
scripts/seed_leagues_mongo.py

@@ -0,0 +1,74 @@
+"""Seed initial league data into a local MongoDB instance.
+
+Usage (PowerShell):
+  python -m pip install "pymongo>=4.7"
+  $env:DATABANK_DB_URI = "mongodb://localhost:27017"
+  $env:DATABANK_DB_NAME = "databank"
+  python scripts/seed_leagues_mongo.py
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, Iterable
+
+from pymongo import MongoClient
+
+
+def get_connection() -> tuple[MongoClient, str]:
+    """Create a MongoDB client and return ``(client, db_name)``.
+
+    Reads env vars with sensible defaults.
+    """
+    uri_default = "mongodb://localhost:27017"
+    name_default = "databank"
+
+    uri = os.getenv("DATABANK_DB_URI", uri_default)
+    name = os.getenv("DATABANK_DB_NAME", name_default)
+
+    client = MongoClient(uri)
+    return client, name
+
+
+def ensure_unique_index_leagues(db) -> None:
+    """Ensure a unique index on ``leagues.league_id``."""
+    leagues = db["leagues"]
+    leagues.create_index("league_id", unique=True)
+
+
+def upsert_leagues(db, docs: Iterable[Dict[str, Any]]) -> dict:
+    """Upsert provided league docs by ``league_id``; return counters."""
+    leagues = db["leagues"]
+    inserted = 0
+    updated = 0
+    for doc in docs:
+        key = {"league_id": doc["league_id"]}
+        # Replace entire document on upsert to keep fields consistent
+        result = leagues.replace_one(key, doc, upsert=True)
+        if result.matched_count:
+            updated += int(result.modified_count == 1)
+        else:
+            inserted += 1
+    return {"inserted": inserted, "updated": updated}
+
+
+def main() -> None:
+    """Entry point to seed initial league documents into MongoDB."""
+    client, db_name = get_connection()
+    try:
+        db = client[db_name]
+        ensure_unique_index_leagues(db)
+        seed = [
+            {"league_id": 2079, "league_name": "英超", "max_round": 38},
+            {"league_id": 2080, "league_name": "英冠", "max_round": 46},
+            {"league_id": 2081, "league_name": "英甲", "max_round": 46},
+            {"league_id": 2082, "league_name": "英乙", "max_round": 46},
+        ]
+        stats = upsert_leagues(db, seed)
+        print(f"Seed done: inserted={stats['inserted']}, updated={stats['updated']}")
+    finally:
+        client.close()
+
+
+if __name__ == "__main__":
+    main()

+ 104 - 0
scripts/seed_seasons_mongo.py

@@ -0,0 +1,104 @@
+"""Seed seasons into a local MongoDB instance with unique ``season``.
+
+Rules:
+- initial_year starts from 2016.
+- Determine current_year by today's date:
+  - If today > July 1 (strict), current_year = this_year + 1
+  - Else, current_year = this_year
+- Generate seasons from initial_year up to (but excluding) current_year, as strings
+  like "YYYY-YYYY+1". Stop when initial_year == current_year.
+- Ensure unique index on ``season`` and upsert each document.
+
+Environment variables (with defaults):
+- DATABANK_DB_URI (default: mongodb://localhost:27017)
+- DATABANK_DB_NAME (default: databank)
+
+Usage (PowerShell):
+  python -m pip install "pymongo>=4.7"
+  $env:DATABANK_DB_URI = "mongodb://localhost:27017"
+  $env:DATABANK_DB_NAME = "databank"
+  python scripts/seed_seasons_mongo.py
+"""
+
+from __future__ import annotations
+
+import os
+from datetime import date
+from typing import Any, Dict, Iterable, List, Tuple
+
+from pymongo import MongoClient
+
+
+def get_connection() -> Tuple[MongoClient, str]:
+    """Create a MongoDB client and return ``(client, db_name)``."""
+    uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
+    name = os.getenv("DATABANK_DB_NAME", "databank")
+    client = MongoClient(uri)
+    return client, name
+
+
+def calc_current_year(today: date | None = None) -> int:
+    """Calculate current_year per rule (> July 1 => year+1 else year)."""
+    today = today or date.today()
+    threshold = date(today.year, 7, 1)
+    return today.year + 1 if today > threshold else today.year
+
+
+def generate_seasons(initial_year: int, current_year: int) -> List[Dict[str, Any]]:
+    """Generate season documents from initial_year to current_year (exclusive)."""
+    seasons: List[Dict[str, Any]] = []
+    for start in range(initial_year, current_year):
+        end = start + 1
+        seasons.append(
+            {
+                "season": f"{start}-{end}",
+                "start_year": start,
+                "end_year": end,
+            }
+        )
+    return seasons
+
+
+def ensure_unique_index_seasons(db) -> None:
+    """Ensure a unique index on seasons.season."""
+    seasons = db["seasons"]
+    seasons.create_index("season", unique=True)
+
+
+def upsert_seasons(db, docs: Iterable[Dict[str, Any]]) -> Dict[str, int]:
+    """Upsert provided season docs by ``season``; return counters."""
+    seasons = db["seasons"]
+    inserted = 0
+    updated = 0
+    for doc in docs:
+        key = {"season": doc["season"]}
+        result = seasons.replace_one(key, doc, upsert=True)
+        if result.matched_count:
+            updated += int(result.modified_count == 1)
+        else:
+            inserted += 1
+    return {"inserted": inserted, "updated": updated}
+
+
+def main() -> None:
+    """Entry point to seed season documents into MongoDB with unique ``season``."""
+    client, db_name = get_connection()
+    try:
+        db = client[db_name]
+        ensure_unique_index_seasons(db)
+
+        initial_year = 2016
+        current = calc_current_year()
+        docs = generate_seasons(initial_year, current)
+
+        stats = upsert_seasons(db, docs)
+        print(
+            f"Seasons seed done: range=[{initial_year}, {current}) "
+            f"inserted={stats['inserted']}, updated={stats['updated']}"
+        )
+    finally:
+        client.close()
+
+
+if __name__ == "__main__":
+    main()

+ 65 - 0
scripts/test_get_league_match_list.py

@@ -0,0 +1,65 @@
+"""Executable demo: run GetLeagueMatchListSpider for three requests.
+
+Usage (PowerShell):
+  python -m pip install requests pymongo
+  $env:DATABANK_DB_URI = "mongodb://localhost:27017"
+  $env:DATABANK_DB_NAME = "databank"
+  python scripts/test_get_league_match_list.py
+"""
+
+from __future__ import annotations
+
+import os
+from collections import defaultdict
+
+from databank.db import MongoDB
+from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
+
+
+def pick_tokens(max_tokens: int = 3) -> list[str]:
+    """Build up to ``max_tokens`` URL tokens from MongoDB collections."""
+    uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
+    name = os.getenv("DATABANK_DB_NAME", "databank")
+    db = MongoDB(uri=uri, name=name)
+    db.connect()
+    try:
+        leagues = db.find("leagues", projection={"_id": 0}, limit=10)
+        seasons = db.find("seasons", projection={"_id": 0}, limit=10)
+        if not leagues:
+            raise RuntimeError("No leagues found. Seed leagues first.")
+        if not seasons:
+            raise RuntimeError("No seasons found. Seed seasons first.")
+
+        league = sorted(leagues, key=lambda x: x.get("league_id", 0))[0]
+        max_round = int(league.get("max_round", 1))
+        season_name = seasons[0]["season"]
+
+        tokens: list[str] = []
+        rounds = list(range(1, max_round + 1))[:max_tokens]
+        for r in rounds:
+            tokens.append(f"{league['league_id']}|{season_name}|{r}")
+        return tokens[:max_tokens]
+    finally:
+        db.close()
+
+
+def main() -> None:
+    """Run the demo and print a compact summary to stdout."""
+    spider = GetLeagueMatchListSpider()
+    urls = pick_tokens()
+    docs = spider.run(urls)
+
+    print(f"Fetched {len(docs)} documents in total.")
+
+    per_token = defaultdict(list)
+    for d in docs:
+        per_token[d.data.get("token", "unknown")].append(d)
+
+    for token, items in per_token.items():
+        print(f"Token: {token}, docs: {len(items)}")
+        if items:
+            print("Sample:", items[0].data.get("match") or items[0].data)
+
+
+if __name__ == "__main__":
+    main()

+ 15 - 1
src/databank/__init__.py

@@ -2,5 +2,19 @@
 
 from __future__ import annotations
 
-__all__ = ["__version__"]
+__all__ = [
+    "__version__",
+    # Convenience exports (keep abstract-only helpers discoverable)
+    "config",
+    "bootstrap",
+    # Expose common subpackages for convenience and analyzer friendliness
+    "db",
+    "spiders",
+]
 __version__ = "0.0.1"
+
+# Re-export subpackages as namespaces for convenience (no concrete backends)
+from . import config  # noqa: F401
+from . import bootstrap  # noqa: F401
+from . import db  # noqa: F401
+from . import spiders  # noqa: F401

+ 11 - 0
src/databank/bootstrap/__init__.py

@@ -0,0 +1,11 @@
+"""Bootstrap utilities for databank (abstract-only).
+
+Exports:
+- `db` submodule with `DBBootstrapOptions`, `bootstrap_db`, and `db_session`.
+"""
+
+from __future__ import annotations
+
+from .db import DBBootstrapOptions, bootstrap_db, db_session
+
+__all__ = ["DBBootstrapOptions", "bootstrap_db", "db_session"]

+ 105 - 0
src/databank/bootstrap/db.py

@@ -0,0 +1,105 @@
+"""Abstract-safe DB bootstrap helpers.
+
+This module orchestrates initialization flows around `BaseDB` without binding
+to any concrete backend. It provides:
+- `DBBootstrapOptions`: container for generic options and lifecycle flags.
+- `bootstrap_db`: a helper to configure/connect/ensure_indexes guarded by flags.
+- `db_session`: a context manager to yield a connected DB and ensure `close()`.
+
+It relies solely on `databank.db.base.BaseDB` contracts and accepts generic
+options (e.g., from `databank.config.settings.settings_to_options`).
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager, suppress
+from dataclasses import dataclass
+from typing import Iterator, Mapping, Optional, Protocol
+
+
+@dataclass(slots=True)
+class DBBootstrapOptions:
+    """Options for bootstrapping a `BaseDB` instance.
+
+    Attributes:
+        configure_options: Arbitrary options forwarded to `db.configure(**opts)`.
+        connect: Whether to call `db.connect()`.
+        ensure_indexes: Whether to call `db.ensure_indexes()` after connect.
+        suppress_close_errors: If True, suppress exceptions raised by `close()`.
+    """
+
+    configure_options: Mapping[str, object] | None = None
+    connect: bool = True
+    ensure_indexes: bool = True
+    suppress_close_errors: bool = False
+
+
+class BaseDBProto(Protocol):
+    """Protocol capturing the minimal `BaseDB` contract used here.
+
+    This avoids importing the actual `BaseDB` to keep this module abstract and
+    free from import path issues in early scaffolding stages.
+    """
+
+    def configure(self, **options: object) -> None:
+        """Configure the DB with arbitrary options."""
+
+    def connect(self) -> None:
+        """Establish the connection to the backend."""
+
+    def ensure_indexes(self) -> None:
+        """Create any required indexes on the backend, if applicable."""
+
+    def close(self) -> None:
+        """Close any open connections/resources."""
+
+
+def bootstrap_db(
+    db: BaseDBProto, options: Optional[DBBootstrapOptions] = None
+) -> BaseDBProto:
+    """Bootstrap a `BaseDB` using generic options and lifecycle flags.
+
+    The function avoids backend specifics. It only invokes `configure`, `connect`,
+    and `ensure_indexes` following the abstract `BaseDB` contract.
+
+    Args:
+        db: A `BaseDB` implementation instance.
+        options: Optional `DBBootstrapOptions`. Defaults connect+ensure_indexes.
+
+    Returns:
+        The same `db` instance after bootstrapping (for chaining).
+    """
+    opts = options or DBBootstrapOptions()
+    if opts.configure_options:
+        db.configure(**dict(opts.configure_options))
+    if opts.connect:
+        db.connect()
+        if opts.ensure_indexes:
+            db.ensure_indexes()
+    return db
+
+
+@contextmanager
+def db_session(
+    db: BaseDBProto, options: Optional[DBBootstrapOptions] = None
+) -> Iterator[BaseDBProto]:
+    """Context manager that yields a bootstrapped DB and closes it on exit.
+
+    Args:
+        db: A `BaseDB` implementation instance.
+        options: Bootstrap flags and configure options.
+
+    Yields:
+        A connected `BaseDB` instance.
+    """
+    try:
+        yield bootstrap_db(db, options)
+    finally:
+        # Always attempt to close regardless of errors raised by consumer
+        # By default (strict), do NOT suppress close errors.
+        if options and options.suppress_close_errors:
+            # Optionally suppress close errors if explicitly requested.
+            with suppress(Exception):  # pylint: disable=broad-except
+                db.close()
+        else:
+            db.close()

+ 12 - 0
src/databank/config/__init__.py

@@ -0,0 +1,12 @@
+"""Configuration utilities for databank (abstract-only)."""
+
+from __future__ import annotations
+
+from .settings import DBSettings, load_db_settings, settings_to_options, merge_options
+
+__all__ = [
+    "DBSettings",
+    "load_db_settings",
+    "settings_to_options",
+    "merge_options",
+]

+ 113 - 0
src/databank/config/settings.py

@@ -0,0 +1,113 @@
+"""Generic settings helpers for databank initialization (no concrete backends).
+
+This module avoids binding to any specific DB implementation. It provides:
+- A minimal `DBSettings` data container loaded from environment variables.
+- Utilities to convert settings into generic options that can be passed to
+  `BaseDB.configure(**options)`.
+
+Environment convention:
+- Prefix defaults to `DATABANK_DB_` (e.g., DATABANK_DB_URI, DATABANK_DB_NAME).
+- All fields are optional; concrete backends may ignore/extend them.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import os
+from typing import Mapping
+
+
+@dataclass(slots=True)
+class DBSettings:
+    """Generic DB settings container.
+
+    This intentionally stays generic; concrete DB backends decide how to use
+    these values. If you don't use URIs, backends may prefer host/port/name.
+    """
+
+    uri: str | None = None
+    host: str | None = None
+    port: int | None = None
+    username: str | None = None
+    password: str | None = None
+    name: str | None = None  # database name
+    # Free-form extras to support backend-specific options without coupling
+    extras: dict[str, str] | None = None
+
+
+def load_db_settings(prefix: str = "DATABANK_DB_") -> DBSettings:
+    """Load DB settings from environment variables using a prefix.
+
+    Known keys (after prefix): URI, HOST, PORT, USERNAME, PASSWORD, NAME.
+    Any variables prefixed with `EXTRA_` will be stored under `extras`.
+
+    Args:
+        prefix: The environment variable prefix, default `DATABANK_DB_`.
+
+    Returns:
+        A `DBSettings` instance populated from environment variables.
+    """
+
+    def getenv(key: str) -> str | None:
+        return os.getenv(prefix + key) or None
+
+    extras: dict[str, str] = {}
+    # Collect EXTRA_* variables generically
+    for key, value in os.environ.items():
+        if key.startswith(prefix + "EXTRA_") and value:
+            extras[key[len(prefix + "EXTRA_") :]] = value
+
+    port_val = getenv("PORT")
+    return DBSettings(
+        uri=getenv("URI"),
+        host=getenv("HOST"),
+        port=int(port_val) if port_val and port_val.isdigit() else None,
+        username=getenv("USERNAME"),
+        password=getenv("PASSWORD"),
+        name=getenv("NAME"),
+        extras=extras or None,
+    )
+
+
+def settings_to_options(settings: DBSettings) -> dict[str, object]:
+    """Convert `DBSettings` into a generic options dict for `BaseDB.configure()`.
+
+    This function keeps keys generic and optional; backends may pick the fields
+    that are relevant to them.
+
+    Args:
+        settings: A `DBSettings` instance.
+
+    Returns:
+        A dict suitable for `BaseDB.configure(**options)`.
+    """
+    opts: dict[str, object] = {}
+    if settings.uri is not None:
+        opts["uri"] = settings.uri
+    if settings.host is not None:
+        opts["host"] = settings.host
+    if settings.port is not None:
+        opts["port"] = settings.port
+    if settings.username is not None:
+        opts["username"] = settings.username
+    if settings.password is not None:
+        opts["password"] = settings.password
+    if settings.name is not None:
+        opts["name"] = settings.name
+    if settings.extras:
+        # Merge extras but do not overwrite existing keys
+        for k, v in settings.extras.items():
+            opts.setdefault(k, v)
+    return opts
+
+
+def merge_options(
+    base: Mapping[str, object] | None, extra: Mapping[str, object] | None
+) -> dict[str, object]:
+    """Merge two option mappings into a new dict (right-biased on conflicts)."""
+    out: dict[str, object] = {}
+    if base:
+        out.update(base)
+    if extra:
+        out.update(extra)
+    return out

+ 12 - 1
src/databank/db/__init__.py

@@ -1 +1,12 @@
-"""Database abstractions."""
+"""Database abstractions and concrete optional backends."""
+
+from __future__ import annotations
+
+# Optional concrete backend (requires pymongo)
+try:  # pragma: no cover - optional dependency
+    from .mongo import MongoDB  # noqa: F401
+except ImportError:  # pragma: no cover
+    # Keep package importable even if pymongo is not installed.
+    MongoDB = None  # type: ignore[assignment]
+
+__all__ = ["MongoDB"]

+ 211 - 0
src/databank/db/mongo.py

@@ -0,0 +1,211 @@
+"""Concrete MongoDB backend implementing `BaseDB` for local reads/writes.
+
+This implementation keeps configuration simple and relies on options provided
+via `BaseDB.configure(**options)` or constructor `**options`:
+
+Options:
+- uri: Full MongoDB connection URI (preferred). Default: mongodb://localhost:27017
+- name: Database name. Default: databank
+- host: Host (used only if `uri` is absent). Default: localhost
+- port: Port (used only if `uri` is absent). Default: 27017
+- username/password: Credentials to embed in URI if `uri` is absent.
+- indexes: Optional mapping specifying indexes to ensure on `ensure_indexes`.
+  Example:
+    {
+      "leagues": [
+        {"keys": [("league_id", 1)], "unique": True}
+      ],
+      "seasons": [
+        {"keys": [("season", 1)], "unique": True}
+      ]
+    }
+
+Note: This module requires `pymongo`. Install via:
+  python -m pip install "pymongo>=4.7"
+"""
+
+from __future__ import annotations
+
+from typing import Any, Iterable, Mapping, Optional
+
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.errors import PyMongoError
+
+from .base import BaseDB, ConnectError, InsertError
+from ..core.models import Document
+
+
+class MongoDB(BaseDB):
+    """MongoDB implementation of `BaseDB`.
+
+    This class supports simple inserts (with optional upsert by `_id`) and
+    convenience read helpers (`find`, `find_one`). It is designed to be used
+    with local MongoDB by default but works with any URI.
+    """
+
+    def __init__(self, *, logger=None, **options: object) -> None:
+        """Initialize with optional logger and options (see module docstring)."""
+        super().__init__(logger=logger, **options)
+        self._client: Optional[MongoClient] = None
+        self._db: Optional[Database] = None
+
+    # ---- BaseDB required methods ----
+    def connect(self) -> None:
+        """Establish a connection to MongoDB and select the database.
+
+        Raises:
+            ConnectError: If the connection fails or database is not resolvable.
+        """
+        try:
+            uri = self._build_uri()
+            name = str(self.options.get("name") or "databank")
+            self._client = MongoClient(uri)
+            # Validate connection
+            self._client.admin.command("ping")
+            self._db = self._client[name]
+            self.on_connect()
+            if self.logger:
+                self.logger.debug("MongoDB connected: uri=%s db=%s", uri, name)
+        except PyMongoError as exc:  # pragma: no cover - environment dependent
+            raise ConnectError(f"Failed to connect to MongoDB: {exc}") from exc
+
+    def ensure_indexes(self) -> None:
+        """Create indexes defined in `options[\"indexes\"]` if provided.
+
+        The expected format is a mapping of collection name to a list of index
+        specs. Each index spec is a dict with keys:
+          - keys: list[tuple[str, int]] (e.g., [("league_id", ASCENDING)])
+          - unique: bool (optional)
+          - name: str (optional)
+        """
+        if self._db is None:
+            return
+        indexes = self.options.get("indexes")
+        if not isinstance(indexes, Mapping):
+            return
+        for coll_name, specs in indexes.items():
+            if not isinstance(specs, Iterable):
+                continue
+            coll = self._db[coll_name]
+            for spec in specs:
+                if not isinstance(spec, Mapping) or "keys" not in spec:
+                    continue
+                keys = list(spec["keys"])  # type: ignore[assignment]
+                unique = bool(spec.get("unique", False))
+                name = spec.get("name")
+                coll.create_index(keys, unique=unique, name=name)
+
+    def insert_many(self, docs: Iterable[Document]) -> int:
+        """Insert or upsert documents into collections named by `doc.kind`.
+
+        - If `doc.id` is present, performs a replace-one with upsert keyed by `_id`.
+        - Otherwise, inserts the document (non-upsert) letting MongoDB assign `_id`.
+
+        Returns:
+            Count of inserted/updated documents.
+
+        Raises:
+            InsertError: On any underlying PyMongo failure.
+        """
+        if self._db is None:
+            raise InsertError("Database not connected. Call connect() first.")
+        count = 0
+        try:
+            for doc in docs:
+                coll = self._collection_for(doc)
+                payload: dict[str, Any] = dict(doc.data)
+                payload.setdefault("created_at", doc.created_at)
+                if doc.id:
+                    payload["_id"] = doc.id
+                    res = coll.replace_one({"_id": doc.id}, payload, upsert=True)
+                    # Count insert or actual modification as success
+                    count += (
+                        1 if (res.matched_count == 0 or res.modified_count == 1) else 0
+                    )
+                else:
+                    coll.insert_one(payload)
+                    count += 1
+            self.after_insert([], count)
+            return count
+        except PyMongoError as exc:  # pragma: no cover - environment dependent
+            raise InsertError(f"Failed to insert documents: {exc}") from exc
+
+    def close(self) -> None:
+        """Close the MongoDB client and release resources."""
+        try:
+            if self._client is not None:
+                self._client.close()
+                if self.logger:
+                    self.logger.debug("MongoDB client closed")
+        finally:
+            self._client = None
+            self._db = None
+            self.on_close()
+
+    # ---- Convenience read helpers (not part of BaseDB interface) ----
+    def find(
+        self,
+        collection: str,
+        query: Optional[Mapping[str, Any]] = None,
+        projection: Optional[Mapping[str, int]] = None,
+        limit: Optional[int] = None,
+    ) -> list[dict[str, Any]]:
+        """Find documents in a collection and return them as a list.
+
+        Args:
+            collection: Collection name to read from.
+            query: MongoDB filter dict.
+            projection: Fields to include/exclude (MongoDB projection dict).
+            limit: Optional max number of documents to return.
+
+        Returns:
+            A list of raw MongoDB documents (dicts).
+        """
+        self._require_db()
+        coll = self._db[collection]  # type: ignore[index]
+        cursor = coll.find(filter=query or {}, projection=projection)
+        if limit:
+            cursor = cursor.limit(int(limit))
+        return list(cursor)
+
+    def find_one(
+        self,
+        collection: str,
+        query: Optional[Mapping[str, Any]] = None,
+        projection: Optional[Mapping[str, int]] = None,
+    ) -> Optional[dict[str, Any]]:
+        """Find a single document matching the query or return None."""
+        self._require_db()
+        coll = self._db[collection]  # type: ignore[index]
+        return coll.find_one(filter=query or {}, projection=projection)
+
+    # ---- Internals ----
+    def _require_db(self) -> None:
+        """Ensure `connect()` has been called and DB is available."""
+        if self._db is None:
+            raise ConnectError("Database not connected. Call connect() first.")
+
+    def _collection_for(self, doc: Document) -> Collection:
+        """Resolve the collection for a document based on its `kind`."""
+        self._require_db()
+        name = doc.kind or "documents"
+        return self._db[name]  # type: ignore[index]
+
+    def _build_uri(self) -> str:
+        """Build a MongoDB URI from options if `uri` not explicitly provided."""
+        uri = self.options.get("uri")
+        if isinstance(uri, str) and uri:
+            return uri
+        host = str(self.options.get("host") or "localhost")
+        port_val = self.options.get("port")
+        if isinstance(port_val, (int, str)):
+            port = int(port_val)
+        else:
+            port = 27017
+        username = self.options.get("username")
+        password = self.options.get("password")
+        if username and password:
+            return f"mongodb://{username}:{password}@{host}:{port}"
+        return f"mongodb://{host}:{port}"

+ 14 - 0
src/databank/examples/seed_leagues_mongo.py

@@ -0,0 +1,14 @@
+"""This example is a stub to guide usage.
+
+Use the executable script instead:
+  python scripts/seed_leagues_mongo.py
+"""
+
+from __future__ import annotations
+
+# Stub entry point: only prints where the real script lives.
+if __name__ == "__main__":  # pragma: no cover
+    print(
+        "This is a stub. Use the executable script instead:\n"
+        "  python scripts/seed_leagues_mongo.py"
+    )
+    # Exit status 0: running the stub is not treated as an error.
+    raise SystemExit(0)

+ 14 - 0
src/databank/examples/seed_seasons_mongo.py

@@ -0,0 +1,14 @@
+"""This example is a stub to guide usage.
+
+Use the executable script instead:
+  python scripts/seed_seasons_mongo.py
+"""
+
+from __future__ import annotations
+
+# Stub entry point: only prints where the real script lives.
+if __name__ == "__main__":  # pragma: no cover
+    print(
+        "This is a stub. Use the executable script instead:\n"
+        "  python scripts/seed_seasons_mongo.py"
+    )
+    # Exit status 0: running the stub is not treated as an error.
+    raise SystemExit(0)

+ 14 - 0
src/databank/examples/test_get_league_match_list.py

@@ -0,0 +1,14 @@
+"""This example is a stub to guide usage.
+
+Use the executable script instead:
+  python scripts/test_get_league_match_list.py
+"""
+
+from __future__ import annotations
+
+# Stub entry point: only prints where the real script lives.
+if __name__ == "__main__":  # pragma: no cover
+    print(
+        "This is a stub. Use the executable script instead:\n"
+        "  python scripts/test_get_league_match_list.py"
+    )
+    # Exit status 0: running the stub is not treated as an error.
+    raise SystemExit(0)

+ 212 - 0
src/databank/spiders/get_league_match_list.py

@@ -0,0 +1,212 @@
+"""Concrete spider: getLeagueMatchList.
+
+Endpoint:
+- URL: https://sport.ttyingqiu.com/sportdata/f?platform=web
+- Method: POST
+- Payload (JSON):
+  {
+    "leagueId": <league_id>,
+    "pageNo": 1,
+    "pageSize": 100,
+    "round": <round>,
+    "seasonFlag": 0,
+    "seasonName": <season>,
+    "apiName": "getLeagueMatchList"
+  }
+
+Behavior:
+- Scheduler should pass tasks (league_id, season, round). For local testing,
+  you can encode them into the URL token string as "<league_id>|<season>|<round>".
+- The spider will make at most the provided number of URLs; your test driver
+  should stop after 3.
+- Sleep a random 1~2 seconds between requests (implemented in before_fetch).
+- If fetch/parsing error occurs, return an error Document; otherwise return
+  matchList entries with groupName == "联赛" only.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+import time
+from typing import Any, Mapping
+
+try:  # Optional dependency; guide user to install if missing
+    import requests
+except ImportError:  # pragma: no cover
+    requests = None  # type: ignore[assignment]
+
+from .base import BaseSpider, BuildPayloadError, URL, Payload, Documents
+from ..core.models import Document
+
+
+class GetLeagueMatchListSpider(BaseSpider):
+    """Spider implementation for the getLeagueMatchList API."""
+
+    name = "getLeagueMatchList"
+    request_timeout_s = 15.0
+    default_headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Content-Type": "application/json;charset=UTF-8",
+        "User-Agent": "Mozilla/5.0 (compatible; DatabankSpider/0.1)",
+    }
+
+    endpoint: str = "https://sport.ttyingqiu.com/sportdata/f?platform=web"
+
+    def build_payload(self, url: URL) -> Payload:
+        """Build JSON payload from a URL token `league|season|round`.
+
+        The scheduler should ideally pass structured data; for testing we parse
+        a token string split by '|'.
+        """
+        try:
+            league_str, season, round_str = url.split("|")
+            league_id = int(league_str)
+            round_no = int(round_str)
+        except Exception as exc:  # pylint: disable=broad-except
+            raise BuildPayloadError(f"Invalid task token: {url}") from exc
+
+        payload = {
+            "leagueId": league_id,
+            "pageNo": 1,
+            "pageSize": 100,
+            "round": round_no,
+            "seasonFlag": 0,
+            "seasonName": season,
+            "apiName": self.name,
+        }
+        return payload
+
+    def before_fetch(self, url: URL, payload: Payload) -> None:  # pragma: no cover
+        """Sleep a random 1~2 seconds to respect rate limits."""
+        delay = random.uniform(1.0, 2.0)
+        time.sleep(delay)
+
+    def fetch(self, url: URL, payload: Payload) -> str:
+        """POST to the endpoint and return raw text.
+
+        On error, return a JSON-encoded error payload so `parse` can emit a
+        structured error document instead of raising.
+        """
+        if requests is None:  # pragma: no cover - guidance only
+            return json.dumps(
+                {
+                    "error": "missing_dependency",
+                    "detail": "requests is not installed. Run: pip install requests",
+                }
+            )
+        try:
+            timeout = float(self.request_timeout_s or 15.0)
+            headers: Mapping[str, str] = dict(self.default_headers or {})
+            resp = requests.post(
+                self.endpoint, headers=headers, json=dict(payload), timeout=timeout
+            )
+            resp.raise_for_status()
+            return resp.text
+        except Exception as exc:  # pylint: disable=broad-except
+            return json.dumps(
+                {
+                    "error": "fetch_error",
+                    "detail": str(exc),
+                }
+            )
+
+    def parse(self, url: URL, content: str, payload: Payload) -> Documents:
+        """Parse JSON, filter matchList by groupName == '联赛', return Documents."""
+        try:
+            data = json.loads(content)
+        except json.JSONDecodeError as exc:
+            # Return an error document for visibility instead of raising
+            return [
+                Document(
+                    id=None,
+                    kind="error",
+                    data={"token": url, "reason": "invalid_json", "detail": str(exc)},
+                )
+            ]
+
+        # If fetch reported an error, convert to error document directly
+        if isinstance(data, Mapping) and "error" in data:
+            return [
+                Document(
+                    id=None,
+                    kind="error",
+                    data={
+                        "token": url,
+                        "reason": str(data.get("error")),
+                        "detail": str(data.get("detail")),
+                        "payload": dict(payload),
+                    },
+                )
+            ]
+
+        # Extract matchList with defensive fallbacks
+        match_list: list[dict[str, Any]] = []
+        for path in (
+            ("result", "matchList"),
+            ("data", "matchList"),
+            ("matchList",),
+        ):
+            cur: Any = data
+            for key in path:
+                if isinstance(cur, Mapping) and key in cur:
+                    cur = cur[key]
+                else:
+                    cur = None
+                    break
+            if isinstance(cur, list):
+                match_list = cur
+                break
+
+        if not match_list:
+            # Return error document if API failed or schema unexpected
+            return [
+                Document(
+                    id=None,
+                    kind="error",
+                    data={
+                        "token": url,
+                        "reason": "no_match_list",
+                        "payload": dict(payload),
+                        "raw_keys": (
+                            list(data.keys())
+                            if isinstance(data, dict)
+                            else str(type(data))
+                        ),
+                    },
+                )
+            ]
+
+        # Filter by groupName == "联赛"
+        filtered = [
+            item
+            for item in match_list
+            if isinstance(item, Mapping) and item.get("groupName") == "联赛"
+        ]
+
+        docs: list[Document] = []
+        for item in filtered:
+            doc_id = (
+                str(item.get("matchId")) if item.get("matchId") is not None else None
+            )
+            docs.append(
+                Document(
+                    id=doc_id,
+                    kind="match",
+                    data={"token": url, "payload": dict(payload), "match": dict(item)},
+                )
+            )
+        return docs
+
+
+if __name__ == "__main__":  # pragma: no cover
+    # This module defines a spider class. It isn't meant to be executed directly.
+    # Use the packaged example instead:
+    #   python -m databank.examples.test_get_league_match_list
+    print(
+        "This file defines GetLeagueMatchListSpider.\n"
+        "Run the packaged example instead:\n"
+        "  python -m databank.examples.test_get_league_match_list\n"
+        "Or import the spider class in your scheduler to provide tasks."
+    )
+    raise SystemExit(0)