"""Executable demo: run GetLeagueMatchListSpider for three requests. Usage (PowerShell): python -m pip install requests pymongo $env:DATABANK_DB_URI = "mongodb://localhost:27017" $env:DATABANK_DB_NAME = "databank" python scripts/test_get_league_match_list.py """ from __future__ import annotations import os from collections import defaultdict from datetime import UTC, datetime from time import perf_counter from databank.db import MongoDB from databank.db.base import InsertError from databank.spiders.get_league_match_list import GetLeagueMatchListSpider from databank.reporter.daily_file import DailyFileReporter from databank.core.models import RunSummary def pick_tokens(max_tokens: int = 3) -> list[str]: """Build up to ``max_tokens`` URL tokens from MongoDB collections.""" uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017") name = os.getenv("DATABANK_DB_NAME", "databank") db = MongoDB(uri=uri, name=name) db.connect() try: leagues = db.find("leagues", projection={"_id": 0}, limit=10) seasons = db.find("seasons", projection={"_id": 0}, limit=10) if not leagues: raise RuntimeError("No leagues found. Seed leagues first.") if not seasons: raise RuntimeError("No seasons found. Seed seasons first.") league = sorted(leagues, key=lambda x: x.get("league_id", 0))[0] max_round = int(league.get("max_round", 1)) season_name = seasons[0]["season"] tokens: list[str] = [] rounds = list(range(1, max_round + 1))[:max_tokens] for r in rounds: tokens.append(f"{league['league_id']}|{season_name}|{r}") return tokens[:max_tokens] finally: db.close() def main() -> None: """Run the demo with reporter integration and print a compact summary.""" spider = GetLeagueMatchListSpider() reporter = DailyFileReporter(timezone="utc+8") # Prepare summary for duration/error tracking summary = RunSummary() try: urls = pick_tokens() except Exception as exc: # pylint: disable=broad-except # Record error and finalize summary reporter.notify_error(spider.name, f"pick_tokens failed: {exc}") summary.errors.append(str(exc)) summary.finished_at = datetime.now(UTC) reporter.notify_summary(summary) print("pick_tokens failed:", exc) return reporter.notify_start(spider.name, urls) # DB connection for persistence; success count is based on DB insert result uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017") name = os.getenv("DATABANK_DB_NAME", "databank") db = MongoDB(uri=uri, name=name) db.connect() all_docs = [] parsed_success_total = 0 persisted_total = 0 error_messages: list[str] = [] error_docs_total = 0 total_time_s_accum = 0.0 try: for url in urls: t0 = perf_counter() docs = spider.run([url]) dt = perf_counter() - t0 total_time_s_accum += dt all_docs.extend(docs) url_success = [d for d in docs if d.kind != "error"] url_errors = [d for d in docs if d.kind == "error"] parsed_success_total += len(url_success) error_docs_total += len(url_errors) # Persist success docs and report using DB result if url_success: try: inserted = db.insert_many(url_success) except InsertError as exc: msg = f"insert_many failed: {exc}" reporter.notify_error(spider.name, msg) error_messages.append(msg) inserted = 0 reporter.notify_success(spider.name, inserted) persisted_total += inserted # Report errors per doc for err in url_errors: reason = err.data.get("reason") detail = err.data.get("detail") msg = f"{reason}: {detail}" if detail else str(reason) reporter.notify_error(spider.name, msg) error_messages.append(msg) finally: db.close() total_time_s = float(total_time_s_accum) avg_time_s = (total_time_s / len(urls)) if urls else 0.0 # 
Final summary summary.total_docs = persisted_total summary.per_spider[spider.name] = persisted_total # Attach concise metrics to summary.errors for visibility in the log summary.errors.append( f"metrics: attempted_urls={len(urls)} parsed_success={parsed_success_total} " f"persisted={persisted_total} error_docs={error_docs_total} " f"url_time_total_s={total_time_s:.3f} url_time_avg_s={avg_time_s:.3f}" ) summary.errors.extend(error_messages) summary.finished_at = datetime.now(UTC) reporter.notify_summary(summary) print( f"Fetched {len(all_docs)} documents, parsed_success={parsed_success_total}, " f"persisted={persisted_total}, error_docs={error_docs_total}, " f"url_time_total_s={total_time_s:.3f}, url_time_avg_s={avg_time_s:.3f}." ) per_token = defaultdict(list) for d in all_docs: per_token[d.data.get("token", "unknown")].append(d) for token, items in per_token.items(): print(f"Token: {token}, docs: {len(items)}") if items: print("Sample:", items[0].data.get("match") or items[0].data) if __name__ == "__main__": main()
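# ---------------------------------------------------------------------------
# Hedged sketch (not executed by this demo): pick_tokens() above requires the
# "leagues" and "seasons" collections to contain at least one document each.
# The schema below is an assumption based only on the fields this script
# reads (league_id, max_round, season); the collection contents and values
# shown are hypothetical placeholders. One way to seed them with plain
# pymongo (installed per the usage notes above):
#
#     from pymongo import MongoClient
#
#     client = MongoClient(os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017"))
#     seed_db = client[os.getenv("DATABANK_DB_NAME", "databank")]
#     seed_db["leagues"].insert_one({"league_id": 1, "max_round": 38})  # hypothetical values
#     seed_db["seasons"].insert_one({"season": "2023-2024"})            # hypothetical value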