- """Executable demo: run GetLeagueMatchListSpider for three requests.
- Usage (PowerShell):
- python -m pip install requests pymongo
- $env:DATABANK_DB_URI = "mongodb://localhost:27017"
- $env:DATABANK_DB_NAME = "databank"
- python scripts/test_get_league_match_list.py
- """
from __future__ import annotations

import os
from collections import defaultdict
from datetime import UTC, datetime
from time import perf_counter

from databank.core.models import RunSummary
from databank.db import MongoDB
from databank.db.base import InsertError
from databank.reporter.daily_file import DailyFileReporter
from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
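# Note: ``datetime.UTC`` requires Python 3.11+; on older interpreters use
# ``datetime.timezone.utc`` instead.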

def pick_tokens(max_tokens: int = 3) -> list[str]:
    """Build up to ``max_tokens`` URL tokens from MongoDB collections."""
    uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
    name = os.getenv("DATABANK_DB_NAME", "databank")
    db = MongoDB(uri=uri, name=name)
    db.connect()
    try:
        leagues = db.find("leagues", projection={"_id": 0}, limit=10)
        seasons = db.find("seasons", projection={"_id": 0}, limit=10)
        if not leagues:
            raise RuntimeError("No leagues found. Seed leagues first.")
        if not seasons:
            raise RuntimeError("No seasons found. Seed seasons first.")
        # Deterministic pick: the league with the smallest league_id.
        league = sorted(leagues, key=lambda x: x.get("league_id", 0))[0]
        max_round = int(league.get("max_round", 1))
        season_name = seasons[0]["season"]
        tokens: list[str] = []
        # ``rounds`` is already capped at ``max_tokens``, so no further
        # slicing is needed on return.
        rounds = list(range(1, max_round + 1))[:max_tokens]
        for r in rounds:
            tokens.append(f"{league['league_id']}|{season_name}|{r}")
        return tokens
    finally:
        db.close()
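# Example token produced above (hypothetical values): "39|2023-2024|1",
# i.e. "<league_id>|<season>|<round>" as consumed by the spider.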

def main() -> None:
    """Run the demo with reporter integration and print a compact summary."""
    spider = GetLeagueMatchListSpider()
    reporter = DailyFileReporter(timezone="utc+8")
    # Prepare the summary up front for duration/error tracking.
    summary = RunSummary()
    try:
        urls = pick_tokens()
    except Exception as exc:  # pylint: disable=broad-except
        # Record the error and finalize the summary before exiting.
        reporter.notify_error(spider.name, f"pick_tokens failed: {exc}")
        summary.errors.append(str(exc))
        summary.finished_at = datetime.now(UTC)
        reporter.notify_summary(summary)
        print("pick_tokens failed:", exc)
        return

    reporter.notify_start(spider.name, urls)
    # DB connection for persistence; the success count reported below is
    # based on the DB insert result, not on parsed documents alone.
    uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
    name = os.getenv("DATABANK_DB_NAME", "databank")
    db = MongoDB(uri=uri, name=name)
    db.connect()

    all_docs = []
    parsed_success_total = 0
    persisted_total = 0
    error_messages: list[str] = []
    error_docs_total = 0
    total_time_s_accum = 0.0
    try:
        for url in urls:
            # Time each spider.run() call individually.
            t0 = perf_counter()
            docs = spider.run([url])
            dt = perf_counter() - t0
            total_time_s_accum += dt
            all_docs.extend(docs)

            url_success = [d for d in docs if d.kind != "error"]
            url_errors = [d for d in docs if d.kind == "error"]
            parsed_success_total += len(url_success)
            error_docs_total += len(url_errors)

            # Persist success docs and report using the DB result. The
            # success notification stays inside this block so ``inserted``
            # is always bound before use.
            if url_success:
                try:
                    inserted = db.insert_many(url_success)
                except InsertError as exc:
                    msg = f"insert_many failed: {exc}"
                    reporter.notify_error(spider.name, msg)
                    error_messages.append(msg)
                    inserted = 0
                reporter.notify_success(spider.name, inserted)
                persisted_total += inserted

            # Report each error document individually.
            for err in url_errors:
                reason = err.data.get("reason")
                detail = err.data.get("detail")
                msg = f"{reason}: {detail}" if detail else str(reason)
                reporter.notify_error(spider.name, msg)
                error_messages.append(msg)
    finally:
        db.close()

    total_time_s = float(total_time_s_accum)
    avg_time_s = (total_time_s / len(urls)) if urls else 0.0
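    # Note: the url_time_* metrics cover only the spider.run() calls; time
    # spent in DB inserts and reporting is excluded.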

    # Final summary.
    summary.total_docs = persisted_total
    summary.per_spider[spider.name] = persisted_total
    # Attach concise metrics to summary.errors for visibility in the log.
    summary.errors.append(
        f"metrics: attempted_urls={len(urls)} parsed_success={parsed_success_total} "
        f"persisted={persisted_total} error_docs={error_docs_total} "
        f"url_time_total_s={total_time_s:.3f} url_time_avg_s={avg_time_s:.3f}"
    )
    summary.errors.extend(error_messages)
    summary.finished_at = datetime.now(UTC)
    reporter.notify_summary(summary)

    print(
        f"Fetched {len(all_docs)} documents, parsed_success={parsed_success_total}, "
        f"persisted={persisted_total}, error_docs={error_docs_total}, "
        f"url_time_total_s={total_time_s:.3f}, url_time_avg_s={avg_time_s:.3f}."
    )
    # Group fetched docs by token for a per-request breakdown.
    per_token = defaultdict(list)
    for d in all_docs:
        per_token[d.data.get("token", "unknown")].append(d)
    for token, items in per_token.items():
        print(f"Token: {token}, docs: {len(items)}")
        if items:
            print("Sample:", items[0].data.get("match") or items[0].data)

if __name__ == "__main__":
    main()