test_get_league_match_list.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
"""Executable demo: run GetLeagueMatchListSpider for up to three requests.

Usage (PowerShell):
    python -m pip install requests pymongo
    $env:DATABANK_DB_URI = "mongodb://localhost:27017"
    $env:DATABANK_DB_NAME = "databank"
    python scripts/test_get_league_match_list.py
"""
  8. from __future__ import annotations
  9. import os
  10. from collections import defaultdict
  11. from datetime import UTC, datetime
  12. from time import perf_counter
  13. from databank.db import MongoDB
  14. from databank.db.base import InsertError
  15. from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  16. from databank.reporter.daily_file import DailyFileReporter
  17. from databank.core.models import RunSummary
  18. from databank.core.tasks import MatchListTask
  19. def pick_tokens(max_tokens: int = 3) -> list[MatchListTask]:
  20. """Build up to ``max_tokens`` structured tasks from MongoDB collections."""
  21. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  22. name = os.getenv("DATABANK_DB_NAME", "databank")
  23. db = MongoDB(uri=uri, name=name)
  24. db.connect()
  25. try:
  26. leagues = db.find("leagues", projection={"_id": 0}, limit=10)
  27. seasons = db.find("seasons", projection={"_id": 0}, limit=10)
  28. if not leagues:
  29. raise RuntimeError("No leagues found. Seed leagues first.")
  30. if not seasons:
  31. raise RuntimeError("No seasons found. Seed seasons first.")
  32. league = sorted(leagues, key=lambda x: x.get("league_id", 0))[0]
  33. max_round = int(league.get("max_round", 1))
  34. season_name = seasons[0]["season"]
  35. tasks: list[MatchListTask] = []
  36. rounds = list(range(1, max_round + 1))[:max_tokens]
  37. for r in rounds:
  38. tasks.append(
  39. MatchListTask(
  40. league_id=int(league["league_id"]),
  41. season=season_name,
  42. round_no=int(r),
  43. )
  44. )
  45. return tasks[:max_tokens]
  46. finally:
  47. db.close()
  48. def main() -> None:
  49. """Run the demo with reporter integration and print a compact summary."""
  50. spider = GetLeagueMatchListSpider()
  51. reporter = DailyFileReporter(timezone="utc+8")
  52. # Prepare summary for duration/error tracking
  53. summary = RunSummary()
  54. try:
  55. tasks = pick_tokens()
  56. except Exception as exc: # pylint: disable=broad-except
  57. # Record error and finalize summary
  58. reporter.notify_error(spider.name, f"pick_tokens failed: {exc}")
  59. summary.errors.append(str(exc))
  60. summary.finished_at = datetime.now(UTC)
  61. reporter.notify_summary(summary)
  62. print("pick_tokens failed:", exc)
  63. return
  64. reporter.notify_start(spider.name, [t.token() for t in tasks])
  65. # DB connection for persistence; success count is based on DB insert result
  66. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  67. name = os.getenv("DATABANK_DB_NAME", "databank")
  68. db = MongoDB(uri=uri, name=name)
  69. db.connect()
  70. all_docs = []
  71. parsed_success_total = 0
  72. persisted_total = 0
  73. error_messages: list[str] = []
  74. error_docs_total = 0
  75. total_time_s_accum = 0.0
  76. try:
  77. for task in tasks:
  78. t0 = perf_counter()
  79. docs = spider.run([task])
  80. dt = perf_counter() - t0
  81. total_time_s_accum += dt
  82. all_docs.extend(docs)
  83. url_success = [d for d in docs if d.kind != "error"]
  84. url_errors = [d for d in docs if d.kind == "error"]
  85. parsed_success_total += len(url_success)
  86. error_docs_total += len(url_errors)
  87. # Persist success docs and report using DB result
  88. if url_success:
  89. try:
  90. inserted = db.insert_many(url_success)
  91. except InsertError as exc:
  92. msg = f"insert_many failed: {exc}"
  93. reporter.notify_error(spider.name, msg)
  94. error_messages.append(msg)
  95. inserted = 0
  96. reporter.notify_success(spider.name, inserted)
  97. persisted_total += inserted
  98. # Report errors per doc
  99. for err in url_errors:
  100. reason = err.data.get("reason")
  101. detail = err.data.get("detail")
  102. msg = f"{reason}: {detail}" if detail else str(reason)
  103. reporter.notify_error(spider.name, msg)
  104. error_messages.append(msg)
  105. finally:
  106. db.close()
  107. total_time_s = float(total_time_s_accum)
  108. avg_time_s = (total_time_s / len(tasks)) if tasks else 0.0
  109. # Final summary
  110. summary.total_docs = persisted_total
  111. summary.per_spider[spider.name] = persisted_total
  112. # Attach concise metrics to summary.errors for visibility in the log
  113. summary.errors.append(
  114. f"metrics: attempted_urls={len(tasks)} parsed_success={parsed_success_total} "
  115. f"persisted={persisted_total} error_docs={error_docs_total} "
  116. f"url_time_total_s={total_time_s:.3f} url_time_avg_s={avg_time_s:.3f}"
  117. )
  118. summary.errors.extend(error_messages)
  119. summary.finished_at = datetime.now(UTC)
  120. reporter.notify_summary(summary)
  121. print(
  122. f"Fetched {len(all_docs)} documents, parsed_success={parsed_success_total}, "
  123. f"persisted={persisted_total}, error_docs={error_docs_total}, "
  124. f"url_time_total_s={total_time_s:.3f}, url_time_avg_s={avg_time_s:.3f}."
  125. )
  126. per_token = defaultdict(list)
  127. for d in all_docs:
  128. per_token[d.data.get("token", "unknown")].append(d)
  129. for token, items in per_token.items():
  130. print(f"Token: {token}, docs: {len(items)}")
  131. if items:
  132. print("Sample:", items[0].data.get("match") or items[0].data)
  133. if __name__ == "__main__":
  134. main()