test_get_league_match_list.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
"""Executable demo: run GetLeagueMatchListSpider for three requests.

Usage (PowerShell):
    python -m pip install requests pymongo
    $env:DATABANK_DB_URI = "mongodb://localhost:27017"
    $env:DATABANK_DB_NAME = "databank"
    python scripts/test_get_league_match_list.py
"""
from __future__ import annotations

import os
from collections import defaultdict
from datetime import UTC, datetime
from time import perf_counter

from databank.core.models import RunSummary
from databank.db import MongoDB
from databank.db.base import InsertError
from databank.reporter.daily_file import DailyFileReporter
from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  18. def pick_tokens(max_tokens: int = 3) -> list[str]:
  19. """Build up to ``max_tokens`` URL tokens from MongoDB collections."""
  20. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  21. name = os.getenv("DATABANK_DB_NAME", "databank")
  22. db = MongoDB(uri=uri, name=name)
  23. db.connect()
  24. try:
  25. leagues = db.find("leagues", projection={"_id": 0}, limit=10)
  26. seasons = db.find("seasons", projection={"_id": 0}, limit=10)
  27. if not leagues:
  28. raise RuntimeError("No leagues found. Seed leagues first.")
  29. if not seasons:
  30. raise RuntimeError("No seasons found. Seed seasons first.")
  31. league = sorted(leagues, key=lambda x: x.get("league_id", 0))[0]
  32. max_round = int(league.get("max_round", 1))
  33. season_name = seasons[0]["season"]
  34. tokens: list[str] = []
  35. rounds = list(range(1, max_round + 1))[:max_tokens]
  36. for r in rounds:
  37. tokens.append(f"{league['league_id']}|{season_name}|{r}")
  38. return tokens[:max_tokens]
  39. finally:
  40. db.close()
  41. def main() -> None:
  42. """Run the demo with reporter integration and print a compact summary."""
  43. spider = GetLeagueMatchListSpider()
  44. reporter = DailyFileReporter(timezone="utc+8")
  45. # Prepare summary for duration/error tracking
  46. summary = RunSummary()
  47. try:
  48. urls = pick_tokens()
  49. except Exception as exc: # pylint: disable=broad-except
  50. # Record error and finalize summary
  51. reporter.notify_error(spider.name, f"pick_tokens failed: {exc}")
  52. summary.errors.append(str(exc))
  53. summary.finished_at = datetime.now(UTC)
  54. reporter.notify_summary(summary)
  55. print("pick_tokens failed:", exc)
  56. return
  57. reporter.notify_start(spider.name, urls)
  58. # DB connection for persistence; success count is based on DB insert result
  59. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  60. name = os.getenv("DATABANK_DB_NAME", "databank")
  61. db = MongoDB(uri=uri, name=name)
  62. db.connect()
  63. all_docs = []
  64. parsed_success_total = 0
  65. persisted_total = 0
  66. error_messages: list[str] = []
  67. error_docs_total = 0
  68. total_time_s_accum = 0.0
  69. try:
  70. for url in urls:
  71. t0 = perf_counter()
  72. docs = spider.run([url])
  73. dt = perf_counter() - t0
  74. total_time_s_accum += dt
  75. all_docs.extend(docs)
  76. url_success = [d for d in docs if d.kind != "error"]
  77. url_errors = [d for d in docs if d.kind == "error"]
  78. parsed_success_total += len(url_success)
  79. error_docs_total += len(url_errors)
  80. # Persist success docs and report using DB result
  81. if url_success:
  82. try:
  83. inserted = db.insert_many(url_success)
  84. except InsertError as exc:
  85. msg = f"insert_many failed: {exc}"
  86. reporter.notify_error(spider.name, msg)
  87. error_messages.append(msg)
  88. inserted = 0
  89. reporter.notify_success(spider.name, inserted)
  90. persisted_total += inserted
  91. # Report errors per doc
  92. for err in url_errors:
  93. reason = err.data.get("reason")
  94. detail = err.data.get("detail")
  95. msg = f"{reason}: {detail}" if detail else str(reason)
  96. reporter.notify_error(spider.name, msg)
  97. error_messages.append(msg)
  98. finally:
  99. db.close()
  100. total_time_s = float(total_time_s_accum)
  101. avg_time_s = (total_time_s / len(urls)) if urls else 0.0
  102. # Final summary
  103. summary.total_docs = persisted_total
  104. summary.per_spider[spider.name] = persisted_total
  105. # Attach concise metrics to summary.errors for visibility in the log
  106. summary.errors.append(
  107. f"metrics: attempted_urls={len(urls)} parsed_success={parsed_success_total} "
  108. f"persisted={persisted_total} error_docs={error_docs_total} "
  109. f"url_time_total_s={total_time_s:.3f} url_time_avg_s={avg_time_s:.3f}"
  110. )
  111. summary.errors.extend(error_messages)
  112. summary.finished_at = datetime.now(UTC)
  113. reporter.notify_summary(summary)
  114. print(
  115. f"Fetched {len(all_docs)} documents, parsed_success={parsed_success_total}, "
  116. f"persisted={persisted_total}, error_docs={error_docs_total}, "
  117. f"url_time_total_s={total_time_s:.3f}, url_time_avg_s={avg_time_s:.3f}."
  118. )
  119. per_token = defaultdict(list)
  120. for d in all_docs:
  121. per_token[d.data.get("token", "unknown")].append(d)
  122. for token, items in per_token.items():
  123. print(f"Token: {token}, docs: {len(items)}")
  124. if items:
  125. print("Sample:", items[0].data.get("match") or items[0].data)
  126. if __name__ == "__main__":
  127. main()