# run_scheduler_incremental.py (3.8 KB)
"""Run scheduler in incremental mode: only current/latest season per league."""
from __future__ import annotations

import os
from collections import Counter
from typing import Dict, List

from databank.analytics.simple_counts import PerTokenCounter
from databank.db import MongoDB
from databank.reporter.daily_file import DailyFileReporter
from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
from databank.spiders.base import BaseSpider
from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
from databank.tasks.providers import league_matchlist_from_mongo
  12. essential_env = {
  13. "DATABANK_DB_URI": "mongodb://localhost:27017",
  14. "DATABANK_DB_NAME": "databank",
  15. }
  16. def main() -> None:
  17. """Entry point: run the scheduler in incremental mode (latest season only)."""
  18. uri = os.getenv("DATABANK_DB_URI", essential_env["DATABANK_DB_URI"])
  19. name = os.getenv("DATABANK_DB_NAME", essential_env["DATABANK_DB_NAME"])
  20. db = MongoDB(
  21. uri=uri,
  22. name=name,
  23. indexes={
  24. "match": [
  25. {
  26. "keys": [("match.matchId", 1)],
  27. "unique": True,
  28. "name": "uniq_match_matchId",
  29. }
  30. ]
  31. },
  32. )
  33. db.connect()
  34. db.ensure_indexes()
  35. # Spiders
  36. get_match = GetLeagueMatchListSpider()
  37. spiders: List[BaseSpider] = [get_match]
  38. # Reporters
  39. reporters = [DailyFileReporter(timezone="utc+8")]
  40. # Task providers wiring (incremental)
  41. provider = league_matchlist_from_mongo(mode="incremental")
  42. tasks_provider: Dict[BaseSpider, TaskProvider] = {get_match: provider}
  43. # Preflight: generate tasks once to validate seeds/config
  44. preview_tasks = provider(get_match, db)
  45. if not preview_tasks:
  46. print(
  47. "No tasks generated (incremental mode). "
  48. "Please ensure MongoDB has seeds in 'leagues' and 'seasons'."
  49. )
  50. print(
  51. "Try seeding first: python scripts/seed_leagues_mongo.py "
  52. "and python scripts/seed_seasons_mongo.py"
  53. )
  54. return
  55. else:
  56. print(
  57. f"Prepared {len(preview_tasks)} task(s) for incremental run "
  58. f"(showing up to 3):"
  59. )
  60. for t in preview_tasks[:3]:
  61. token = t.token() if hasattr(t, "token") else str(t)
  62. print(" -", token)
  63. # Analyzers
  64. analyzers = [PerTokenCounter()]
  65. scheduler = DatabankScheduler(
  66. db=db,
  67. spiders=spiders,
  68. reporters=reporters,
  69. task_providers=tasks_provider,
  70. analyzers=analyzers,
  71. )
  72. summary = scheduler.run_once()
  73. print("Scheduler(incremental) finished. Total persisted:", summary.total_docs)
  74. if summary.total_docs == 0:
  75. # Diagnostics: inspect returned docs to explain why nothing persisted
  76. docs = scheduler.get_last_docs() if hasattr(scheduler, "get_last_docs") else []
  77. kinds = {}
  78. for d in docs:
  79. kinds[d.kind] = kinds.get(d.kind, 0) + 1
  80. if kinds:
  81. print("Returned document kinds:", kinds)
  82. # Show first few error reasons if present
  83. errs = [d for d in docs if d.kind == "error"]
  84. if errs:
  85. preview = errs[:3]
  86. print("Sample errors (up to 3):")
  87. for e in preview:
  88. reason = e.data.get("reason") if isinstance(e.data, dict) else None
  89. detail = e.data.get("detail") if isinstance(e.data, dict) else None
  90. print(" -", reason, ":", detail)
  91. else:
  92. print(
  93. "Runner produced no documents. "
  94. "Check network/API accessibility and spider filters."
  95. )
  96. print(
  97. "Note: Spider filters keep only groupName='联赛' and "
  98. "elapsedTime='已完场', and skip future-dated matches."
  99. )
  100. if __name__ == "__main__":
  101. main()