# run_scheduler_incremental.py (3.6 KB)
  1. """Run scheduler in incremental mode: only current/latest season per league."""
  2. from __future__ import annotations
  3. import os
  4. from typing import Dict, List
  5. from databank.db import MongoDB
  6. from databank.reporter.daily_file import DailyFileReporter
  7. from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  8. from databank.spiders.base import BaseSpider
  9. from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
  10. from databank.tasks.providers import league_matchlist_from_mongo
  11. from databank.analytics.simple_counts import PerTokenCounter
  12. essential_env = {
  13. "DATABANK_DB_URI": "mongodb://localhost:27017",
  14. "DATABANK_DB_NAME": "databank",
  15. }
  16. def main() -> None:
  17. """Entry point: run the scheduler in incremental mode (latest season only)."""
  18. uri = os.getenv("DATABANK_DB_URI", essential_env["DATABANK_DB_URI"])
  19. name = os.getenv("DATABANK_DB_NAME", essential_env["DATABANK_DB_NAME"])
  20. db = MongoDB(uri=uri, name=name)
  21. db.connect()
  22. # Spiders
  23. get_match = GetLeagueMatchListSpider()
  24. spiders: List[BaseSpider] = [get_match]
  25. # Reporters
  26. reporters = [DailyFileReporter(timezone="utc+8")]
  27. # Task providers wiring (incremental)
  28. provider = league_matchlist_from_mongo(mode="incremental")
  29. tasks_provider: Dict[BaseSpider, TaskProvider] = {get_match: provider}
  30. # Preflight: generate tasks once to validate seeds/config
  31. preview_tasks = provider(get_match, db)
  32. if not preview_tasks:
  33. print(
  34. "No tasks generated (incremental mode). "
  35. "Please ensure MongoDB has seeds in 'leagues' and 'seasons'."
  36. )
  37. print(
  38. "Try seeding first: python scripts/seed_leagues_mongo.py "
  39. "and python scripts/seed_seasons_mongo.py"
  40. )
  41. return
  42. else:
  43. print(
  44. f"Prepared {len(preview_tasks)} task(s) for incremental run "
  45. f"(showing up to 3):"
  46. )
  47. for t in preview_tasks[:3]:
  48. token = t.token() if hasattr(t, "token") else str(t)
  49. print(" -", token)
  50. # Analyzers
  51. analyzers = [PerTokenCounter()]
  52. scheduler = DatabankScheduler(
  53. db=db,
  54. spiders=spiders,
  55. reporters=reporters,
  56. task_providers=tasks_provider,
  57. analyzers=analyzers,
  58. )
  59. summary = scheduler.run_once()
  60. print("Scheduler(incremental) finished. Total persisted:", summary.total_docs)
  61. if summary.total_docs == 0:
  62. # Diagnostics: inspect returned docs to explain why nothing persisted
  63. docs = scheduler.get_last_docs() if hasattr(scheduler, "get_last_docs") else []
  64. kinds = {}
  65. for d in docs:
  66. kinds[d.kind] = kinds.get(d.kind, 0) + 1
  67. if kinds:
  68. print("Returned document kinds:", kinds)
  69. # Show first few error reasons if present
  70. errs = [d for d in docs if d.kind == "error"]
  71. if errs:
  72. preview = errs[:3]
  73. print("Sample errors (up to 3):")
  74. for e in preview:
  75. reason = e.data.get("reason") if isinstance(e.data, dict) else None
  76. detail = e.data.get("detail") if isinstance(e.data, dict) else None
  77. print(" -", reason, ":", detail)
  78. else:
  79. print(
  80. "Runner produced no documents. "
  81. "Check network/API accessibility and spider filters."
  82. )
  83. print(
  84. "Note: Spider filters keep only groupName='联赛' and "
  85. "elapsedTime='已完场', and skip future-dated matches."
  86. )
# Run the scheduler only when this file is executed as a script; importing
# the module does not call main().
if __name__ == "__main__":
    main()