run_scheduler_full.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. """Run scheduler in full mode: all seasons and rounds for available leagues."""
  2. from __future__ import annotations
  3. import os
  4. from typing import Dict, List
  5. from databank.db import MongoDB
  6. from databank.reporter.daily_file import DailyFileReporter
  7. from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  8. from databank.spiders.base import BaseSpider
  9. from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
  10. from databank.tasks.providers import league_matchlist_from_mongo
  11. from databank.analytics.simple_counts import PerTokenCounter
  12. essential_env = {
  13. "DATABANK_DB_URI": "mongodb://localhost:27017",
  14. "DATABANK_DB_NAME": "databank",
  15. }
  16. def main() -> None:
  17. """Entry point: run the scheduler in full mode (all seasons and rounds)."""
  18. uri = os.getenv("DATABANK_DB_URI", essential_env["DATABANK_DB_URI"])
  19. name = os.getenv("DATABANK_DB_NAME", essential_env["DATABANK_DB_NAME"])
  20. db = MongoDB(
  21. uri=uri,
  22. name=name,
  23. indexes={
  24. "match": [
  25. {
  26. "keys": [("match.matchId", 1)],
  27. "unique": True,
  28. "name": "uniq_match_matchId",
  29. }
  30. ]
  31. },
  32. )
  33. db.connect()
  34. db.ensure_indexes()
  35. # Spiders
  36. get_match = GetLeagueMatchListSpider()
  37. spiders: List[BaseSpider] = [get_match]
  38. # Reporters
  39. reporters = [DailyFileReporter(timezone="utc+8")]
  40. # Task providers wiring (full, no caps)
  41. tasks_provider: Dict[BaseSpider, TaskProvider] = {
  42. get_match: league_matchlist_from_mongo(mode="full"),
  43. }
  44. # Analyzers
  45. analyzers = [PerTokenCounter()]
  46. scheduler = DatabankScheduler(
  47. db=db,
  48. spiders=spiders,
  49. reporters=reporters,
  50. task_providers=tasks_provider,
  51. analyzers=analyzers,
  52. )
  53. summary = scheduler.run_once()
  54. print("Scheduler(full) finished. Total persisted:", summary.total_docs)
  55. if __name__ == "__main__":
  56. main()