run_scheduler.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. """Run the formal Databank scheduler orchestrating spiders, reporters, and analyzers.
  2. Usage (PowerShell):
  3. # Ensure deps
  4. # python -m pip install requests pymongo
  5. # Configure DB if needed
  6. # $env:DATABANK_DB_URI = "mongodb://localhost:27017"
  7. # $env:DATABANK_DB_NAME = "databank"
  8. python scripts/run_scheduler.py
  9. """
  10. from __future__ import annotations
  11. import os
  12. from typing import Dict, List
  13. from databank.db import MongoDB
  14. from databank.reporter.daily_file import DailyFileReporter
  15. from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  16. from databank.spiders.base import BaseSpider
  17. from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
  18. from databank.tasks.providers import league_matchlist_from_mongo
  19. from databank.analytics.simple_counts import PerTokenCounter
  20. def main() -> None:
  21. """Entry point that builds and runs the Databank scheduler once."""
  22. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  23. name = os.getenv("DATABANK_DB_NAME", "databank")
  24. db = MongoDB(uri=uri, name=name)
  25. db.connect()
  26. # Spiders
  27. get_match = GetLeagueMatchListSpider()
  28. spiders: List[BaseSpider] = [get_match]
  29. # Reporters
  30. reporters = [DailyFileReporter(timezone="utc+8")]
  31. # Task providers wiring (no caps in production)
  32. tasks_provider: Dict[BaseSpider, TaskProvider] = {
  33. get_match: league_matchlist_from_mongo(),
  34. }
  35. # Analyzers
  36. analyzers = [PerTokenCounter()]
  37. # Orchestrator
  38. scheduler = DatabankScheduler(
  39. db=db,
  40. spiders=spiders,
  41. reporters=reporters,
  42. task_providers=tasks_provider,
  43. analyzers=analyzers,
  44. interval_s=None, # set to seconds to loop
  45. )
  46. summary = scheduler.run_once()
  47. print("Scheduler finished. Total persisted:", summary.total_docs)
  48. if __name__ == "__main__":
  49. main()