run_scheduler.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
"""Run the formal Databank scheduler orchestrating spiders, reporters, and analyzers.

Usage (PowerShell):
    # Ensure deps
    # python -m pip install requests pymongo
    # Configure DB if needed
    # $env:DATABANK_DB_URI = "mongodb://localhost:27017"
    # $env:DATABANK_DB_NAME = "databank"
    python scripts/run_scheduler.py
"""
  10. from __future__ import annotations
  11. import os
  12. from typing import Dict, List
  13. from databank.db import MongoDB
  14. from databank.reporter.daily_file import DailyFileReporter
  15. from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
  16. from databank.spiders.base import BaseSpider
  17. from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
  18. from databank.tasks.providers import league_matchlist_from_mongo
  19. from databank.analytics.simple_counts import PerTokenCounter
  20. def main() -> None:
  21. """Entry point that builds and runs the Databank scheduler once."""
  22. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  23. name = os.getenv("DATABANK_DB_NAME", "databank")
  24. db = MongoDB(
  25. uri=uri,
  26. name=name,
  27. indexes={
  28. "match": [
  29. {
  30. "keys": [("match.matchId", 1)],
  31. "unique": True,
  32. "name": "uniq_match_matchId",
  33. }
  34. ]
  35. },
  36. )
  37. db.connect()
  38. db.ensure_indexes()
  39. # Spiders
  40. get_match = GetLeagueMatchListSpider()
  41. spiders: List[BaseSpider] = [get_match]
  42. # Reporters
  43. reporters = [DailyFileReporter(timezone="utc+8")]
  44. # Task providers wiring (no caps in production)
  45. tasks_provider: Dict[BaseSpider, TaskProvider] = {
  46. get_match: league_matchlist_from_mongo(),
  47. }
  48. # Analyzers
  49. analyzers = [PerTokenCounter()]
  50. # Orchestrator
  51. scheduler = DatabankScheduler(
  52. db=db,
  53. spiders=spiders,
  54. reporters=reporters,
  55. task_providers=tasks_provider,
  56. analyzers=analyzers,
  57. interval_s=None, # set to seconds to loop
  58. )
  59. summary = scheduler.run_once()
  60. print("Scheduler finished. Total persisted:", summary.total_docs)
# Run the scheduler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()