"""Run the formal Databank scheduler orchestrating spiders, reporters, and analyzers. Usage (PowerShell): # Ensure deps # python -m pip install requests pymongo # Configure DB if needed # $env:DATABANK_DB_URI = "mongodb://localhost:27017" # $env:DATABANK_DB_NAME = "databank" python scripts/run_scheduler.py """ from __future__ import annotations import os from typing import Dict, List from databank.db import MongoDB from databank.reporter.daily_file import DailyFileReporter from databank.spiders.get_league_match_list import GetLeagueMatchListSpider from databank.spiders.base import BaseSpider from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider from databank.tasks.providers import league_matchlist_from_mongo from databank.analytics.simple_counts import PerTokenCounter def main() -> None: """Entry point that builds and runs the Databank scheduler once.""" uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017") name = os.getenv("DATABANK_DB_NAME", "databank") db = MongoDB( uri=uri, name=name, indexes={ "match": [ { "keys": [("match.matchId", 1)], "unique": True, "name": "uniq_match_matchId", } ] }, ) db.connect() db.ensure_indexes() # Spiders get_match = GetLeagueMatchListSpider() spiders: List[BaseSpider] = [get_match] # Reporters reporters = [DailyFileReporter(timezone="utc+8")] # Task providers wiring (no caps in production) tasks_provider: Dict[BaseSpider, TaskProvider] = { get_match: league_matchlist_from_mongo(), } # Analyzers analyzers = [PerTokenCounter()] # Orchestrator scheduler = DatabankScheduler( db=db, spiders=spiders, reporters=reporters, task_providers=tasks_provider, analyzers=analyzers, interval_s=None, # set to seconds to loop ) summary = scheduler.run_once() print("Scheduler finished. Total persisted:", summary.total_docs) if __name__ == "__main__": main()