| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
"""Run the formal Databank scheduler orchestrating spiders, reporters, and analyzers.

Usage (PowerShell):
    # Ensure dependencies are installed
    # python -m pip install requests pymongo
    # Configure the database if needed
    # $env:DATABANK_DB_URI = "mongodb://localhost:27017"
    # $env:DATABANK_DB_NAME = "databank"
    python scripts/run_scheduler.py
"""
- from __future__ import annotations
- import os
- from typing import Dict, List
- from databank.db import MongoDB
- from databank.reporter.daily_file import DailyFileReporter
- from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
- from databank.spiders.base import BaseSpider
- from databank.scheduler.orchestrator import DatabankScheduler, TaskProvider
- from databank.tasks.providers import league_matchlist_from_mongo
- from databank.analytics.simple_counts import PerTokenCounter
def main() -> None:
    """Wire up the Databank components and execute one scheduler pass.

    The MongoDB URI and database name are read from the ``DATABANK_DB_URI``
    and ``DATABANK_DB_NAME`` environment variables, falling back to
    ``mongodb://localhost:27017`` and ``databank``. After connecting, a
    single spider, reporter, task provider and analyzer are handed to
    ``DatabankScheduler``; ``run_once()`` is called and the ``total_docs``
    value of the returned summary is printed.
    """
    database = MongoDB(
        uri=os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017"),
        name=os.getenv("DATABANK_DB_NAME", "databank"),
    )
    database.connect()

    # The spider instance is also used as the key of the provider mapping.
    match_list_spider = GetLeagueMatchListSpider()
    active_spiders: List[BaseSpider] = [match_list_spider]

    active_reporters = [DailyFileReporter(timezone="utc+8")]

    # Task provider wiring (no caps in production).
    provider_by_spider: Dict[BaseSpider, TaskProvider] = {
        match_list_spider: league_matchlist_from_mongo(),
    }

    active_analyzers = [PerTokenCounter()]

    orchestrator = DatabankScheduler(
        db=database,
        spiders=active_spiders,
        reporters=active_reporters,
        task_providers=provider_by_spider,
        analyzers=active_analyzers,
        interval_s=None,  # original note: set to a number of seconds to loop
    )
    run_summary = orchestrator.run_once()
    print("Scheduler finished. Total persisted:", run_summary.total_docs)
# Run the scheduler only when this file is executed as a script.
if __name__ == "__main__":
    main()
|