run_analyzers.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. """Run selected analyzers over existing DB data or last run results.
  2. This is a scaffold to wire analyzers end-to-end once implementations are ready.
  3. """
  4. from __future__ import annotations
  5. import os
  6. import sys
  7. import pathlib
  8. from typing import Iterable, List
  9. import importlib
  10. # Ensure 'src' is on sys.path when running from repo root or scripts dir
  11. _SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
  12. _ROOT = _SCRIPT_DIR.parent
  13. _SRC = _ROOT / "src"
  14. if _SRC.exists() and str(_SRC) not in sys.path:
  15. sys.path.insert(0, str(_SRC))
  16. def _ensure_src_on_path() -> None:
  17. """Put repository src on sys.path if available (no-op if already present)."""
  18. script_dir = pathlib.Path(__file__).resolve().parent
  19. root = script_dir.parent
  20. src = root / "src"
  21. if src.exists():
  22. sys.path.insert(0, str(src))
  23. def load_data(db, limit: int | None = None) -> List[dict]:
  24. """Load match documents from DB as analyzer inputs (scaffold)."""
  25. # NOTE: Adjust projection to include what analyzers need.
  26. return db.find("match", projection={"_id": 0}, limit=limit)
  27. def _safe_count(db, kind: str) -> int:
  28. """Best-effort count using find; avoids driver-specific count API."""
  29. try:
  30. docs = db.find(kind, projection={"_id": 1}, limit=None)
  31. return len(list(docs)) if docs is not None else 0
  32. except (RuntimeError, ValueError, TypeError): # diagnostics only
  33. return 0
  34. def main() -> None:
  35. """Build analyzers and run them on existing data (skeleton)."""
  36. # Ensure imports resolve regardless of cwd, then import dynamically
  37. _ensure_src_on_path()
  38. db_mod = importlib.import_module("databank.db")
  39. teams_mod = importlib.import_module("databank.analytics.teams")
  40. elo_mod = importlib.import_module("databank.analytics.elo")
  41. dc_mod = importlib.import_module("databank.analytics.dixon_coles")
  42. mc_mod = importlib.import_module("databank.analytics.markov_chain")
  43. h2h_mod = importlib.import_module("databank.analytics.h2h")
  44. calib_mod = importlib.import_module("databank.analytics.calibration")
  45. season_mc_mod = importlib.import_module("databank.analytics.monte_carlo")
  46. sos_mod = importlib.import_module("databank.analytics.sos")
  47. mongodb_cls = getattr(db_mod, "MongoDB")
  48. team_extractor_cls = getattr(teams_mod, "TeamExtractorAnalyzer")
  49. elo_analyzer_cls = getattr(elo_mod, "EloAnalyzer")
  50. dixon_coles_cls = getattr(dc_mod, "DixonColesAnalyzer")
  51. markov_chain_cls = getattr(mc_mod, "MarkovChainAnalyzer")
  52. h2h_cls = getattr(h2h_mod, "H2HAnalyzer")
  53. calibration_cls = getattr(calib_mod, "CalibrationAnalyzer")
  54. season_mc_cls = getattr(season_mc_mod, "SeasonMonteCarloAnalyzer")
  55. sos_cls = getattr(sos_mod, "StrengthOfScheduleAnalyzer")
  56. uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
  57. name = os.getenv("DATABANK_DB_NAME", "databank")
  58. db = mongodb_cls(uri=uri, name=name)
  59. db.connect()
  60. data = load_data(db)
  61. print(f"Loaded matches: {len(data) if data is not None else 0}")
  62. analyzers: Iterable = [
  63. team_extractor_cls(),
  64. elo_analyzer_cls(),
  65. dixon_coles_cls(),
  66. markov_chain_cls(),
  67. h2h_cls(),
  68. calibration_cls(),
  69. sos_cls(),
  70. season_mc_cls(),
  71. ]
  72. for analyzer in analyzers:
  73. print(f"Running analyzer: {analyzer.__class__.__name__}")
  74. try:
  75. analyzer.prepare(data)
  76. analyzer.validate(data)
  77. transformed = analyzer.transform(data)
  78. result = analyzer.compute(transformed, db=db, persist=True)
  79. analyzer.finalize(result)
  80. print(f" -> Done: {analyzer.__class__.__name__}")
  81. # Diagnostics: show where data is persisted for Elo
  82. if isinstance(analyzer, elo_analyzer_cls):
  83. ratings_cnt = _safe_count(db, "elo_ratings")
  84. history_cnt = _safe_count(db, "ratings_history")
  85. print(
  86. " Elo persisted to collections:",
  87. f"elo_ratings={ratings_cnt}",
  88. f"ratings_history={history_cnt}",
  89. )
  90. try:
  91. processed = (
  92. result.get("processed") if isinstance(result, dict) else None
  93. )
  94. print(f" Elo processed matches: {processed}")
  95. except (RuntimeError, ValueError, TypeError): # diagnostics only
  96. pass
  97. except NotImplementedError as exc:
  98. print(f" -> Skipped (not implemented): {exc}")
  99. except (RuntimeError, ValueError) as exc: # pragma: no cover - diagnostics only
  100. print(f" -> Error: {type(exc).__name__}: {exc}")
  101. if __name__ == "__main__":
  102. main()