|
|
@@ -1,35 +1,219 @@
|
|
|
-"""Dixon–Coles (DC) model analyzer skeleton for football scores.
|
|
|
+"""Dixon–Coles (DC) model analyzer for football scores (minimal V1).
|
|
|
|
|
|
Purpose:
|
|
|
-- Estimate attacking/defending strengths and home advantage via Poisson/DC likelihood.
|
|
|
-- Produce scoreline distribution and W/D/L probabilities per match.
|
|
|
+- Estimate attacking/defending strengths and home advantage via Poisson-style rates.
|
|
|
+- Produce parameters per group (league-season or global) and persist them.
|
|
|
"""
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
-from typing import Any
|
|
|
+from typing import Any, Dict, Iterable, Optional
|
|
|
+from collections import defaultdict
|
|
|
+import re
|
|
|
+
|
|
|
+from databank.core.models import Document
|
|
|
+from databank.db.base import BaseDB
|
|
|
|
|
|
from .base import AnalyticsBase
|
|
|
|
|
|
|
|
|
class DixonColesAnalyzer(AnalyticsBase):
    """Estimate per-team Poisson-style scoring-rate factors (baseline V1).

    This version performs no likelihood optimisation, time decay or
    Dixon–Coles low-score correction: every factor is the plain ratio of a
    team's average goals to the average of the group it is tallied in.
    """

    def compute(self, data: Any, **kwargs: Any) -> Any:  # noqa: D401
        """Estimate rate-based attack/defence factors per team and group.

        Only documents that yield a match dict with both team ids and both
        scores are counted; everything else is skipped silently.

        Args:
            data: Iterable of match-like docs. Each doc is expected to be a
                dict holding the match under ``"match"`` or
                ``"data" -> "match"``, and (for league/season grouping) a
                payload under ``"data" -> "payload"`` or ``"payload"``.
                A non-iterable ``data`` is treated as empty.
            **kwargs:
                group_by: ``"league_season"`` (default) groups by the
                    payload's ``leagueId`` and ``seasonName``/``season``;
                    any other value puts every match in one ``"global"``
                    group.
                max_goals (int, default 8): Not used by the estimation;
                    echoed back in the result unchanged.
                persist (bool, default True): Write one ``dc_params``
                    document per (group, team) when ``db`` is supplied.
                db (BaseDB, optional): Target for ``insert_many``.

        Returns:
            dict with keys ``"groups"`` (list of group keys), ``"params"``
            (``{group: {team_id: {factor_name: float}}}``),
            ``"matches_used"`` (number of matches counted) and
            ``"max_goals"``.
        """
        group_by = str(kwargs.get("group_by", "league_season"))
        max_goals = int(kwargs.get("max_goals", 8))
        persist = bool(kwargs.get("persist", True))
        db: Optional[BaseDB] = kwargs.get("db")
        # Compare against None so a DB object that happens to be falsy
        # (e.g. defines __len__ and is empty) is still written to.
        should_persist = persist and db is not None

        totals: Dict[str, Dict[str, int]] = defaultdict(
            lambda: {"matches": 0, "goals_home": 0, "goals_away": 0}
        )
        # Nested by group so each group's teams can be visited directly
        # instead of rescanning every (group, team) pair per group.
        per_team: Dict[str, Dict[str, Dict[str, int]]] = defaultdict(
            lambda: defaultdict(
                lambda: {
                    "home_played": 0,
                    "home_for": 0,
                    "home_against": 0,
                    "away_played": 0,
                    "away_for": 0,
                    "away_against": 0,
                }
            )
        )

        # Accumulate statistics in a single pass over the input.
        for doc in data if isinstance(data, Iterable) else ():
            match = _extract_match(doc)
            if match is None:
                continue
            home_goals = _extract_score(match, "home")
            away_goals = _extract_score(match, "away")
            if home_goals is None or away_goals is None:
                continue  # match not finished / score not available
            home = _extract_team_id(match, "home")
            away = _extract_team_id(match, "away")
            if not home or not away:
                continue

            if group_by == "league_season":
                league_id, season = _league_and_season(doc)
                group = f"{league_id or 'NA'}::{season or 'NA'}"
            else:
                group = "global"

            tally = totals[group]
            tally["matches"] += 1
            tally["goals_home"] += home_goals
            tally["goals_away"] += away_goals

            home_stats = per_team[group][home]
            home_stats["home_played"] += 1
            home_stats["home_for"] += home_goals
            home_stats["home_against"] += away_goals

            away_stats = per_team[group][away]
            away_stats["away_played"] += 1
            away_stats["away_for"] += away_goals
            away_stats["away_against"] += home_goals

        # Compute parameters per group.
        params: Dict[str, Dict[str, Dict[str, float]]] = {}
        docs: list[Document] = []
        for group, tally in totals.items():
            # A group only exists once a match was counted, so this is >= 1;
            # max() is kept as a cheap guard against division by zero.
            n_matches = max(1, tally["matches"])
            league_home_avg = tally["goals_home"] / n_matches
            league_away_avg = tally["goals_away"] / n_matches
            group_params = params[group] = {}
            for team_id, stats in per_team[group].items():
                home_played = stats["home_played"]
                away_played = stats["away_played"]
                # Defence factors compare goals conceded with what the
                # opposing side scores on average (away avg when the team
                # is at home, home avg when it is away).
                team_params = {
                    "attack_home": _rate_factor(
                        stats["home_for"], home_played, league_home_avg
                    ),
                    "attack_away": _rate_factor(
                        stats["away_for"], away_played, league_away_avg
                    ),
                    "defense_home": _rate_factor(
                        stats["home_against"], home_played, league_away_avg
                    ),
                    "defense_away": _rate_factor(
                        stats["away_against"], away_played, league_home_avg
                    ),
                    "league_home_avg": float(league_home_avg),
                    "league_away_avg": float(league_away_avg),
                }
                group_params[team_id] = team_params
                if should_persist:
                    docs.append(
                        Document(
                            id=f"{group}:{team_id}",
                            kind="dc_params",
                            data={
                                "group": group,
                                "team_id": team_id,
                                **team_params,
                            },
                        )
                    )

        if should_persist and docs:
            db.insert_many(docs)

        return {
            "groups": list(params.keys()),
            "params": params,
            "matches_used": sum(t["matches"] for t in totals.values()),
            "max_goals": max_goals,
        }


# Finds an "H:A" score pair inside a string, e.g. "2:1" or "FT 3 : 0".
_SCORE_PAIR_RE = re.compile(r"(\d+)\s*:\s*(\d+)")

# Candidate keys for a side's team id, in lookup order.
_TEAM_ID_KEYS = {
    "home": ("homeTeamId", "homeId", "hostTeamId", "hostId"),
    "away": ("awayTeamId", "awayId", "guestTeamId", "guestId"),
}

# Candidate keys for a side's goal count, in lookup order.
_SCORE_KEYS = {
    "home": ("homeScore", "homeGoals", "homeGoal", "hostScore"),
    "away": ("awayScore", "awayGoals", "awayGoal", "guestScore"),
}

# Guards the factor division when a group average is exactly zero.
_EPS = 1e-9


def _extract_match(doc: Any) -> Optional[dict]:
    """Return the match dict stored at ``match`` or ``data.match``, if any."""
    if not isinstance(doc, dict):
        return None
    nested = doc.get("data")
    # ``data`` may be missing, None or a non-dict; only descend into a dict.
    nested_match = nested.get("match") if isinstance(nested, dict) else None
    for candidate in (doc.get("match"), nested_match):
        if isinstance(candidate, dict) and candidate:
            return candidate
    return None


def _extract_team_id(match: dict, side: str) -> Optional[str]:
    """Return the team id for ``side`` ("home"/"away") as a string, or None."""
    for key in _TEAM_ID_KEYS[side]:
        value = match.get(key)
        # Skip only "absent" values; a numeric id of 0 is a real id and must
        # be treated the same way regardless of which key carries it.
        if value is not None and value != "":
            return str(value)
    return None


def _extract_score(match: dict, side: str) -> Optional[int]:
    """Return the goals scored by ``side`` ("home"/"away"), or None.

    Dedicated numeric keys are tried first; otherwise the last string in the
    ``score`` list that contains an ``H:A`` pair is used.
    """
    for key in _SCORE_KEYS[side]:
        raw = match.get(key)
        if raw is None:
            continue
        try:
            goals = int(raw)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int(float("inf")).
            continue
        if goals >= 0:  # a goal count cannot be negative
            return goals
    entries = match.get("score")
    if isinstance(entries, list):
        for entry in reversed(entries):
            if not isinstance(entry, str):
                continue
            found = _SCORE_PAIR_RE.search(entry)
            if found:
                return int(found.group(1 if side == "home" else 2))
    return None


def _league_and_season(doc: Any) -> tuple[Optional[str], Optional[str]]:
    """Return ``(leagueId, season)`` as strings from the doc's payload.

    The payload is taken from ``data.payload`` when that is a dict, else
    from the top-level ``payload``. Missing parts are returned as None.
    """
    if not isinstance(doc, dict):
        return (None, None)
    nested = doc.get("data")
    candidates = (
        nested.get("payload") if isinstance(nested, dict) else None,
        doc.get("payload"),
    )
    payload = next((c for c in candidates if isinstance(c, dict)), None)
    if payload is None:
        return (None, None)
    league_id = payload.get("leagueId")
    season = payload.get("seasonName") or payload.get("season")
    return (
        str(league_id) if league_id is not None else None,
        str(season) if season is not None else None,
    )


def _rate_factor(goals: int, played: int, league_avg: float) -> float:
    """Return goals-per-game relative to ``league_avg``.

    A team with no games on that side gets the neutral factor 1.0.
    """
    if played <= 0:
        return 1.0
    return float((goals / played) / (league_avg + _EPS))
|