3 mesi fa · cba1c818da
--- a/src/databank/analytics/dixon_coles.py
+++ b/src/databank/analytics/dixon_coles.py
@@ -1,35 +1,219 @@
 
				-"""Dixon–Coles (DC) model analyzer skeleton for football scores.
			
 
				+"""Dixon–Coles (DC) model analyzer for football scores (minimal V1).
			
 
				 
			
 
				 Purpose:
			
 
				-- Estimate attacking/defending strengths and home advantage via Poisson/DC likelihood.
			
 
				-- Produce scoreline distribution and W/D/L probabilities per match.
			
 
				+- Estimate attacking/defending strengths and home advantage via Poisson-style rates.
			
 
				+- Produce parameters per group (league-season or global) and persist them.
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
 
				 
			
 
				-from typing import Any
			
 
				+from typing import Any, Dict, Iterable, Optional
			
 
				+from collections import defaultdict
			
 
				+import re
			
 
				+
			
 
				+from databank.core.models import Document
			
 
				+from databank.db.base import BaseDB
			
 
				 
			
 
				 from .base import AnalyticsBase
			
 
				 
			
 
				 
			
 
				 class DixonColesAnalyzer(AnalyticsBase):
			
 
				-    """Estimate Poisson/DC parameters and infer probabilities.
			
 
				+    """Estimate per-team Poisson-style rate parameters (baseline)."""
			
 
				 
			
 
				-    This is a scaffold. Implementation steps usually include:
			
 
				-    - Build likelihood over historical matches with time decay.
			
 
				-    - Optimize attack/defense params per team and a home advantage term.
			
 
				-    - Apply DC correlation adjustment for low-score outcomes.
			
 
				-    - Infer scoreline and aggregate to W/D/L probabilities.
			
 
				-    """
			
 
				+    def compute(self, data: Any, **kwargs: Any) -> Any:  # noqa: D401
			
 
				+        """Estimate per-team Poisson-style rate factors for each group.
			
 
				 
			
 
				-    def compute(self, data: Any, **kwargs: Any) -> Any:
			
 
				-        """Fit/estimate DC parameters and produce probabilities.
			
 
				+        Minimal V1 without numerical optimization: rate-based factors.
			
 
				 
			
 
				         Args:
			
 
				-                data: Iterable of match-like docs.
			
 
				-                **kwargs: decay, regularization, max_goals, db, preview_only.
			
 
				+            data: Iterable of match-like docs (same shape as Elo input).
			
 
				+            **kwargs: group_by ("league_season"|"global"), max_goals (int),
			
 
				+                persist (bool), db (BaseDB). max_goals is only echoed back.
			
 
				 
			
 
				         Returns:
			
 
				-                A dict with parameters and/or per-match probability outputs.
			
 
				+            dict with per-group parameters summary. If persist=True and db is set, writes
			
 
				+            documents to 'dc_params'.
			
 
				         """
			
 
				-        raise NotImplementedError("Dixon–Coles model not implemented yet")
			
 
				+        group_by = str(kwargs.get("group_by", "league_season"))
			
 
				+        max_goals = int(kwargs.get("max_goals", 8))
			
 
				+        persist = bool(kwargs.get("persist", True))
			
 
				+        db: Optional[BaseDB] = kwargs.get("db")
			
 
				+
			
 
				+        # Helpers
			
 
				+        def _get_team(match: dict, side: str) -> Optional[str]:
			
 
				+            id_val = match.get(f"{side}TeamId") or match.get(f"{side}Id")
			
 
				+            if side == "home":
			
 
				+                id_val = id_val or match.get("hostTeamId") or match.get("hostId")
			
 
				+            else:
			
 
				+                id_val = (
			
 
				+                    id_val
			
 
				+                    or match.get("awayTeamId")
			
 
				+                    or match.get("guestTeamId")
			
 
				+                    or match.get("guestId")
			
 
				+                )
			
 
				+            return str(id_val) if id_val is not None else None
			
 
				+
			
 
				+        def _get_score(match: dict, side: str) -> Optional[int]:
			
 
				+            keys = [
			
 
				+                f"{side}Score",
			
 
				+                f"{side}Goals",
			
 
				+                f"{side}Goal",
			
 
				+                f"{'hostScore' if side=='home' else 'guestScore'}",
			
 
				+            ]
			
 
				+            for k in keys:
			
 
				+                if k in match and match[k] is not None:
			
 
				+                    try:
			
 
				+                        return int(match[k])
			
 
				+                    except (ValueError, TypeError):
			
 
				+                        continue
			
 
				+            sc = match.get("score")
			
 
				+            if isinstance(sc, list):
			
 
				+                for s in reversed(sc):
			
 
				+                    if not isinstance(s, str):
			
 
				+                        continue
			
 
				+                    s2 = s.strip()
			
 
				+                    m = re.search(r"(\d+)\s*:\s*(\d+)", s2)
			
 
				+                    if m:
			
 
				+                        try:
			
 
				+                            h = int(m.group(1))
			
 
				+                            a = int(m.group(2))
			
 
				+                            return h if side == "home" else a
			
 
				+                        except (ValueError, TypeError):
			
 
				+                            pass
			
 
				+            return None
			
 
				+
			
 
				+        def _context(doc: dict) -> tuple[Optional[str], Optional[str]]:
			
 
				+            payload = None
			
 
				+            if isinstance(doc, dict):
			
 
				+                data_dict = (
			
 
				+                    doc.get("data") if isinstance(doc.get("data"), dict) else None
			
 
				+                )
			
 
				+                if data_dict:
			
 
				+                    payload = data_dict.get("payload")
			
 
				+                if payload is None and isinstance(doc.get("payload"), dict):
			
 
				+                    payload = doc.get("payload")
			
 
				+            if isinstance(payload, dict):
			
 
				+                league_id = payload.get("leagueId")
			
 
				+                season = payload.get("seasonName") or payload.get("season")
			
 
				+                lid = str(league_id) if league_id is not None else None
			
 
				+                s = str(season) if season is not None else None
			
 
				+                return (lid, s)
			
 
				+            return (None, None)
			
 
				+
			
 
				+        def _group_key(doc: dict) -> str:
			
 
				+            if group_by == "league_season":
			
 
				+                lid, s = _context(doc)
			
 
				+                return f"{lid or 'NA'}::{s or 'NA'}"
			
 
				+            return "global"
			
 
				+
			
 
				+        # Accumulate statistics
			
 
				+        totals = defaultdict(
			
 
				+            lambda: {
			
 
				+                "matches": 0,
			
 
				+                "goals_home": 0,
			
 
				+                "goals_away": 0,
			
 
				+            }
			
 
				+        )
			
 
				+        per_team = defaultdict(
			
 
				+            lambda: {
			
 
				+                "home_played": 0,
			
 
				+                "home_for": 0,
			
 
				+                "home_against": 0,
			
 
				+                "away_played": 0,
			
 
				+                "away_for": 0,
			
 
				+                "away_against": 0,
			
 
				+            }
			
 
				+        )
			
 
				+
			
 
				+        def _finished(match: dict) -> bool:
			
 
				+            hs = _get_score(match, "home")
			
 
				+            as_ = _get_score(match, "away")
			
 
				+            return hs is not None and as_ is not None
			
 
				+
			
 
				+        items = list(data) if isinstance(data, Iterable) else []
			
 
				+        for d in items:
			
 
				+            m = None
			
 
				+            if isinstance(d, dict):
			
 
				+                m = d.get("match") or d.get("data", {}).get("match")
			
 
				+            if not isinstance(m, dict):
			
 
				+                continue
			
 
				+            if not _finished(m):
			
 
				+                continue
			
 
				+            h = _get_team(m, "home")
			
 
				+            a = _get_team(m, "away")
			
 
				+            hs = _get_score(m, "home")
			
 
				+            as_ = _get_score(m, "away")
			
 
				+            if not h or not a or hs is None or as_ is None:
			
 
				+                continue
			
 
				+            g = _group_key(d)
			
 
				+            totals[g]["matches"] += 1
			
 
				+            totals[g]["goals_home"] += hs
			
 
				+            totals[g]["goals_away"] += as_
			
 
				+            per_team[(g, h)]["home_played"] += 1
			
 
				+            per_team[(g, h)]["home_for"] += hs
			
 
				+            per_team[(g, h)]["home_against"] += as_
			
 
				+            per_team[(g, a)]["away_played"] += 1
			
 
				+            per_team[(g, a)]["away_for"] += as_
			
 
				+            per_team[(g, a)]["away_against"] += hs
			
 
				+
			
 
				+        # Compute parameters per group
			
 
				+        params: Dict[str, Dict[str, Dict[str, float]]] = {}
			
 
				+        docs: list[Document] = []
			
 
				+        eps = 1e-9
			
 
				+        for g, t in totals.items():
			
 
				+            n = max(1, t["matches"])
			
 
				+            league_home_avg = t["goals_home"] / n
			
 
				+            league_away_avg = t["goals_away"] / n
			
 
				+            params[g] = {}
			
 
				+            for (gg, team_id), st in per_team.items():
			
 
				+                if gg != g:
			
 
				+                    continue
			
 
				+                hp = st["home_played"]
			
 
				+                ap = st["away_played"]
			
 
				+                hf = st["home_for"]
			
 
				+                ha = st["home_against"]
			
 
				+                af = st["away_for"]
			
 
				+                aa = st["away_against"]
			
 
				+                # Factors around 1.0; eps only guards against division by zero
			
 
				+                att_home = (
			
 
				+                    (hf / max(1, hp)) / (league_home_avg + eps) if hp > 0 else 1.0
			
 
				+                )
			
 
				+                att_away = (
			
 
				+                    (af / max(1, ap)) / (league_away_avg + eps) if ap > 0 else 1.0
			
 
				+                )
			
 
				+                def_home = (
			
 
				+                    (ha / max(1, hp)) / (league_away_avg + eps) if hp > 0 else 1.0
			
 
				+                )
			
 
				+                def_away = (
			
 
				+                    (aa / max(1, ap)) / (league_home_avg + eps) if ap > 0 else 1.0
			
 
				+                )
			
 
				+                params[g][team_id] = {
			
 
				+                    "attack_home": float(att_home),
			
 
				+                    "attack_away": float(att_away),
			
 
				+                    "defense_home": float(def_home),
			
 
				+                    "defense_away": float(def_away),
			
 
				+                    "league_home_avg": float(league_home_avg),
			
 
				+                    "league_away_avg": float(league_away_avg),
			
 
				+                }
			
 
				+                if persist and db:
			
 
				+                    docs.append(
			
 
				+                        Document(
			
 
				+                            id=f"{g}:{team_id}",
			
 
				+                            kind="dc_params",
			
 
				+                            data={
			
 
				+                                "group": g,
			
 
				+                                "team_id": team_id,
			
 
				+                                **params[g][team_id],
			
 
				+                            },
			
 
				+                        )
			
 
				+                    )
			
 
				+
			
 
				+        if persist and db and docs:
			
 
				+            db.insert_many(docs)
			
 
				+
			
 
				+        return {
			
 
				+            "groups": list(params.keys()),
			
 
				+            "params": params,
			
 
				+            "matches_used": sum(totals[g]["matches"] for g in totals),
			
 
				+            "max_goals": max_goals,
			
 
				+        }
			
--- a/src/databank/analytics/elo.py
+++ b/src/databank/analytics/elo.py
@@ -227,7 +227,15 @@ class EloAnalyzer(AnalyticsBase):
 
				                 }
			
 
				             )
			
 
				 
			
 
				-        matches.sort(key=lambda x: x["ts"])
			
 
				+        # Stable sort by (ts, match_id); push unknown/zero ts to the end
			
 
				+        def _sort_key(rec: dict) -> tuple[float, str]:
			
 
				+            ts_val = rec.get("ts") or 0.0
			
 
				+            # push missing/zero timestamps to the end
			
 
				+            k_ts = ts_val if ts_val > 0 else float("inf")
			
 
				+            mid = rec.get("match_id")
			
 
				+            return (k_ts, str(mid) if mid is not None else "")
			
 
				+
			
 
				+        matches.sort(key=_sort_key)
			
 
				 
			
 
				         # Optional incremental skip:
			
 
				         # ignore matches already in ratings_history by match_id
			
@@ -369,10 +377,14 @@ class EloAnalyzer(AnalyticsBase):
 
				         ranked = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
			
 
				         ranks: Dict[str, int] = {tid: idx + 1 for idx, (tid, _) in enumerate(ranked)}
			
 
				 
			
 
				+        # Diagnostics: count records with unknown timestamp
			
 
				+        zero_ts = sum(1 for rec in matches if (rec.get("ts") or 0.0) <= 0.0)
			
 
				+
			
 
				         result: Dict[str, Any] = {
			
 
				             "ratings": ratings,
			
 
				             "processed": len(matches),
			
 
				             "ranks": ranks,
			
 
				+            "zero_ts": zero_ts,
			
 
				         }
			
 
				         if return_expectations:
			
 
				             result["expectations"] = expectations