Sfoglia il codice sorgente

优化Dixon-Coles分析器,添加参数计算和持久化功能;重构Elo分析器的排序逻辑,确保时间戳稳定排序并统计未知时间戳记录。

admin 1 mese fa
parent
commit
cba1c818da
2 file modificati con 214 aggiunte e 18 eliminazioni
  1. 201 17
      src/databank/analytics/dixon_coles.py
  2. 13 1
      src/databank/analytics/elo.py

+ 201 - 17
src/databank/analytics/dixon_coles.py

@@ -1,35 +1,219 @@
-"""Dixon–Coles (DC) model analyzer skeleton for football scores.
+"""Dixon–Coles (DC) model analyzer for football scores (minimal V1).
 
 Purpose:
-- Estimate attacking/defending strengths and home advantage via Poisson/DC likelihood.
-- Produce scoreline distribution and W/D/L probabilities per match.
+- Estimate attacking/defending strengths and home advantage via Poisson-style rates.
+- Produce parameters per group (league-season or global) and persist them.
 """
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Dict, Iterable, Optional
+from collections import defaultdict
+import re
+
+from databank.core.models import Document
+from databank.db.base import BaseDB
 
 from .base import AnalyticsBase
 
 
 class DixonColesAnalyzer(AnalyticsBase):
-    """Estimate Poisson/DC parameters and infer probabilities.
+    """Estimate rate-based per-team attack/defense factors (baseline V1)."""
 
-    This is a scaffold. Implementation steps usually include:
-    - Build likelihood over historical matches with time decay.
-    - Optimize attack/defense params per team and a home advantage term.
-    - Apply DC correlation adjustment for low-score outcomes.
-    - Infer scoreline and aggregate to W/D/L probabilities.
-    """
+    def compute(self, data: Any, **kwargs: Any) -> Any:  # noqa: D401
+        """Estimate per-team attack/defense factors from finished matches.
 
-    def compute(self, data: Any, **kwargs: Any) -> Any:
-        """Fit/estimate DC parameters and produce probabilities.
+        Minimal V1 without numerical optimization: rate-based factors.
 
         Args:
-                data: Iterable of match-like docs.
-                **kwargs: decay, regularization, max_goals, db, preview_only.
+            data: Iterable of match-like docs (same shape as Elo input).
+            **kwargs: group_by ("league_season"|"global"), max_goals (int;
+                only echoed back in the result), persist (bool), db (BaseDB).
 
         Returns:
-                A dict with parameters and/or per-match probability outputs.
+            dict with "groups", "params", "matches_used" and "max_goals".
+            When persist is true and a db is given, writes 'dc_params' docs.
         """
-        raise NotImplementedError("Dixon–Coles model not implemented yet")
+        group_by = str(kwargs.get("group_by", "league_season"))
+        max_goals = int(kwargs.get("max_goals", 8))
+        persist = bool(kwargs.get("persist", True))
+        db: Optional[BaseDB] = kwargs.get("db")
+        # Test identity, not truthiness: an empty DB object could be falsy.
+        should_persist = persist and db is not None
+
+        # Helpers
+        def _get_team(match: dict, side: str) -> Optional[str]:
+            """Return the side's team id as str, accepting host/guest aliases."""
+            id_val = match.get(f"{side}TeamId") or match.get(f"{side}Id")
+            if side == "home":
+                id_val = id_val or match.get("hostTeamId") or match.get("hostId")
+            else:
+                id_val = id_val or match.get("guestTeamId") or match.get("guestId")
+            return str(id_val) if id_val is not None else None
+
+        def _get_score(match: dict, side: str) -> Optional[int]:
+            """Return the side's goals from known keys or a "h:a" score list."""
+            keys = [
+                f"{side}Score",
+                f"{side}Goals",
+                f"{side}Goal",
+                "hostScore" if side == "home" else "guestScore",
+            ]
+            for k in keys:
+                if k in match and match[k] is not None:
+                    try:
+                        return int(match[k])
+                    except (ValueError, TypeError):
+                        continue
+            sc = match.get("score")
+            if isinstance(sc, list):
+                for s in reversed(sc):
+                    if not isinstance(s, str):
+                        continue
+                    s2 = s.strip()
+                    m = re.search(r"(\d+)\s*:\s*(\d+)", s2)
+                    if m:
+                        try:
+                            h = int(m.group(1))
+                            a = int(m.group(2))
+                            return h if side == "home" else a
+                        except (ValueError, TypeError):
+                            pass
+            return None
+
+        def _context(doc: dict) -> tuple[Optional[str], Optional[str]]:
+            """Return (league id, season) from the doc's payload, if any."""
+            payload = None
+            if isinstance(doc, dict):
+                data_dict = (
+                    doc.get("data") if isinstance(doc.get("data"), dict) else None
+                )
+                if data_dict:
+                    payload = data_dict.get("payload")
+                if payload is None and isinstance(doc.get("payload"), dict):
+                    payload = doc.get("payload")
+            if isinstance(payload, dict):
+                league_id = payload.get("leagueId")
+                season = payload.get("seasonName") or payload.get("season")
+                lid = str(league_id) if league_id is not None else None
+                s = str(season) if season is not None else None
+                return (lid, s)
+            return (None, None)
+
+        def _group_key(doc: dict) -> str:
+            """Return "<league>::<season>" (NA when unknown) or "global"."""
+            if group_by == "league_season":
+                lid, s = _context(doc)
+                return f"{lid or 'NA'}::{s or 'NA'}"
+            return "global"
+
+        def _new_team_stats() -> Dict[str, int]:
+            """Return zeroed home/away counters for one team in one group."""
+            return {
+                "home_played": 0,
+                "home_for": 0,
+                "home_against": 0,
+                "away_played": 0,
+                "away_for": 0,
+                "away_against": 0,
+            }
+
+        # Accumulate statistics
+        totals = defaultdict(
+            lambda: {
+                "matches": 0,
+                "goals_home": 0,
+                "goals_away": 0,
+            }
+        )
+        # group key -> team id -> counters (no cross-group scan needed later)
+        per_team = defaultdict(lambda: defaultdict(_new_team_stats))
+
+        items = data if isinstance(data, Iterable) else ()
+        for d in items:
+            if not isinstance(d, dict):
+                continue
+            m = d.get("match")
+            if not m:
+                # Nested layout; "data" may be absent or not a dict (e.g. None).
+                inner = d.get("data")
+                m = inner.get("match") if isinstance(inner, dict) else None
+            if not isinstance(m, dict):
+                continue
+            h = _get_team(m, "home")
+            a = _get_team(m, "away")
+            hs = _get_score(m, "home")
+            as_ = _get_score(m, "away")
+            # Only finished matches (both scores known) with both teams count.
+            if not h or not a or hs is None or as_ is None:
+                continue
+            g = _group_key(d)
+            totals[g]["matches"] += 1
+            totals[g]["goals_home"] += hs
+            totals[g]["goals_away"] += as_
+            per_team[g][h]["home_played"] += 1
+            per_team[g][h]["home_for"] += hs
+            per_team[g][h]["home_against"] += as_
+            per_team[g][a]["away_played"] += 1
+            per_team[g][a]["away_for"] += as_
+            per_team[g][a]["away_against"] += hs
+
+        # Compute parameters per group
+        params: Dict[str, Dict[str, Dict[str, float]]] = {}
+        docs: list[Document] = []
+        eps = 1e-9
+        for g, t in totals.items():
+            n = max(1, t["matches"])
+            league_home_avg = t["goals_home"] / n
+            league_away_avg = t["goals_away"] / n
+            params[g] = {}
+            for team_id, st in per_team[g].items():
+                hp = st["home_played"]
+                ap = st["away_played"]
+                hf = st["home_for"]
+                ha = st["home_against"]
+                af = st["away_for"]
+                aa = st["away_against"]
+                # Factors around 1.0; eps guards against a zero group average.
+                att_home = (
+                    (hf / max(1, hp)) / (league_home_avg + eps) if hp > 0 else 1.0
+                )
+                att_away = (
+                    (af / max(1, ap)) / (league_away_avg + eps) if ap > 0 else 1.0
+                )
+                def_home = (
+                    (ha / max(1, hp)) / (league_away_avg + eps) if hp > 0 else 1.0
+                )
+                def_away = (
+                    (aa / max(1, ap)) / (league_home_avg + eps) if ap > 0 else 1.0
+                )
+                params[g][team_id] = {
+                    "attack_home": float(att_home),
+                    "attack_away": float(att_away),
+                    "defense_home": float(def_home),
+                    "defense_away": float(def_away),
+                    "league_home_avg": float(league_home_avg),
+                    "league_away_avg": float(league_away_avg),
+                }
+                if should_persist:
+                    docs.append(
+                        Document(
+                            id=f"{g}:{team_id}",
+                            kind="dc_params",
+                            data={
+                                "group": g,
+                                "team_id": team_id,
+                                **params[g][team_id],
+                            },
+                        )
+                    )
+
+        if db is not None and docs:  # docs is only filled when persisting
+            db.insert_many(docs)
+
+        return {
+            "groups": list(params),
+            "params": params,
+            "matches_used": sum(t["matches"] for t in totals.values()),
+            "max_goals": max_goals,
+        }

+ 13 - 1
src/databank/analytics/elo.py

@@ -227,7 +227,15 @@ class EloAnalyzer(AnalyticsBase):
                 }
             )
 
-        matches.sort(key=lambda x: x["ts"])
+        # Stable sort by (ts, match_id); push unknown/zero ts to the end
+        def _sort_key(rec: dict) -> tuple[float, str]:
+            # Missing, None, zero or negative timestamps sort last (+inf).
+            raw_ts = rec.get("ts") or 0.0
+            match_id = rec.get("match_id")
+            id_part = "" if match_id is None else str(match_id)
+            return (raw_ts if raw_ts > 0 else float("inf"), id_part)
+
+        matches.sort(key=_sort_key)
 
         # Optional incremental skip:
         # ignore matches already in ratings_history by match_id
@@ -369,10 +377,14 @@ class EloAnalyzer(AnalyticsBase):
         ranked = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
         ranks: Dict[str, int] = {tid: idx + 1 for idx, (tid, _) in enumerate(ranked)}
 
+        # Diagnostics: count records with unknown timestamp
+        zero_ts = sum(1 for rec in matches if (rec.get("ts") or 0.0) <= 0.0)
+
         result: Dict[str, Any] = {
             "ratings": ratings,
             "processed": len(matches),
             "ranks": ranks,
+            "zero_ts": zero_ts,
         }
         if return_expectations:
             result["expectations"] = expectations