
Add environment variable configuration for the Dixon-Coles analyzer and the calibration analyzer, supporting history recording, progress control, and parameter persistence; refine the computation logic, enrich the output, and support skipping already-existing groups and safety parameter clamps.

admin, 1 month ago
parent commit 75d2ba0ee8
3 changed files with 558 additions and 22 deletions
  1. scripts/run_analyzers.py (+92 −3)
  2. src/databank/analytics/calibration.py (+313 −10)
  3. src/databank/analytics/dixon_coles.py (+153 −9)

+ 92 - 3
scripts/run_analyzers.py

@@ -113,6 +113,15 @@ def main() -> None:
         except (ValueError, TypeError):
             return default
 
+    def _env_int_optional(name: str) -> int | None:
+        s = os.getenv(name)
+        if not s:
+            return None
+        try:
+            return int(s)
+        except ValueError:
+            return None
+
     dc_kwargs = {
         "halflife_days": _env_float("DATABANK_DC_HALFLIFE_DAYS", 180.0),
         "rho_range": _env_rho_range("DATABANK_DC_RHO_RANGE", (-0.3, 0.3)),
@@ -122,12 +131,34 @@ def main() -> None:
     }
 
     # Optional history configuration for DC
-    history_mode = os.getenv("DATABANK_DC_HISTORY", "none").strip().lower()
-    if history_mode in {"none", "predictions", "snapshots"}:
+    history_mode = os.getenv("DATABANK_DC_HISTORY", "both").strip().lower()
+    if history_mode in {"none", "predictions", "snapshots", "both"}:
         dc_kwargs["history"] = history_mode
     dc_kwargs["snapshot_every"] = _env_int("DATABANK_DC_SNAPSHOT_EVERY", 10)
     dc_kwargs["max_iters_history"] = _env_int("DATABANK_DC_MAX_ITERS_HISTORY", 10)
     dc_kwargs["max_goals"] = _env_int("DATABANK_DC_MAX_GOALS", 8)
+    # Progress controls
+    dc_kwargs["verbose"] = os.getenv("DATABANK_DC_VERBOSE", "1").strip() not in {
+        "0",
+        "false",
+        "False",
+    }
+    dc_kwargs["progress_every"] = _env_int("DATABANK_DC_PROGRESS_EVERY", 100)
+    dc_kwargs["flush_every"] = _env_int("DATABANK_DC_FLUSH_EVERY", 1000)
+    dc_kwargs["skip_if_exists"] = os.getenv(
+        "DATABANK_DC_SKIP_IF_EXISTS", "0"
+    ).strip() not in {
+        "0",
+        "false",
+        "False",
+    }
+    # Safety clamps/tuning knobs
+    dc_kwargs["param_min"] = _env_float("DATABANK_DC_PARAM_MIN", 0.3)
+    dc_kwargs["param_max"] = _env_float("DATABANK_DC_PARAM_MAX", 3.0)
+    dc_kwargs["base_min"] = _env_float("DATABANK_DC_BASE_MIN", 0.3)
+    dc_kwargs["base_max"] = _env_float("DATABANK_DC_BASE_MAX", 3.0)
+    dc_kwargs["mu_max"] = _env_float("DATABANK_DC_MU_MAX", 6.0)
+    dc_kwargs["min_history_matches"] = _env_int("DATABANK_DC_MIN_HISTORY_MATCHES", 3)
 
     for analyzer in analyzers:
         print(f"Running analyzer: {analyzer.__class__.__name__}")
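
The truthy-string check repeated here for DATABANK_DC_VERBOSE and DATABANK_DC_SKIP_IF_EXISTS (and below for DATABANK_CAL_VERBOSE) could live in one helper. A minimal sketch with the same semantics; the _env_bool name is hypothetical and not part of this commit:

import os

def _env_bool(name: str, default: bool = True) -> bool:
    # Mirrors the inline checks in this diff: any value other than
    # "0"/"false"/"False" (after stripping whitespace) counts as True.
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip() not in {"0", "false", "False"}

# e.g. dc_kwargs["verbose"] = _env_bool("DATABANK_DC_VERBOSE", True)
# e.g. dc_kwargs["skip_if_exists"] = _env_bool("DATABANK_DC_SKIP_IF_EXISTS", False)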
@@ -139,6 +170,53 @@ def main() -> None:
                 # Pass DC-specific kwargs from environment
                 result = analyzer.compute(transformed, db=db, persist=True, **dc_kwargs)
                 print("    DC config:", dc_kwargs)
+            elif isinstance(analyzer, calibration_cls):
+                # Calibration env knobs
+                cal_kwargs = {
+                    "source_kind": os.getenv(
+                        "DATABANK_CAL_SOURCE_KIND", "dc_predictions"
+                    ),
+                    "bins": _env_int("DATABANK_CAL_BINS", 10),
+                    # Higher minimum per-bin sample threshold (default 50)
+                    "min_per_bin": _env_int("DATABANK_CAL_MIN_PER_BIN", 50),
+                    "verbose": os.getenv("DATABANK_CAL_VERBOSE", "1").strip()
+                    not in {"0", "false", "False"},
+                }
+                since_ts = _env_int_optional("DATABANK_CAL_SINCE_TS")
+                until_ts = _env_int_optional("DATABANK_CAL_UNTIL_TS")
+                if since_ts is not None:
+                    cal_kwargs["since_ts"] = since_ts
+                if until_ts is not None:
+                    cal_kwargs["until_ts"] = until_ts
+                season = os.getenv("DATABANK_CAL_SEASON")
+                if season:
+                    cal_kwargs["season_filter"] = season
+                groups_inc = os.getenv("DATABANK_CAL_GROUPS_INCLUDE")
+                if groups_inc:
+                    cal_kwargs["groups_include"] = groups_inc
+
+                result = analyzer.compute(
+                    transformed, db=db, persist=True, **cal_kwargs
+                )
+                print("    CAL config:", cal_kwargs)
+                # Console summary of calibration metrics per group
+                if isinstance(result, dict) and isinstance(result.get("metrics"), dict):
+                    metrics = result["metrics"]
+                    print("    CAL summary (per group):")
+                    for gk, m in metrics.items():
+                        try:
+                            b = float(m.get("brier", float("nan")))
+                            ll = float(m.get("logloss", float("nan")))
+                            ece = float(m.get("ece", float("nan")))
+                            n = int(m.get("n", 0))
+                            print(
+                                (
+                                    f"      - {gk}: n={n} | Brier={b:.4f}"
+                                    f" | LogLoss={ll:.4f} | ECE={ece:.4f}"
+                                )
+                            )
+                        except (ValueError, TypeError, KeyError):
+                            print(f"      - {gk}: (metrics unavailable)")
             else:
                 result = analyzer.compute(transformed, db=db, persist=True)
             analyzer.finalize(result)
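
For reference, the console summary above assumes each per-group entry carries the keys composed in CalibrationAnalyzer.compute() further down. An illustrative shape with made-up values and a hypothetical group key:

result = {
    "metrics": {
        "epl::2024-2025": {  # hypothetical "lid::season" group key
            "n": 380,
            "brier": 0.5810,
            "logloss": 0.9870,
            "ece": 0.0312,
        },
    },
}
# printed as: "      - epl::2024-2025: n=380 | Brier=0.5810 | LogLoss=0.9870 | ECE=0.0312"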
@@ -168,8 +246,12 @@ def main() -> None:
                     mu = result.get("matches_used")
                     persisted = result.get("persisted")
                     print(
-                        f"    DC matches_used: {mu}; persisted docs in this run: {persisted}"
+                        "    DC matches_used: "
+                        f"{mu}; persisted docs in this run: {persisted}"
                     )
+                    gs = result.get("groups_skipped")
+                    if isinstance(gs, int) and gs > 0:
+                        print(f"    DC groups skipped (skip_if_exists): {gs}")
                 # Optional extra collections
                 preds_cnt = _safe_count(db, "dc_predictions")
                 snaps_cnt = _safe_count(db, "dc_params_history")
@@ -177,6 +259,13 @@ def main() -> None:
                     print("    DC predictions count:", preds_cnt)
                 if snaps_cnt:
                     print("    DC params history count:", snaps_cnt)
+                if isinstance(result, dict):
+                    pw = result.get("predictions_written")
+                    sw = result.get("snapshots_written")
+                    if pw is not None or sw is not None:
+                        print(
+                            f"    DC written (streaming): predictions={pw}, snapshots={sw}"
+                        )
         except NotImplementedError as exc:
             print(f" -> Skipped (not implemented): {exc}")
         except (RuntimeError, ValueError) as exc:  # pragma: no cover - diagnostics only
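
To exercise the new knobs end to end, one option is to set the environment before launching the script. The variable names come from this diff; the invocation path and values are assumptions:

import os
import subprocess

env = os.environ.copy()
env.update({
    "DATABANK_DC_HISTORY": "both",        # predictions + snapshots in one pass
    "DATABANK_DC_SKIP_IF_EXISTS": "1",    # skip groups already fitted
    "DATABANK_DC_PROGRESS_EVERY": "250",  # progress line every 250 matches
    "DATABANK_DC_FLUSH_EVERY": "2000",    # flush buffered docs every 2000
    "DATABANK_CAL_MIN_PER_BIN": "50",     # drop sparse reliability bins
})
subprocess.run(["python", "scripts/run_analyzers.py"], env=env, check=True)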

+ 313 - 10
src/databank/analytics/calibration.py

@@ -1,14 +1,20 @@
-"""Probability calibration analyzer skeleton.
+"""Probability calibration analyzer.
 
-Use cases:
-- Calibrate model probabilities (e.g., W/D/L) using Platt scaling or isotonic.
-- Evaluate calibration via reliability curves and ECE/MCE metrics.
+Scope in this version:
+- Evaluate calibration quality for multiclass W/D/L probabilities.
+- Compute metrics: Brier score, LogLoss, ECE (expected calibration error).
+- Produce reliability curves (per-class) with configurable bins.
+- Consume predictions from DB (default: dc_predictions) and persist results.
 """
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Dict, Iterable, Optional, Tuple, List, cast
+from collections import defaultdict
+import math
 
+from databank.core.models import Document
+from databank.db.base import BaseDB
 from .base import AnalyticsBase
 
 
@@ -16,13 +22,310 @@ class CalibrationAnalyzer(AnalyticsBase):
     """Calibrate predicted probabilities and evaluate calibration quality."""
 
     def compute(self, data: Any, **kwargs: Any) -> Any:
-        """Fit calibration mapping and/or apply to incoming probabilities.
+        """Compute calibration metrics and reliability curves.
 
         Args:
-            data: Iterable of predictions with observed outcomes.
-            **kwargs: method ("platt"|"isotonic"), per_league, db.
+            data: Ignored by default if source_kind uses DB collections.
+            source_kind: Predictions collection kind (default "dc_predictions").
+            group_by: "league_season" (default) or "global" (uses stored group).
+            bins: Number of bins for reliability curves and ECE (default 10).
+            min_per_bin: Minimum samples to include a bin in curves (default 20).
+            persist: Whether to persist results (default True).
+            db: Database handle when persist=True.
+            verbose: Print progress (default True).
 
         Returns:
-            Calibration model parameters and post-calibration metrics.
+            Dict with groups, metrics, and persisted counts.
         """
-        raise NotImplementedError("Calibration analyzer not implemented yet")
+
+        source_kind: str = str(kwargs.get("source_kind", "dc_predictions"))
+        # group_by reserved for future use; currently uses stored group in docs
+        # group_by: str = str(kwargs.get("group_by", "league_season"))
+        bins: int = int(kwargs.get("bins", 10))
+        min_per_bin: int = int(kwargs.get("min_per_bin", 20))
+        persist: bool = bool(kwargs.get("persist", True))
+        verbose: bool = bool(kwargs.get("verbose", True))
+        db: Optional[BaseDB] = kwargs.get("db")
+        # Optional filters
+        since_ts_raw = kwargs.get("since_ts")
+        until_ts_raw = kwargs.get("until_ts")
+        season_filter: Optional[str] = kwargs.get("season_filter")
+        groups_include_raw = kwargs.get("groups_include")
+        groups_include: Optional[List[str]] = None
+        if isinstance(groups_include_raw, str):
+            groups_include = [
+                s.strip() for s in groups_include_raw.split(",") if s.strip()
+            ]
+        elif isinstance(groups_include_raw, list):
+            groups_include = [str(x) for x in groups_include_raw]
+        try:
+            since_ts = int(since_ts_raw) if since_ts_raw is not None else None
+        except (ValueError, TypeError):
+            since_ts = None
+        try:
+            until_ts = int(until_ts_raw) if until_ts_raw is not None else None
+        except (ValueError, TypeError):
+            until_ts = None
+
+        eps = 1e-12
+
+        def _one_hot(outcome: str) -> Tuple[int, int, int]:
+            if outcome == "H":
+                return 1, 0, 0
+            if outcome == "D":
+                return 0, 1, 0
+            if outcome == "A":
+                return 0, 0, 1
+            return 0, 0, 0
+
+        def _get_outcome(h: int, a: int) -> str:
+            if h > a:
+                return "H"
+            if h == a:
+                return "D"
+            return "A"
+
+        # Load predictions from DB
+        preds: Iterable[dict]
+        if source_kind == "dc_predictions":
+            if not (persist and db):
+                # Without a DB handle, fall back to data if it already holds prediction docs
+                preds = data if isinstance(data, Iterable) else []
+            else:
+                # Access DB collection for predictions
+                db_any = cast(Any, db)
+                preds = db_any.find(
+                    "dc_predictions",
+                    projection={"_id": 0},
+                    limit=None,
+                )
+        else:
+            preds = data if isinstance(data, Iterable) else []
+
+        # Group predictions by stored group key (e.g., "lid::season").
+        groups: Dict[str, List[dict]] = defaultdict(list)
+        count_in = 0
+        for p in preds or []:
+            if not isinstance(p, dict):
+                continue
+            gk = p.get("group") or "global"
+            # Required fields: p_home/p_draw/p_away and observed scores
+            try:
+                ph = float(p.get("p_home", 0.0))
+                pd = float(p.get("p_draw", 0.0))
+                pa = float(p.get("p_away", 0.0))
+            except (ValueError, TypeError):
+                continue
+            oh_raw = p.get("observed_h")
+            oa_raw = p.get("observed_a")
+            if oh_raw is None or oa_raw is None:
+                continue
+            try:
+                oh = int(oh_raw)
+                oa = int(oa_raw)
+            except (ValueError, TypeError):
+                continue
+            # Optional filters: time window and season/group
+            cut_ts = p.get("cutoff_ts")
+            try:
+                cut_ts = int(cut_ts) if cut_ts is not None else None
+            except (ValueError, TypeError):
+                cut_ts = None
+            if since_ts is not None and (cut_ts is None or cut_ts < since_ts):
+                continue
+            if until_ts is not None and (cut_ts is None or cut_ts >= until_ts):
+                continue
+            if season_filter and isinstance(gk, str):
+                # Allow a simple substring match on the season in the group key, e.g. 'lid::2024-2025'
+                if season_filter not in gk:
+                    continue
+            if groups_include is not None and str(gk) not in groups_include:
+                continue
+            # Normalize probabilities to sum=1
+            s = ph + pd + pa
+            if s <= 0:
+                continue
+            ph, pd, pa = ph / s, pd / s, pa / s
+            outcome = _get_outcome(oh, oa)
+            groups[str(gk)].append(
+                {
+                    "p": (ph, pd, pa),
+                    "outcome": outcome,
+                }
+            )
+            count_in += 1
+
+        if verbose:
+            print(
+                f"[CAL] Loaded predictions: {count_in}; groups: {len(groups)}",
+                flush=True,
+            )
+
+        # Metrics per group
+        docs: List[Document] = []
+        results: Dict[str, dict] = {}
+        num_bins = max(2, int(bins))
+
+        def _bin_index(p: float) -> int:
+            p = min(1.0 - 1e-12, max(0.0, float(p)))
+            return min(num_bins - 1, int(p * num_bins))
+
+        for gk, rows in groups.items():
+            if not rows:
+                continue
+
+            n_samples = float(len(rows))
+            # Overall ECE via confidence bins of max probability
+            conf_sum = [0.0 for _ in range(num_bins)]
+            acc_sum = [0.0 for _ in range(num_bins)]
+            cnt = [0 for _ in range(num_bins)]
+
+            # Brier and LogLoss
+            brier_sum = 0.0
+            logloss_sum = 0.0
+
+            # Per-class reliability bins
+            cls_bins = {
+                "H": {
+                    "p_sum": [0.0] * num_bins,
+                    "y_sum": [0.0] * num_bins,
+                    "cnt": [0] * num_bins,
+                },
+                "D": {
+                    "p_sum": [0.0] * num_bins,
+                    "y_sum": [0.0] * num_bins,
+                    "cnt": [0] * num_bins,
+                },
+                "A": {
+                    "p_sum": [0.0] * num_bins,
+                    "y_sum": [0.0] * num_bins,
+                    "cnt": [0] * num_bins,
+                },
+            }
+
+            for r in rows:
+                ph, pd, pa = r["p"]
+                oh, od, oa = _one_hot(r["outcome"])
+                # overall
+                conf = max(ph, pd, pa)
+                pred_label = (
+                    "H" if ph >= pd and ph >= pa else ("D" if pd >= pa else "A")
+                )
+                correct = (
+                    1.0
+                    if (
+                        (pred_label == "H" and oh == 1)
+                        or (pred_label == "D" and od == 1)
+                        or (pred_label == "A" and oa == 1)
+                    )
+                    else 0.0
+                )
+                bi = _bin_index(conf)
+                conf_sum[bi] += conf
+                acc_sum[bi] += correct
+                cnt[bi] += 1
+
+                # brier (multiclass)
+                brier_sum += (ph - oh) ** 2 + (pd - od) ** 2 + (pa - oa) ** 2
+
+                # logloss
+                p_true = max(eps, ph if oh else (pd if od else pa))
+                logloss_sum += -math.log(p_true)
+
+                # per-class bins
+                for lab, p_val, y_val in (
+                    ("H", ph, oh),
+                    ("D", pd, od),
+                    ("A", pa, oa),
+                ):
+                    bi2 = _bin_index(p_val)
+                    cls_bins[lab]["p_sum"][bi2] += p_val
+                    cls_bins[lab]["y_sum"][bi2] += y_val
+                    cls_bins[lab]["cnt"][bi2] += 1
+
+            # Compute ECE
+            ece = 0.0
+            bins_out: List[dict] = []
+            for i in range(num_bins):
+                if cnt[i] == 0:
+                    continue
+                conf_avg = conf_sum[i] / cnt[i]
+                acc_avg = acc_sum[i] / cnt[i]
+                w = cnt[i] / n_samples
+                ece += w * abs(acc_avg - conf_avg)
+
+            # Build per-class reliability curves
+            for lab in ("H", "D", "A"):
+                p_sum = cls_bins[lab]["p_sum"]
+                y_sum = cls_bins[lab]["y_sum"]
+                c_arr = cls_bins[lab]["cnt"]
+                for i in range(num_bins):
+                    c = c_arr[i]
+                    if c < min_per_bin:
+                        continue
+                    p_avg = p_sum[i] / c
+                    y_avg = y_sum[i] / c
+                    bins_out.append(
+                        {
+                            "group": gk,
+                            "class": lab,
+                            "bin": i,
+                            "bins": num_bins,
+                            "p_low": i / num_bins,
+                            "p_high": (i + 1) / num_bins,
+                            "avg_p": p_avg,
+                            "emp_rate": y_avg,
+                            "count": c,
+                            "source": source_kind,
+                        }
+                    )
+
+            # Compose metrics
+            metrics = {
+                "group": gk,
+                "n": int(n_samples),
+                "brier": brier_sum / n_samples,
+                "logloss": logloss_sum / n_samples,
+                "ece": ece,
+                "bins": num_bins,
+                "min_per_bin": int(min_per_bin),
+                "source": source_kind,
+            }
+            results[gk] = metrics
+
+            if persist and db:
+                docs.append(
+                    Document(
+                        id=f"{gk}:{source_kind}:metrics:{num_bins}",
+                        kind="calibration_metrics",
+                        data=metrics,
+                    )
+                )
+                for b in bins_out:
+                    cls_label = b["class"]
+                    bin_idx = b["bin"]
+                    doc_id = f"{gk}:{source_kind}:bins:{cls_label}:{bin_idx}:{num_bins}"
+                    docs.append(
+                        Document(
+                            id=doc_id,
+                            kind="calibration_bins",
+                            data=b,
+                        )
+                    )
+
+        if persist and db and docs:
+            db.insert_many(docs)
+            if verbose:
+                metrics_cnt = sum(1 for d in docs if d.kind == "calibration_metrics")
+                bins_cnt = sum(1 for d in docs if d.kind == "calibration_bins")
+                print(
+                    (f"[CAL] Persisted metrics={metrics_cnt} " f"bins={bins_cnt}"),
+                    flush=True,
+                )
+
+        return {
+            "groups": list(results.keys()),
+            "metrics": results,
+            "persisted": len(docs) if (persist and db) else 0,
+            "source": source_kind,
+        }
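
A standalone sanity check of the three metrics on toy data, mirroring the Brier/LogLoss/ECE formulas in compute() above; a sketch for verification, not the analyzer itself:

import math

rows = [  # (p_home, p_draw, p_away, observed outcome)
    (0.50, 0.30, 0.20, "H"),
    (0.25, 0.40, 0.35, "D"),
    (0.10, 0.25, 0.65, "H"),  # confident away pick, but home won
]
num_bins, eps = 10, 1e-12
brier = logloss = 0.0
conf_sum = [0.0] * num_bins
acc_sum = [0.0] * num_bins
cnt = [0] * num_bins

for ph, pd, pa, outcome in rows:
    oh, od, oa = (outcome == "H"), (outcome == "D"), (outcome == "A")
    brier += (ph - oh) ** 2 + (pd - od) ** 2 + (pa - oa) ** 2
    logloss += -math.log(max(eps, ph if oh else (pd if od else pa)))
    conf = max(ph, pd, pa)
    pred = "H" if ph >= pd and ph >= pa else ("D" if pd >= pa else "A")
    b = min(num_bins - 1, int(min(1.0 - 1e-12, conf) * num_bins))
    conf_sum[b] += conf
    acc_sum[b] += pred == outcome
    cnt[b] += 1

n = len(rows)
ece = sum(
    (cnt[i] / n) * abs(acc_sum[i] / cnt[i] - conf_sum[i] / cnt[i])
    for i in range(num_bins)
    if cnt[i]
)
print(f"Brier={brier / n:.4f} LogLoss={logloss / n:.4f} ECE={ece:.4f}")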

+ 153 - 9
src/databank/analytics/dixon_coles.py

@@ -9,7 +9,7 @@ Purpose:
 
 from __future__ import annotations
 
-from typing import Any, Dict, Iterable, Optional, Tuple
+from typing import Any, Dict, Iterable, Optional, Tuple, cast
 from collections import defaultdict
 import re
 import math
@@ -43,9 +43,10 @@ class DixonColesAnalyzer(AnalyticsBase):
             l2_base: L2 regularization on log base means (default 0.0).
             step_size: Learning rate for "coord" optimizer (default 0.1).
             outer_iters: Max outer iterations for "coord" (default 10).
-            history: "none" (default), "predictions" (per-match), or "snapshots".
+            history: "both" (default), "predictions" (per-match), "snapshots", or "none".
                 - predictions: persist pre-match H/D/A probs and mu/nu for each match.
                 - snapshots: persist team parameter snapshots at cutoffs.
+                - both: do both predictions and snapshots in a single pass.
             snapshot_every: For snapshots, persist every Nth match (default 10).
             max_iters_history: Iterations to use for history fits (default max_iters//2).
             max_goals: Max goals cap for probability table (default 8).
@@ -67,7 +68,7 @@ class DixonColesAnalyzer(AnalyticsBase):
         l2_base: float = float(kwargs.get("l2_base", 0.0))
         step_size: float = float(kwargs.get("step_size", 0.1))
         outer_iters: int = int(kwargs.get("outer_iters", 10))
-        history: str = str(kwargs.get("history", "none"))
+        history: str = str(kwargs.get("history", "both")).lower()
         snapshot_every: int = int(kwargs.get("snapshot_every", 10))
         max_iters_history: Optional[int] = kwargs.get("max_iters_history")
         if max_iters_history is not None:
@@ -76,6 +77,19 @@ class DixonColesAnalyzer(AnalyticsBase):
             except (ValueError, TypeError):
                 max_iters_history = None
         max_goals: int = int(kwargs.get("max_goals", 8))
+        # Progress / flushing controls
+        verbose: bool = bool(kwargs.get("verbose", True))
+        progress_every: int = int(kwargs.get("progress_every", 100))
+        flush_every: int = int(kwargs.get("flush_every", 1000))
+        # Skip switch: avoid recomputing groups already up-to-date
+        skip_if_exists: bool = bool(kwargs.get("skip_if_exists", False))
+        # Safety clamps to avoid extreme values in history
+        param_min: float = float(kwargs.get("param_min", 0.3))
+        param_max: float = float(kwargs.get("param_max", 3.0))
+        base_min: float = float(kwargs.get("base_min", 0.3))
+        base_max: float = float(kwargs.get("base_max", 3.0))
+        mu_max_cap: float = float(kwargs.get("mu_max", 6.0))
+        min_history_matches: int = int(kwargs.get("min_history_matches", 3))
 
         # Helpers
         def _get_ts(match: dict) -> int:
@@ -266,6 +280,20 @@ class DixonColesAnalyzer(AnalyticsBase):
                 for k in def_a:
                     def_a[k] /= gm
 
+            def _clamp_params() -> None:
+                # Clamp parameters to reasonable ranges
+                nonlocal base_h, base_a
+                base_h = max(base_min, min(base_max, float(base_h)))
+                base_a = max(base_min, min(base_max, float(base_a)))
+                for k in att_h:
+                    att_h[k] = max(param_min, min(param_max, float(att_h[k])))
+                for k in att_a:
+                    att_a[k] = max(param_min, min(param_max, float(att_a[k])))
+                for k in def_h:
+                    def_h[k] = max(param_min, min(param_max, float(def_h[k])))
+                for k in def_a:
+                    def_a[k] = max(param_min, min(param_max, float(def_a[k])))
+
             def _expected(r: dict) -> tuple[float, float]:
                 mu = base_h * att_h[r["home"]] * def_a[r["away"]]
                 nu = base_a * att_a[r["away"]] * def_h[r["home"]]
@@ -297,9 +325,15 @@ class DixonColesAnalyzer(AnalyticsBase):
                             den += r["w"] * mu
                         if den > 0:
                             factor = num / den
+                            # Dampen extreme single-step updates
+                            if factor < 0.5:
+                                factor = 0.5
+                            elif factor > 2.0:
+                                factor = 2.0
                             delta = max(delta, abs(1 - factor))
                             att_h[t] *= factor
                     _normalize()
+                    _clamp_params()
 
                     # Update attack_away
                     for t in teams:
@@ -313,9 +347,14 @@ class DixonColesAnalyzer(AnalyticsBase):
                             den += r["w"] * nu
                         if den > 0:
                             factor = num / den
+                            if factor < 0.5:
+                                factor = 0.5
+                            elif factor > 2.0:
+                                factor = 2.0
                             delta = max(delta, abs(1 - factor))
                             att_a[t] *= factor
                     _normalize()
+                    _clamp_params()
 
                     # Update defense_away (affects mu)
                     for t in teams:
@@ -329,9 +368,14 @@ class DixonColesAnalyzer(AnalyticsBase):
                             den += r["w"] * mu
                         if den > 0:
                             factor = num / den
+                            if factor < 0.5:
+                                factor = 0.5
+                            elif factor > 2.0:
+                                factor = 2.0
                             delta = max(delta, abs(1 - factor))
                             def_a[t] *= factor
                     _normalize()
+                    _clamp_params()
 
                     # Update defense_home (affects nu)
                     for t in teams:
@@ -345,9 +389,14 @@ class DixonColesAnalyzer(AnalyticsBase):
                             den += r["w"] * nu
                         if den > 0:
                             factor = num / den
+                            if factor < 0.5:
+                                factor = 0.5
+                            elif factor > 2.0:
+                                factor = 2.0
                             delta = max(delta, abs(1 - factor))
                             def_h[t] *= factor
                     _normalize()
+                    _clamp_params()
 
                     if delta < tol:
                         break
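
Each coordinate sweep above multiplies a parameter by the ratio num/den, now clipped to [0.5, 2.0] before it is applied. A minimal illustration of the damping:

def damped_factor(num: float, den: float) -> float:
    factor = num / den  # raw multiplicative update from the sweep
    return min(2.0, max(0.5, factor))  # same clip as in the diff

assert damped_factor(9.0, 1.0) == 2.0   # extreme upward step capped
assert damped_factor(0.1, 1.0) == 0.5   # extreme downward step capped
assert damped_factor(1.3, 1.0) == 1.3   # moderate steps pass through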
@@ -362,7 +411,7 @@ class DixonColesAnalyzer(AnalyticsBase):
                 log_base_a = math.log(max(1e-9, base_a))
 
                 def _sync_from_logs() -> None:
-                    nonlocal base_h, base_a
+                    nonlocal base_h, base_a, log_base_h, log_base_a
                     for t in teams:
                         att_h[t] = math.exp(log_att_h[t])
                         att_a[t] = math.exp(log_att_a[t])
@@ -370,6 +419,15 @@ class DixonColesAnalyzer(AnalyticsBase):
                         def_a[t] = math.exp(log_def_a[t])
                     base_h = math.exp(log_base_h)
                     base_a = math.exp(log_base_a)
+                    _clamp_params()
+                    # Reflect clamps back to logs
+                    for t in teams:
+                        log_att_h[t] = math.log(att_h[t])
+                        log_att_a[t] = math.log(att_a[t])
+                        log_def_h[t] = math.log(def_h[t])
+                        log_def_a[t] = math.log(def_a[t])
+                    log_base_h = math.log(base_h)
+                    log_base_a = math.log(base_a)
 
                 def _center_logs() -> None:
                     # Enforce identifiability: mean of logs = 0 per block
@@ -512,7 +570,67 @@ class DixonColesAnalyzer(AnalyticsBase):
         all_params: Dict[str, Dict[str, Dict[str, float]]] = {}
         docs: list[Document] = []
         total_matches = 0
+        preds_written = 0
+        snaps_written = 0
+        skipped_groups = 0
+
+        def _maybe_flush(reason: str = "") -> None:
+            nonlocal docs
+            if persist and db and docs and len(docs) >= max(1, flush_every):
+                try:
+                    db.insert_many(docs)
+                    if verbose:
+                        print(
+                            f"[DC] Flushed {len(docs)} docs to DB"
+                            + (f" ({reason})" if reason else ""),
+                            flush=True,
+                        )
+                finally:
+                    docs = []
+
+        if verbose:
+            print(f"[DC] Groups to fit: {len(groups)}", flush=True)
+
         for gk, rows in groups.items():
+            # Optional: skip group if summary doc exists with same match count
+            if skip_if_exists and persist and db:
+                try:
+                    db_any = cast(Any, db)
+                    existing = None
+                    docs_iter = db_any.find(
+                        "dc_params", projection={"_id": 0}, limit=None
+                    )
+                    if docs_iter is not None:
+                        for d in docs_iter:
+                            if not isinstance(d, dict):
+                                continue
+                            if d.get("group") == gk and d.get("summary") is True:
+                                existing = d
+                                break
+                    if isinstance(existing, dict):
+                        try:
+                            prev_matches = int(existing.get("matches", -1))
+                        except (ValueError, TypeError):
+                            prev_matches = -1
+                        if prev_matches == len(rows):
+                            if verbose:
+                                print(
+                                    (
+                                        f"[DC] Skip group '{gk}' "
+                                        f"(skip_if_exists; matches={prev_matches})"
+                                    ),
+                                    flush=True,
+                                )
+                            skipped_groups += 1
+                            continue
+                except (RuntimeError, ValueError, TypeError):
+                    # Diagnostics only: if existence check fails, proceed normally
+                    pass
+            if verbose:
+                print(
+                    f"[DC] Processing group '{gk}' with {len(rows)} matches",
+                    flush=True,
+                )
             p, stats = _fit_group(rows)
             all_params[gk] = p
             total_matches += int(stats.get("matches", 0))
@@ -529,6 +647,7 @@ class DixonColesAnalyzer(AnalyticsBase):
                             },
                         )
                     )
+                    _maybe_flush("dc_params")
                 # Optionally also persist a group-level summary doc
                 docs.append(
                     Document(
@@ -545,14 +664,18 @@ class DixonColesAnalyzer(AnalyticsBase):
                         },
                     )
                 )
+                _maybe_flush("dc_params_summary")
 
-            # Optional history: pre-match predictions or parameter snapshots
-            if history in ("predictions", "snapshots") and persist and db:
+            # Optional history: pre-match predictions and/or parameter snapshots
+            do_preds = history in ("predictions", "both")
+            do_snaps = history in ("snapshots", "both")
+            if (do_preds or do_snaps) and persist and db:
                 # Helper for DC-phi adjusted joint probabilities
                 def _probs_hda(
                     mu: float, nu: float, rho: float
                 ) -> tuple[float, float, float]:
                     def _pois(k: int, lam: float) -> float:
+                        lam = max(1e-9, float(lam))
                         return math.exp(k * math.log(lam) - lam - math.lgamma(k + 1))
 
                     # Use same phi as in likelihood
@@ -606,7 +729,7 @@ class DixonColesAnalyzer(AnalyticsBase):
 
                 for i, r in enumerate(rows_sorted):
                     prior = rows_sorted[:i]
-                    if not prior:
+                    if not prior or len(prior) < max(0, int(min_history_matches)):
                         continue
                     p_h, st_h = _fit_group(
                         prior,
@@ -642,8 +765,10 @@ class DixonColesAnalyzer(AnalyticsBase):
                         * att_a_map.get(r["away"], 1.0)
                         * def_h_map.get(r["home"], 1.0)
                     )
+                    mu = max(1e-9, min(mu_max_cap, float(mu)))
+                    nu = max(1e-9, min(mu_max_cap, float(nu)))
 
-                    if history == "predictions":
+                    if do_preds:
                         ph, pd, pa = _probs_hda(mu, nu, rho_h)
                         docs.append(
                             Document(
@@ -668,8 +793,10 @@ class DixonColesAnalyzer(AnalyticsBase):
                                 },
                             )
                         )
+                        preds_written += 1
+                        _maybe_flush("dc_predictions")
 
-                    if history == "snapshots" and (i % max(1, snapshot_every) == 0):
+                    if do_snaps and (i % max(1, snapshot_every) == 0):
                         cut_ts = int(r["ts"])
                         for team_id, vals in p_h.items():
                             docs.append(
@@ -700,13 +827,30 @@ class DixonColesAnalyzer(AnalyticsBase):
                                     },
                                 )
                             )
+                            snaps_written += 1
+                            _maybe_flush("dc_params_history")
+
+                    if verbose and (i + 1) % max(1, progress_every) == 0:
+                        print(
+                            (
+                                f"[DC] Group '{gk}' history progress: "
+                                f"{i + 1}/{len(rows_sorted)} "
+                                f"(preds={preds_written}, snaps={snaps_written})"
+                            ),
+                            flush=True,
+                        )
 
         if persist and db and docs:
             db.insert_many(docs)
+            if verbose:
+                print(f"[DC] Final flush {len(docs)} docs", flush=True)
 
         return {
             "groups": list(all_params.keys()),
             "params": all_params,
             "matches_used": total_matches,
             "persisted": len(docs) if docs else 0,
+            "predictions_written": preds_written,
+            "snapshots_written": snaps_written,
+            "groups_skipped": skipped_groups,
         }
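
The capped (mu, nu) means feed a truncated score grid to produce H/D/A probabilities. A minimal sketch assuming independent Poisson margins; the analyzer's own _probs_hda additionally applies the Dixon-Coles low-score adjustment (rho) before normalizing:

import math

def pois(k: int, lam: float) -> float:
    lam = max(1e-9, float(lam))  # same guard as the patched _pois
    return math.exp(k * math.log(lam) - lam - math.lgamma(k + 1))

def probs_hda(mu: float, nu: float, max_goals: int = 8):
    ph = pd = pa = 0.0
    for h in range(max_goals + 1):
        for a in range(max_goals + 1):
            p = pois(h, mu) * pois(a, nu)
            if h > a:
                ph += p
            elif h == a:
                pd += p
            else:
                pa += p
    s = ph + pd + pa  # renormalize mass lost to truncation
    return ph / s, pd / s, pa / s

print(probs_hda(1.5, 1.1))  # roughly (0.46, 0.26, 0.28): home-favoured split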