|
|
@@ -0,0 +1,196 @@
|
|
|
+"""Team extraction and normalization analyzer skeleton.
|
|
|
+
|
|
|
+Purpose:
|
|
|
+- Scan matches, extract home/away team names, normalize to canonical team records.
|
|
|
+- Maintain a teams collection (team_id, name_canonical, aliases, metadata).
|
|
|
+
|
|
|
+Notes:
|
|
|
+- Name normalization in compute() is heuristic; teams are persisted only when a `db` is passed and `dry_run` is off.
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+from typing import Any, Iterable, Dict, Set
|
|
|
+import re
|
|
|
+
|
|
|
+from databank.core.models import Document
|
|
|
+from databank.db.base import BaseDB
|
|
|
+
|
|
|
+from .base import AnalyticsBase
|
|
|
+
|
|
|
+
|
|
|
class TeamExtractorAnalyzer(AnalyticsBase):
    """Extract and normalize teams from match documents.

    Expected input data:
    - An iterable of match-like records (e.g., runner.last_docs or DB query results).
      Each item is either a dict carrying a ``match`` dict (directly or under
      ``data``), or an object whose ``.data`` attribute is a dict with a nested
      ``match`` dict.

    Output:
    - Without a DB, or with ``dry_run``: ``{"seen": int, "prepared": int}``.
    - With a DB: one ``teams`` document per team is passed to ``db.insert_many``
      and the summary additionally carries ``"upserted"`` (the value returned
      by ``insert_many``).
    """

    # Runs of whitespace, hyphens, underscores and dots collapse to one space.
    _SEPARATOR_RE = re.compile(r"[\s\-_.]+")
    # Everything that is neither a word character (in any script) nor a space
    # is dropped. An ASCII-only class would reduce "München" to "mnchen" and
    # Cyrillic names to "", merging unrelated teams under one key.
    _NOISE_RE = re.compile(r"[^\w ]+")

    # Side-specific keys that are tried first, in priority order.
    _DIRECT_NAME_KEYS = {
        "home": ("homeTeamName", "homeName", "hostTeamName"),
        "away": ("awayTeamName", "awayName", "guestTeamName"),
    }
    _DIRECT_ID_KEYS = {
        "home": ("homeTeamId", "homeId"),
        "away": ("awayTeamId", "awayId"),
    }
    # Fallback spellings: every prefix is combined with every suffix.
    _SIDE_PREFIXES = {"home": ("home", "host"), "away": ("away", "guest")}
    _NAME_SUFFIXES = ("TeamName", "teamName", "name", "team_name")
    _ID_SUFFIXES = ("TeamId", "teamId", "id", "team_id")

    @classmethod
    def _norm(cls, name: str) -> str:
        """Return the canonical form of ``name`` (lowercase, separators collapsed)."""
        collapsed = cls._SEPARATOR_RE.sub(" ", name.strip().lower())
        return cls._NOISE_RE.sub("", collapsed).strip()

    @staticmethod
    def _as_name(value: Any) -> str | None:
        """Return ``value`` unchanged if it is a non-blank string, else ``None``."""
        if isinstance(value, str) and value.strip():
            return value
        return None

    @staticmethod
    def _as_id(value: Any) -> str | None:
        """Return ``str(value)``; ``None`` and empty results count as missing."""
        if value is None:
            return None
        return str(value) or None

    @staticmethod
    def _first_value(obj: dict, keys: Iterable[str], coerce: Any) -> str | None:
        """Return the first ``coerce(obj[key])`` that is not ``None``."""
        for key in keys:
            value = coerce(obj.get(key))
            if value is not None:
                return value
        return None

    @staticmethod
    def _match_payload(item: Any) -> dict | None:
        """Return the non-empty ``match`` dict carried by ``item``, if any."""
        if isinstance(item, dict):
            candidates = [item.get("match")]
            nested = item.get("data")
        else:
            # Document-like object: the payload lives on the `.data` attribute.
            candidates = []
            nested = getattr(item, "data", None)
        if isinstance(nested, dict):
            candidates.append(nested.get("match"))
        for candidate in candidates:
            if isinstance(candidate, dict) and candidate:
                return candidate
        return None

    @classmethod
    def _extract_side(
        cls, match: dict, side: str
    ) -> tuple[str | None, str | None]:
        """Return ``(raw_name, team_id)`` for ``side`` ("home" or "away").

        Direct keys win; the prefixed fallbacks only fill in what is still
        missing, and the first hit is kept. Unprefixed keys such as ``id`` or
        ``name`` are never read from the match dict itself, because there they
        describe the match (or the other side), not this team.
        """
        name = cls._first_value(match, cls._DIRECT_NAME_KEYS[side], cls._as_name)
        team_id = cls._first_value(match, cls._DIRECT_ID_KEYS[side], cls._as_id)

        for prefix in cls._SIDE_PREFIXES[side]:
            if name is not None and team_id is not None:
                break
            sources = [(match, prefix)]
            # NOTE(review): assumes a dict stored under e.g. "home"/"homeTeam"
            # describes that side's team, so generic keys are safe there.
            for container_key in (prefix, prefix + "Team"):
                nested = match.get(container_key)
                if isinstance(nested, dict):
                    sources.append((nested, ""))
            for obj, key_prefix in sources:
                if name is None:
                    name = cls._first_value(
                        obj,
                        [key_prefix + suffix for suffix in cls._NAME_SUFFIXES],
                        cls._as_name,
                    )
                if team_id is None:
                    team_id = cls._first_value(
                        obj,
                        [key_prefix + suffix for suffix in cls._ID_SUFFIXES],
                        cls._as_id,
                    )
        return name, team_id

    @staticmethod
    def _load_existing_aliases(db: Any, ids: list[str]) -> Dict[str, Set[str]]:
        """Return ``{team_id: aliases}`` for teams already stored in ``db``.

        Best effort: a backend without a callable ``find`` yields no aliases.
        """
        find = getattr(db, "find", None)
        if not callable(find):
            return {}

        query = {"_id": {"$in": ids}}
        try:
            existing = find("teams", query, projection=None, limit=None)
        except TypeError:
            # Fallback signature without projection/limit.
            existing = find("teams", query)

        found: Dict[str, Set[str]] = {}
        for doc in existing or []:
            if isinstance(doc, dict):
                raw_key = doc.get("_id") or doc.get("id") or doc.get("team_id")
                aliases_list = doc.get("aliases")
                nested = doc.get("data")
            else:
                # NOTE(review): assumes Document-like results expose `id` and
                # `data` as attributes, mirroring the constructor used in compute().
                raw_key = getattr(doc, "id", None)
                aliases_list = None
                nested = getattr(doc, "data", None)
            key = str(raw_key or "")
            if not key:
                continue
            if not isinstance(aliases_list, list) and isinstance(nested, dict):
                aliases_list = nested.get("aliases")
            if not isinstance(aliases_list, list):
                aliases_list = []
            found[key] = set(map(str, aliases_list))
        return found

    def compute(self, data: Any, **kwargs: Any) -> Any:
        """Compute team extraction and normalization.

        Args:
            data: Iterable of records containing match payloads; anything that
                is not iterable is treated as empty input.
            **kwargs: Optional parameters:
                db: backend used to read existing aliases and write teams.
                dry_run: when truthy, nothing is read from or written to ``db``.

        Returns:
            ``{"seen": int, "prepared": int}``, where ``seen`` counts input items
            and ``prepared`` counts distinct teams. When teams were written,
            the dict also has ``"upserted"`` set to the result of
            ``db.insert_many``.
        """
        db: BaseDB | None = kwargs.get("db")  # optional DB for persistence
        dry_run = bool(kwargs.get("dry_run", False))

        items: list[Any] = list(data) if isinstance(data, Iterable) else []

        # team_id -> canonical name (first occurrence wins) and raw spellings.
        teams: Dict[str, str] = {}
        aliases_map: Dict[str, Set[str]] = {}
        for item in items:
            match = self._match_payload(item)
            if match is None:
                continue
            for side in ("home", "away"):
                raw_name, raw_id = self._extract_side(match, side)
                if raw_name is None:
                    continue
                canonical = self._norm(raw_name)
                # Fall back to the canonical name when the source has no id.
                team_id = raw_id or canonical
                if not team_id:
                    # Name was pure punctuation and there is no id: no usable key.
                    continue
                teams.setdefault(team_id, canonical)
                aliases_map.setdefault(team_id, set()).add(raw_name)

        summary: Dict[str, Any] = {"seen": len(items), "prepared": len(teams)}
        if dry_run or db is None or not teams:
            return summary

        # Union with stored aliases so repeated runs never drop a spelling.
        existing_aliases = self._load_existing_aliases(db, list(teams))
        docs: list[Document] = [
            Document(
                id=team_id,
                kind="teams",
                data={
                    "team_id": team_id,
                    "name_canonical": canonical,
                    "aliases": sorted(
                        aliases_map[team_id] | existing_aliases.get(team_id, set())
                    ),
                },
            )
            for team_id, canonical in teams.items()
        ]

        summary["upserted"] = db.insert_many(docs)
        return summary
|