Browse Source

为 MongoDB 连接添加索引配置,确保 matchId 唯一性;在调度器中确保索引存在。更新 GetLeagueMatchListSpider 以支持从环境变量读取未来比赛限制,优化未来比赛计数逻辑。

admin 2 tháng trước cách đây
mục cha
commit
ded001848b

+ 14 - 1
scripts/run_scheduler.py

@@ -28,8 +28,21 @@ def main() -> None:
     uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
     name = os.getenv("DATABANK_DB_NAME", "databank")
 
-    db = MongoDB(uri=uri, name=name)
+    db = MongoDB(
+        uri=uri,
+        name=name,
+        indexes={
+            "match": [
+                {
+                    "keys": [("match.matchId", 1)],
+                    "unique": True,
+                    "name": "uniq_match_matchId",
+                }
+            ]
+        },
+    )
     db.connect()
+    db.ensure_indexes()
 
     # Spiders
     get_match = GetLeagueMatchListSpider()

+ 14 - 1
scripts/run_scheduler_full.py

@@ -25,8 +25,21 @@ def main() -> None:
     uri = os.getenv("DATABANK_DB_URI", essential_env["DATABANK_DB_URI"])
     name = os.getenv("DATABANK_DB_NAME", essential_env["DATABANK_DB_NAME"])
 
-    db = MongoDB(uri=uri, name=name)
+    db = MongoDB(
+        uri=uri,
+        name=name,
+        indexes={
+            "match": [
+                {
+                    "keys": [("match.matchId", 1)],
+                    "unique": True,
+                    "name": "uniq_match_matchId",
+                }
+            ]
+        },
+    )
     db.connect()
+    db.ensure_indexes()
 
     # Spiders
     get_match = GetLeagueMatchListSpider()

+ 14 - 1
scripts/run_scheduler_incremental.py

@@ -25,8 +25,21 @@ def main() -> None:
     uri = os.getenv("DATABANK_DB_URI", essential_env["DATABANK_DB_URI"])
     name = os.getenv("DATABANK_DB_NAME", essential_env["DATABANK_DB_NAME"])
 
-    db = MongoDB(uri=uri, name=name)
+    db = MongoDB(
+        uri=uri,
+        name=name,
+        indexes={
+            "match": [
+                {
+                    "keys": [("match.matchId", 1)],
+                    "unique": True,
+                    "name": "uniq_match_matchId",
+                }
+            ]
+        },
+    )
     db.connect()
+    db.ensure_indexes()
 
     # Spiders
     get_match = GetLeagueMatchListSpider()

+ 5 - 0
src/databank/scheduler/simple_runner.py

@@ -129,6 +129,11 @@ class SimpleRunner:
                         if action == "stop_season":
                             key = f"{data.get('league_id', league_id)}|{data.get('season', season)}"
                             stop_seasons.add(key)
+                            print(
+                                f"\n[{spider.name}] stop_season signaled for {key} "
+                                f"(future_count={data.get('future_count')}, "
+                                f"limit={data.get('future_exceed_limit')})"
+                            )
 
                 # update progress metrics after handling the task
                 processed_tasks += 1

+ 30 - 9
src/databank/spiders/get_league_match_list.py

@@ -30,6 +30,7 @@ import json
 import random
 import time
 from typing import Any, Mapping
+import os
 from datetime import datetime, timedelta, timezone
 
 try:  # Optional dependency; guide user to install if missing
@@ -58,7 +59,7 @@ class GetLeagueMatchListSpider(BaseSpider):
     # UTC+8 timezone for date comparisons
     tz = timezone(timedelta(hours=8))
     # When future-dated matches exceed this number in a round, suggest stopping the season
-    future_exceed_limit: int = 5
+    future_exceed_limit: int = int(os.getenv("DATABANK_FUTURE_EXCEED_LIMIT", "5"))
 
     def build_payload(self, task: Task) -> Payload:
         """Build JSON payload from a MatchListTask (structured input)."""
@@ -198,16 +199,14 @@ class GetLeagueMatchListSpider(BaseSpider):
                 )
             ]
 
-        # First stage: league-only and finished-only (elapsedTime == '已完场')
-        stage1 = [
+        # Stage 1: league-only (do NOT filter by elapsedTime yet, to allow future-count)
+        league_items = [
             item
             for item in match_list
-            if isinstance(item, Mapping)
-            and item.get("groupName") == "联赛"
-            and item.get("elapsedTime") == "已完场"
+            if isinstance(item, Mapping) and item.get("groupName") == "联赛"
         ]
 
-        # Second stage: drop future-dated matches and count them
+        # Helper for date extraction
         today = datetime.now(self.tz).date()
 
         def _extract_dt(it: Mapping[str, Any]) -> datetime | None:
@@ -219,6 +218,10 @@ class GetLeagueMatchListSpider(BaseSpider):
                 "beginTime",
                 "match_time",
                 "start_time",
+                "kickOffTime",
+                "kickoffTime",
+                "matchStartTime",
+                "matchBeginTime",
             ):
                 if key not in it:
                     continue
@@ -235,6 +238,15 @@ class GetLeagueMatchListSpider(BaseSpider):
                 # String formats
                 if isinstance(val, str):
                     s = val.strip()
+                    # Numeric string timestamps (sec/ms)
+                    if s.isdigit():
+                        try:
+                            ts = float(s)
+                            if ts > 1e12:
+                                ts /= 1000.0
+                            return datetime.fromtimestamp(ts, tz=self.tz)
+                        except (ValueError, OSError):
+                            pass
                     for fmt in (
                         "%Y-%m-%d %H:%M:%S",
                         "%Y-%m-%d %H:%M",
@@ -258,12 +270,21 @@ class GetLeagueMatchListSpider(BaseSpider):
                         continue
             return None
 
+        # Count future-dated matches across ALL league items (regardless of finished)
         future_count = 0
-        filtered: list[Mapping[str, Any]] = []
-        for it in stage1:
+        for it in league_items:
             dt = _extract_dt(it)
             if dt is not None and dt.date() > today:
                 future_count += 1
+
+        # Stage 2: keep only finished items for persistence, and still guard against future-dated
+        filtered: list[Mapping[str, Any]] = []
+        for it in league_items:
+            if it.get("elapsedTime") != "已完场":
+                continue
+            dt = _extract_dt(it)
+            if dt is not None and dt.date() > today:
+                # Defensive: finished items shouldn't be future, skip if happens
                 continue
             filtered.append(it)