|
|
@@ -30,6 +30,7 @@ import json
|
|
|
import random
|
|
|
import time
|
|
|
from typing import Any, Mapping
|
|
|
+from datetime import datetime, timedelta, timezone
|
|
|
|
|
|
try: # Optional dependency; guide user to install if missing
|
|
|
import requests
|
|
|
@@ -54,6 +55,11 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
|
|
|
endpoint: str = "https://sport.ttyingqiu.com/sportdata/f?platform=web"
|
|
|
|
|
|
+ # UTC+8 timezone for date comparisons
|
|
|
+ tz = timezone(timedelta(hours=8))
|
|
|
+ # When the number of future-dated matches in a round reaches or exceeds this value, suggest stopping the season
|
|
|
+ future_exceed_limit: int = 5
|
|
|
+
|
|
|
def build_payload(self, task: Task) -> Payload:
|
|
|
"""Build JSON payload from a MatchListTask (structured input)."""
|
|
|
if not isinstance(task, MatchListTask):
|
|
|
@@ -92,8 +98,12 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
try:
|
|
|
timeout = float(self.request_timeout_s or 15.0)
|
|
|
headers: Mapping[str, str] = dict(self.default_headers or {})
|
|
|
+ # Remove internal fields (e.g., `_task`) to avoid JSON serialization issues
|
|
|
+ safe_payload = {
|
|
|
+ k: v for k, v in dict(payload).items() if not str(k).startswith("_")
|
|
|
+ }
|
|
|
resp = requests.post(
|
|
|
- self.endpoint, headers=headers, json=dict(payload), timeout=timeout
|
|
|
+ self.endpoint, headers=headers, json=safe_payload, timeout=timeout
|
|
|
)
|
|
|
resp.raise_for_status()
|
|
|
return resp.text
|
|
|
@@ -106,7 +116,14 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
)
|
|
|
|
|
|
def parse(self, task: Task, content: str, payload: Payload) -> Documents:
|
|
|
- """Parse JSON, filter matchList by groupName == '联赛', return Documents."""
|
|
|
+ """Parse JSON, keep only finished league matches and handle future dates.
|
|
|
+
|
|
|
+ Rules:
|
|
|
+ - Keep items where groupName == '联赛' AND elapsedTime == '已完场'.
|
|
|
+ - Skip matches whose date is after today (UTC+8); count them.
|
|
|
+ - If future_count >= future_exceed_limit, emit a control Document
|
|
|
+ advising to stop the season.
|
|
|
+ """
|
|
|
try:
|
|
|
data = json.loads(content)
|
|
|
except json.JSONDecodeError as exc:
|
|
|
@@ -125,6 +142,9 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
|
|
|
# If fetch reported an error, convert to error document directly
|
|
|
if isinstance(data, Mapping) and "error" in data:
|
|
|
+ safe_payload = {
|
|
|
+ k: v for k, v in dict(payload).items() if not str(k).startswith("_")
|
|
|
+ }
|
|
|
return [
|
|
|
Document(
|
|
|
id=None,
|
|
|
@@ -133,7 +153,7 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
"token": task.token() if hasattr(task, "token") else str(task),
|
|
|
"reason": str(data.get("error")),
|
|
|
"detail": str(data.get("detail")),
|
|
|
- "payload": dict(payload),
|
|
|
+ "payload": safe_payload,
|
|
|
},
|
|
|
)
|
|
|
]
|
|
|
@@ -158,6 +178,9 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
|
|
|
if not match_list:
|
|
|
# Return error document if API failed or schema unexpected
|
|
|
+ safe_payload = {
|
|
|
+ k: v for k, v in dict(payload).items() if not str(k).startswith("_")
|
|
|
+ }
|
|
|
return [
|
|
|
Document(
|
|
|
id=None,
|
|
|
@@ -165,7 +188,7 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
data={
|
|
|
"token": task.token() if hasattr(task, "token") else str(task),
|
|
|
"reason": "no_match_list",
|
|
|
- "payload": dict(payload),
|
|
|
+ "payload": safe_payload,
|
|
|
"raw_keys": (
|
|
|
list(data.keys())
|
|
|
if isinstance(data, dict)
|
|
|
@@ -175,29 +198,113 @@ class GetLeagueMatchListSpider(BaseSpider):
|
|
|
)
|
|
|
]
|
|
|
|
|
|
- # Filter by groupName == "联赛"
|
|
|
- filtered = [
|
|
|
+ # First stage: league-only and finished-only (elapsedTime == '已完场')
|
|
|
+ stage1 = [
|
|
|
item
|
|
|
for item in match_list
|
|
|
- if isinstance(item, Mapping) and item.get("groupName") == "联赛"
|
|
|
+ if isinstance(item, Mapping)
|
|
|
+ and item.get("groupName") == "联赛"
|
|
|
+ and item.get("elapsedTime") == "已完场"
|
|
|
]
|
|
|
|
|
|
+ # Second stage: drop future-dated matches and count them
|
|
|
+ today = datetime.now(self.tz).date()
|
|
|
+
|
|
|
def _extract_dt(it: Mapping[str, Any]) -> datetime | None:
    """Return the first parseable date/time found in ``it``, expressed in ``self.tz``.

    A fixed list of candidate field names is probed in order. For each
    field that is present:

    - Numbers are treated as epoch timestamps; values above ``1e12`` are
      assumed to be milliseconds and scaled down to seconds.
    - Strings are tried against a set of ``strptime`` layouts (interpreted
      as ``self.tz`` wall-clock time) and then ``datetime.fromisoformat``;
      ISO values that already carry an offset are converted to ``self.tz``.

    A field whose value cannot be interpreted is skipped and the next
    candidate is tried. Returns ``None`` when no field yields a datetime.
    """
    candidate_keys = (
        "matchTime",
        "matchDate",
        "startTime",
        "gameTime",
        "beginTime",
        "match_time",
        "start_time",
    )
    text_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y/%m/%d %H:%M:%S",
        "%Y/%m/%d %H:%M",
        "%Y-%m-%d",
        "%Y/%m/%d",
    )
    for key in candidate_keys:
        if key not in it:
            continue
        val = it.get(key)
        # bool is a subclass of int; JSON true/false must not be read as
        # epoch seconds 1/0.
        if isinstance(val, bool):
            continue
        # Timestamp numbers (sec or ms)
        if isinstance(val, (int, float)):
            try:
                # float() itself overflows on very large JSON integers, so
                # it has to live inside the guarded region.
                ts = float(val)
                if ts > 1e12:  # likely ms
                    ts /= 1000.0
                return datetime.fromtimestamp(ts, tz=self.tz)
            except (ValueError, OverflowError, OSError):
                # NaN, Infinity and out-of-range values are unusable here;
                # fall through to the next candidate field.
                continue
        # String formats
        if isinstance(val, str):
            s = val.strip()
            for fmt in text_formats:
                try:
                    return datetime.strptime(s, fmt).replace(tzinfo=self.tz)
                except ValueError:
                    pass
            try:
                dt_iso = datetime.fromisoformat(s)
                if dt_iso.tzinfo is None:
                    dt_iso = dt_iso.replace(tzinfo=self.tz)
                else:
                    dt_iso = dt_iso.astimezone(self.tz)
                return dt_iso
            except (ValueError, OverflowError):
                # OverflowError: offset conversion can leave the supported
                # datetime range for values near datetime.min/max.
                continue
    return None
|
|
|
+
|
|
|
+ future_count = 0
|
|
|
+ filtered: list[Mapping[str, Any]] = []
|
|
|
+ for it in stage1:
|
|
|
+ dt = _extract_dt(it)
|
|
|
+ if dt is not None and dt.date() > today:
|
|
|
+ future_count += 1
|
|
|
+ continue
|
|
|
+ filtered.append(it)
|
|
|
+
|
|
|
docs: list[Document] = []
|
|
|
for item in filtered:
|
|
|
doc_id = (
|
|
|
str(item.get("matchId")) if item.get("matchId") is not None else None
|
|
|
)
|
|
|
+ # Ensure to store payload without internal fields
|
|
|
+ safe_payload = {
|
|
|
+ k: v for k, v in dict(payload).items() if not str(k).startswith("_")
|
|
|
+ }
|
|
|
docs.append(
|
|
|
Document(
|
|
|
id=doc_id,
|
|
|
kind="match",
|
|
|
data={
|
|
|
"token": task.token() if hasattr(task, "token") else str(task),
|
|
|
- "payload": dict(payload),
|
|
|
+ "payload": safe_payload,
|
|
|
"match": dict(item),
|
|
|
},
|
|
|
)
|
|
|
)
|
|
|
+ # Append a control document if threshold reached
|
|
|
+ if future_count >= self.future_exceed_limit:
|
|
|
+ docs.append(
|
|
|
+ Document(
|
|
|
+ id=None,
|
|
|
+ kind="control",
|
|
|
+ data={
|
|
|
+ "action": "stop_season",
|
|
|
+ "reason": "too_many_future_matches",
|
|
|
+ "future_count": future_count,
|
|
|
+ "future_exceed_limit": self.future_exceed_limit,
|
|
|
+ "season": getattr(task, "season", None),
|
|
|
+ "league_id": getattr(task, "league_id", None),
|
|
|
+ "round_no": getattr(task, "round_no", None),
|
|
|
+ },
|
|
|
+ )
|
|
|
+ )
|
|
|
+
|
|
|
return docs
|
|
|
|
|
|
|