|
@@ -0,0 +1,90 @@
|
|
|
|
|
+"""Webhook reporter implementation using HTTP POST with retry and timeouts."""
|
|
|
|
|
+
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+import logging
|
|
|
|
|
+import time
|
|
|
|
|
+from dataclasses import dataclass
|
|
|
|
|
+from typing import Iterable, Mapping, Optional
|
|
|
|
|
+
|
|
|
|
|
+import requests
|
|
|
|
|
+from requests.adapters import HTTPAdapter
|
|
|
|
|
+from urllib3.util.retry import Retry
|
|
|
|
|
+
|
|
|
|
|
+from claudia.core import RunSummary
|
|
|
|
|
+from .base import BaseReporter
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+@dataclass(slots=True)
|
|
|
|
|
+class WebhookConfig:
|
|
|
|
|
+ """配置 WebhookReporter 的 HTTP 行为。
|
|
|
|
|
+
|
|
|
|
|
+ 字段:
|
|
|
|
|
+ - timeout: 单次请求超时时间(秒)。
|
|
|
|
|
+ - max_retries: 失败重试次数(总共会尝试 max_retries+1 次)。
|
|
|
|
|
+ - backoff_factor: 重试退避因子(仅用于本地循环重试的 sleep)。
|
|
|
|
|
+ - headers: 会话级请求头。
|
|
|
|
|
+ """
|
|
|
|
|
+
|
|
|
|
|
+ timeout: int = 5
|
|
|
|
|
+ max_retries: int = 2
|
|
|
|
|
+ backoff_factor: float = 0.0
|
|
|
|
|
+ headers: Mapping[str, str] | None = None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class WebhookReporter(BaseReporter):
|
|
|
|
|
+ """Webhook 报告器:通过 HTTP POST 上报各类事件。"""
|
|
|
|
|
+
|
|
|
|
|
+ def __init__(
|
|
|
|
|
+ self,
|
|
|
|
|
+ url: str,
|
|
|
|
|
+ config: Optional[WebhookConfig] = None,
|
|
|
|
|
+ session: Optional[requests.Session] = None,
|
|
|
|
|
+ ) -> None:
|
|
|
|
|
+ self._url = url
|
|
|
|
|
+ self._config = config or WebhookConfig()
|
|
|
|
|
+ # 优先使用注入的 session(便于测试),否则创建带 Retry 的 Session
|
|
|
|
|
+ self._session = session or requests.Session()
|
|
|
|
|
+ if session is None:
|
|
|
|
|
+ retry = Retry(
|
|
|
|
|
+ total=self._config.max_retries,
|
|
|
|
|
+ backoff_factor=self._config.backoff_factor,
|
|
|
|
|
+ status_forcelist=(500, 502, 503, 504),
|
|
|
|
|
+ allowed_methods=("POST",),
|
|
|
|
|
+ )
|
|
|
|
|
+ adapter = HTTPAdapter(max_retries=retry)
|
|
|
|
|
+ self._session.mount("http://", adapter)
|
|
|
|
|
+ self._session.mount("https://", adapter)
|
|
|
|
|
+ if self._config.headers:
|
|
|
|
|
+ self._session.headers.update(dict(self._config.headers))
|
|
|
|
|
+
|
|
|
|
|
+ def _post(self, payload: dict) -> bool:
|
|
|
|
|
+ """发送 POST,带有简单的本地重试保护,避免异常冒泡影响主流程。"""
|
|
|
|
|
+ attempts = self._config.max_retries + 1
|
|
|
|
|
+ for attempt in range(1, attempts + 1):
|
|
|
|
|
+ try:
|
|
|
|
|
+ resp = self._session.post(self._url, json=payload, timeout=self._config.timeout)
|
|
|
|
|
+ # 若需要,可强制失败时抛出异常以触发本地重试
|
|
|
|
|
+ if hasattr(resp, "raise_for_status"):
|
|
|
|
|
+ resp.raise_for_status()
|
|
|
|
|
+ return True
|
|
|
|
|
+ except requests.RequestException as exc:
|
|
|
|
|
+ logger.warning("Webhook post failed (attempt %s/%s): %s", attempt, attempts, exc)
|
|
|
|
|
+ if attempt < attempts and self._config.backoff_factor > 0:
|
|
|
|
|
+ time.sleep(self._config.backoff_factor)
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ def notify_start(self, spider_name: str, urls: Iterable[str]) -> None: # pragma: no cover
|
|
|
|
|
+ self._post({"event": "start", "spider": spider_name, "count": len(list(urls))})
|
|
|
|
|
+
|
|
|
|
|
+ def notify_success(self, spider_name: str, count: int) -> None: # pragma: no cover
|
|
|
|
|
+ self._post({"event": "success", "spider": spider_name, "count": count})
|
|
|
|
|
+
|
|
|
|
|
+ def notify_error(self, spider_name: str, error: str) -> None: # pragma: no cover
|
|
|
|
|
+ self._post({"event": "error", "spider": spider_name, "error": error})
|
|
|
|
|
+
|
|
|
|
|
+ def notify_summary(self, summary: RunSummary) -> None: # pragma: no cover
|
|
|
|
|
+ self._post({"event": "summary", "total": summary.total_docs})
|