
Refine the project structure and add core functionality with tests, including database, spider, reporter, and scheduler implementations; update the development guide and configuration files

admin, 3 months ago
commit 84873d64a1

+ 6 - 1
.vscode/settings.json

@@ -3,5 +3,10 @@
   "python.terminal.useEnvFile": true,
   "python.analysis.extraPaths": [
     "${workspaceFolder}/src"
-  ]
+  ],
+  "editor.defaultFormatter": "ms-python.black-formatter",
+  "editor.formatOnSave": true,
+  "python.linting.enabled": false,
+  "ruff.lint.run": "onSave",
+  "ruff.configuration": "${workspaceFolder}/pyproject.toml"
 }

+ 28 - 0
.vscode/tasks.json

@@ -0,0 +1,28 @@
+{
+  // VS Code tasks for common project actions
+  "version": "2.0.0",
+  "tasks": [
+    {
+      "label": "pytest",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": ["-m", "pytest", "-q"],
+      "group": "build",
+  "problemMatcher": ["$python"]
+    },
+    {
+      "label": "pylint:src",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": ["-m", "pylint", "src/claudia", "--score=n", "--rcfile", "pylintrc"],
+      "group": "build"
+    },
+    {
+      "label": "pylint:tests",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": ["-m", "pylint", "tests", "--score=n", "--rcfile", "pylintrc"],
+      "group": "build"
+    }
+  ]
+}

+ 51 - 1
README.md

@@ -1,3 +1,53 @@
 # claudia
 
-Automation platform for tendering and bidding information
+Automation platform for tendering and bidding information
+
+## Development Guide
+
+This project integrates Ruff (with isort rules) and Black for linting and formatting; VS Code formats on save and runs Ruff checks automatically.
+
+### Install development dependencies
+
+```pwsh
+pip install -e .[dev]
+```
+
+### Linting (Ruff)
+
+Check only:
+
+```pwsh
+python -m ruff check .
+```
+
+Auto-fix:
+
+```pwsh
+python -m ruff check . --fix
+```
+
+### Formatting (Black)
+
+Format:
+
+```pwsh
+python -m black .
+```
+
+Check only:
+
+```pwsh
+python -m black --check .
+```
+
+### Run tests
+
+```pwsh
+python -m pytest -q
+```
+
+### VS Code settings (enabled)
+- `editor.defaultFormatter = "ms-python.black-formatter"`
+- `editor.formatOnSave = true`
+- `ruff.lint.run = "onSave"`
+- `python.terminal.useEnvFile = true` (uses `PYTHONPATH=src` from `.env`)

+ 8 - 0
pylintrc

@@ -0,0 +1,8 @@
+[MASTER]
+# Ensure pylint can import the project package by adding src to sys.path
+init-hook=
+    import sys, os; sys.path.insert(0, os.path.abspath("src"))
+
+# You can tweak further global settings here as needed
+# good-names=i,j,k,ex,_,db
+# ignore-patterns=venv,\.venv,build,dist

+ 40 - 1
pyproject.toml

@@ -6,9 +6,18 @@ readme = "README.md"
 requires-python = ">=3.9"
 keywords = ["scaffold", "template"]
 license = { file = "LICENSE" }
+dependencies = [
+	"requests>=2.31",
+	"pymongo>=4.6",
+]
 
 [project.optional-dependencies]
-dev = ["pytest>=7"]
+dev = [
+	"pytest>=7",
+	"ruff>=0.5",
+	"black>=24.3",
+	"pylint>=3.2",
+]
 
 [build-system]
 requires = ["setuptools>=61.0"]
@@ -20,3 +29,33 @@ claudia = "claudia.__main__:main"
 [tool.pytest.ini_options]
 addopts = "-q"
 pythonpath = ["src"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP"]
+ignore = []
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "lf"
+
+[tool.black]
+line-length = 100
+target-version = ["py39"]
+include = "\\.(py|pyi)$"
+exclude = '''
+/(
+	\.git
+| \.venv
+| build
+| dist
+| __pycache__
+| \.mypy_cache
+| \.ruff_cache
+)
+'''

+ 19 - 1
src/claudia/__init__.py

@@ -2,6 +2,24 @@
 
 __version__ = "0.1.0"
 
+from .core import Document, RunSummary
+from .spiders import BaseSpider, ExampleSpider
+from .db import BaseDB, MemoryDB, MongoDB
+from .reporter import BaseReporter, MemoryReporter, WebhookReporter, EmailReporter
+from .scheduler import Runner
+
 __all__ = [
-    # public API placeholders will be added here later
+    "__version__",
+    "Document",
+    "RunSummary",
+    "BaseSpider",
+    "ExampleSpider",
+    "BaseDB",
+    "MemoryDB",
+    "MongoDB",
+    "BaseReporter",
+    "MemoryReporter",
+    "WebhookReporter",
+    "EmailReporter",
+    "Runner",
 ]

+ 4 - 0
src/claudia/__main__.py

@@ -1,4 +1,8 @@
+"""Entry point for running Claudia as a module."""
+
+
 def main() -> int:
+    """Simple entry function printing readiness message and returning exit code."""
     print("Claudia package is ready.")
     return 0
 

+ 5 - 0
src/claudia/core/__init__.py

@@ -0,0 +1,5 @@
+"""Core models public API exports for Claudia."""
+
+from .models import Document, RunSummary
+
+__all__ = ["Document", "RunSummary"]

+ 29 - 0
src/claudia/core/models.py

@@ -0,0 +1,29 @@
+"""Core data models used across Claudia (Document, RunSummary)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class Document:
+    """Unified output data model."""
+
+    id: Optional[str]
+    url: str
+    payload: Dict[str, Any] = field(default_factory=dict)
+    data: Dict[str, Any] = field(default_factory=dict)
+    fetched_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+@dataclass
+class RunSummary:
+    """Summary of a single scheduler run."""
+
+    started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    finished_at: Optional[datetime] = None
+    total_docs: int = 0
+    per_spider: Dict[str, int] = field(default_factory=dict)
+    errors: List[str] = field(default_factory=list)
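
These two dataclasses are the data contract shared by spiders, database backends, and reporters. A minimal sketch of how they are typically populated (all values below are illustrative, not part of the commit):

```python
from claudia.core import Document, RunSummary

# A spider would normally build this via BaseSpider.make_document(); the values are made up.
doc = Document(id=None, url="https://example.com/a", payload={"q": "demo"}, data={"length": 42})

summary = RunSummary()
summary.total_docs = 1
summary.per_spider["example"] = 1
print(doc.url, doc.fetched_at, summary.total_docs)
```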

+ 7 - 0
src/claudia/db/__init__.py

@@ -0,0 +1,7 @@
+"""Database backends public API exports for Claudia."""
+
+from .base import BaseDB
+from .memory import MemoryDB
+from .mongo import MongoDB
+
+__all__ = ["BaseDB", "MemoryDB", "MongoDB"]

+ 24 - 0
src/claudia/db/base.py

@@ -0,0 +1,24 @@
+"""Database abstraction layer (BaseDB)."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from claudia.core import Document
+
+
+class BaseDB(ABC):
+    """数据库抽象基类:定义连接、批量写入与关闭接口。"""
+
+    @abstractmethod
+    def connect(self) -> None:
+        """Establish a connection to backend storage if needed."""
+
+    @abstractmethod
+    def insert_many(self, docs: Iterable[Document]) -> int:
+        """Insert multiple documents and return the number of inserted items."""
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the connection and release resources if needed."""

+ 31 - 0
src/claudia/db/memory.py

@@ -0,0 +1,31 @@
+"""In-memory DB implementation for testing and demos."""
+
+from __future__ import annotations
+
+from typing import Iterable, List
+
+from claudia.core import Document
+from .base import BaseDB
+
+
+class MemoryDB(BaseDB):
+    """内存数据库:简易列表存储,实现 BaseDB 接口,用于测试或演示。"""
+
+    def __init__(self) -> None:
+        self._items: List[Document] = []
+
+    def connect(self) -> None:  # pragma: no cover
+        return None
+
+    def insert_many(self, docs: Iterable[Document]) -> int:
+        before = len(self._items)
+        self._items.extend(docs)
+        return len(self._items) - before
+
+    def close(self) -> None:  # pragma: no cover
+        return None
+
+    @property
+    def items(self) -> List[Document]:
+        """Return a shallow copy of stored documents (for tests)."""
+        return list(self._items)

+ 61 - 0
src/claudia/db/mongo.py

@@ -0,0 +1,61 @@
+"""MongoDB persistence implementation for Claudia."""
+
+from __future__ import annotations
+
+from typing import Iterable
+
+from pymongo import MongoClient, UpdateOne
+from pymongo.collection import Collection
+
+from claudia.core import Document
+from .base import BaseDB
+
+
+class MongoDB(BaseDB):
+    """MongoDB 实现:提供连接、唯一索引初始化与基于 (url, fetched_at) 的幂等 upsert。"""
+
+    def __init__(self, uri: str, db_name: str, collection: str) -> None:
+        self._uri = uri
+        self._db_name = db_name
+        self._col_name = collection
+        self._client: MongoClient | None = None
+        self._col: Collection | None = None
+
+    def connect(self) -> None:
+        self._client = MongoClient(self._uri)
+        self._col = self._client[self._db_name][self._col_name]
+        # Initialize indexes:
+        # 1) an explicitly provided _id is unique by itself;
+        # 2) add a compound unique index on (url, fetched_at) to prevent duplicate writes.
+        assert self._col is not None
+        self._col.create_index([("url", 1), ("fetched_at", 1)], unique=True, name="uniq_url_time")
+
+    def insert_many(self, docs: Iterable[Document]) -> int:
+        assert self._col is not None, "Call connect() first"
+        ops = []
+        for d in docs:
+            body = {
+                "url": d.url,
+                "payload": d.payload,
+                "data": d.data,
+                "fetched_at": d.fetched_at,
+            }
+            if d.id is not None:
+                body["_id"] = d.id  # set only when provided; else MongoDB generates an ObjectId
+            # Upsert keyed on (url, fetched_at) to avoid duplicate writes
+            ops.append(
+                UpdateOne(
+                    {"url": d.url, "fetched_at": d.fetched_at},
+                    {"$setOnInsert": body},
+                    upsert=True,
+                )
+            )
+        if not ops:
+            return 0
+        result = self._col.bulk_write(ops, ordered=False)
+        # upserted_count is the number of newly inserted documents
+        return int(getattr(result, "upserted_count", 0))
+
+    def close(self) -> None:
+        if self._client is not None:
+            self._client.close()
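
A minimal usage sketch for the MongoDB backend; the URI, database, and collection names are placeholders, and a reachable MongoDB instance is assumed:

```python
from claudia.core import Document
from claudia.db import MongoDB

# Placeholder connection details; adjust to your environment.
db = MongoDB(uri="mongodb://localhost:27017", db_name="claudia", collection="documents")
db.connect()  # also creates the unique (url, fetched_at) index
inserted = db.insert_many([Document(id=None, url="https://example.com/a")])
print("newly inserted:", inserted)  # the same (url, fetched_at) pair is never inserted twice
db.close()
```

Because `insert_many` uses `$setOnInsert` with upsert, re-running the same batch is idempotent.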

+ 14 - 0
src/claudia/reporter/__init__.py

@@ -0,0 +1,14 @@
+"""Reporter public API exports for Claudia."""
+
+from .base import BaseReporter
+from .memory import MemoryReporter
+from .webhook import WebhookReporter, WebhookConfig
+from .email import EmailReporter
+
+__all__ = [
+    "BaseReporter",
+    "MemoryReporter",
+    "WebhookReporter",
+    "WebhookConfig",
+    "EmailReporter",
+]

+ 28 - 0
src/claudia/reporter/base.py

@@ -0,0 +1,28 @@
+"""Reporter abstraction layer (BaseReporter)."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from claudia.core import RunSummary
+
+
+class BaseReporter(ABC):
+    """报告器抽象基类:定义任务生命周期的通知接口。"""
+
+    @abstractmethod
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:
+        """Called before a spider starts with the list of target URLs."""
+
+    @abstractmethod
+    def notify_success(self, spider_name: str, count: int) -> None:
+        """Called when a spider successfully inserts documents; `count` is number inserted."""
+
+    @abstractmethod
+    def notify_error(self, spider_name: str, error: str) -> None:
+        """Called when a spider run raises an error; `error` is a short string."""
+
+    @abstractmethod
+    def notify_summary(self, summary: RunSummary) -> None:
+        """Called after the whole run is finished with a `RunSummary`."""

+ 28 - 0
src/claudia/reporter/email.py

@@ -0,0 +1,28 @@
+"""Email reporter (placeholder) for later SMTP/API integration."""
+
+from __future__ import annotations
+
+from typing import Iterable
+
+from claudia.core import RunSummary
+from .base import BaseReporter
+
+
+class EmailReporter(BaseReporter):
+    """邮件报告器(占位实现):可集成 SMTP 或企业邮 API。"""
+
+    def __init__(self, to: str) -> None:
+        self._to = to
+
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:  # pragma: no cover
+        # Intentionally left empty; a real project could hook in SMTP or a corporate mail API
+        pass
+
+    def notify_success(self, spider_name: str, count: int) -> None:  # pragma: no cover
+        pass
+
+    def notify_error(self, spider_name: str, error: str) -> None:  # pragma: no cover
+        pass
+
+    def notify_summary(self, summary: RunSummary) -> None:  # pragma: no cover
+        pass

+ 27 - 0
src/claudia/reporter/memory.py

@@ -0,0 +1,27 @@
+"""In-memory reporter implementation for testing."""
+
+from __future__ import annotations
+
+from typing import Iterable, List
+
+from claudia.core import RunSummary
+from .base import BaseReporter
+
+
+class MemoryReporter(BaseReporter):
+    """内存报告器:将通知事件保存在内存列表中,便于测试断言。"""
+
+    def __init__(self) -> None:
+        self.events: List[str] = []
+
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:
+        self.events.append(f"start:{spider_name}:{len(list(urls))}")
+
+    def notify_success(self, spider_name: str, count: int) -> None:
+        self.events.append(f"success:{spider_name}:{count}")
+
+    def notify_error(self, spider_name: str, error: str) -> None:
+        self.events.append(f"error:{spider_name}:{error}")
+
+    def notify_summary(self, summary: RunSummary) -> None:
+        self.events.append(f"summary:{summary.total_docs}")

+ 90 - 0
src/claudia/reporter/webhook.py

@@ -0,0 +1,90 @@
+"""Webhook reporter implementation using HTTP POST with retry and timeouts."""
+
+from __future__ import annotations
+
+import logging
+import time
+from dataclasses import dataclass
+from typing import Iterable, Mapping, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from claudia.core import RunSummary
+from .base import BaseReporter
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class WebhookConfig:
+    """HTTP behaviour settings for WebhookReporter.
+
+    Fields:
+    - timeout: per-request timeout in seconds.
+    - max_retries: number of retries on failure (max_retries + 1 attempts in total).
+    - backoff_factor: retry backoff factor (only used for the sleep in the local retry loop).
+    - headers: session-level request headers.
+    """
+
+    timeout: int = 5
+    max_retries: int = 2
+    backoff_factor: float = 0.0
+    headers: Mapping[str, str] | None = None
+
+
+class WebhookReporter(BaseReporter):
+    """Webhook 报告器:通过 HTTP POST 上报各类事件。"""
+
+    def __init__(
+        self,
+        url: str,
+        config: Optional[WebhookConfig] = None,
+        session: Optional[requests.Session] = None,
+    ) -> None:
+        self._url = url
+        self._config = config or WebhookConfig()
+        # Prefer an injected session (handy for tests); otherwise create a Session with Retry
+        self._session = session or requests.Session()
+        if session is None:
+            retry = Retry(
+                total=self._config.max_retries,
+                backoff_factor=self._config.backoff_factor,
+                status_forcelist=(500, 502, 503, 504),
+                allowed_methods=("POST",),
+            )
+            adapter = HTTPAdapter(max_retries=retry)
+            self._session.mount("http://", adapter)
+            self._session.mount("https://", adapter)
+        if self._config.headers:
+            self._session.headers.update(dict(self._config.headers))
+
+    def _post(self, payload: dict) -> bool:
+        """发送 POST,带有简单的本地重试保护,避免异常冒泡影响主流程。"""
+        attempts = self._config.max_retries + 1
+        for attempt in range(1, attempts + 1):
+            try:
+                resp = self._session.post(self._url, json=payload, timeout=self._config.timeout)
+                # Raise on error status if possible so the local retry loop is triggered
+                if hasattr(resp, "raise_for_status"):
+                    resp.raise_for_status()
+                return True
+            except requests.RequestException as exc:
+                logger.warning("Webhook post failed (attempt %s/%s): %s", attempt, attempts, exc)
+                if attempt < attempts and self._config.backoff_factor > 0:
+                    time.sleep(self._config.backoff_factor)
+        return False
+
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:  # pragma: no cover
+        self._post({"event": "start", "spider": spider_name, "count": len(list(urls))})
+
+    def notify_success(self, spider_name: str, count: int) -> None:  # pragma: no cover
+        self._post({"event": "success", "spider": spider_name, "count": count})
+
+    def notify_error(self, spider_name: str, error: str) -> None:  # pragma: no cover
+        self._post({"event": "error", "spider": spider_name, "error": error})
+
+    def notify_summary(self, summary: RunSummary) -> None:  # pragma: no cover
+        self._post({"event": "summary", "total": summary.total_docs})

+ 5 - 0
src/claudia/scheduler/__init__.py

@@ -0,0 +1,5 @@
+"""Scheduler public API exports for Claudia."""
+
+from .runner import Runner
+
+__all__ = ["Runner"]

+ 50 - 0
src/claudia/scheduler/runner.py

@@ -0,0 +1,50 @@
+"""Scheduler runner orchestrating spiders, database, and reporters."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Dict, Iterable, List
+
+from claudia.core import Document, RunSummary
+from claudia.db import BaseDB
+from claudia.reporter import BaseReporter
+from claudia.spiders import BaseSpider
+
+
+class Runner:  # pylint: disable=too-few-public-methods
+    """调度器:负责协调 Spider -> DB -> Reporter 的执行流程并产出汇总。"""
+
+    def __init__(self, db: BaseDB, reporters: Iterable[BaseReporter]) -> None:
+        """Initialize runner with a database backend and reporters."""
+        self._db = db
+        self._reporters = list(reporters)
+
+    def run(self, spiders: Dict[BaseSpider, List[str]]) -> RunSummary:
+        """Run all spiders with their URL lists and persist results, returning summary."""
+        summary = RunSummary()
+        self._db.connect()
+        try:
+            total = 0
+            for spider, urls in spiders.items():
+                for r in self._reporters:
+                    r.notify_start(spider.name, urls)
+                try:
+                    docs: List[Document] = spider.run(urls)
+                    inserted = self._db.insert_many(docs)
+                    total += inserted
+                    summary.per_spider[spider.name] = (
+                        summary.per_spider.get(spider.name, 0) + inserted
+                    )
+                    for r in self._reporters:
+                        r.notify_success(spider.name, inserted)
+                except Exception as exc:  # pylint: disable=broad-except
+                    summary.errors.append(f"{spider.name}: {exc}")
+                    for r in self._reporters:
+                        r.notify_error(spider.name, str(exc))
+            summary.total_docs = total
+        finally:
+            summary.finished_at = datetime.now(timezone.utc)
+            for r in self._reporters:
+                r.notify_summary(summary)
+            self._db.close()
+        return summary
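
Putting the pieces together, a wiring sketch for a full run; the MongoDB URI and webhook endpoint are placeholders, and the in-memory variants used in the tests plug in the same way:

```python
from claudia.db import MongoDB
from claudia.reporter import WebhookReporter
from claudia.scheduler import Runner
from claudia.spiders import ExampleSpider

db = MongoDB("mongodb://localhost:27017", "claudia", "documents")  # placeholder URI
reporter = WebhookReporter("https://hooks.example.com/claudia")    # placeholder endpoint
runner = Runner(db=db, reporters=[reporter])

summary = runner.run({ExampleSpider(): ["https://example.com/a", "https://example.com/b"]})
print(summary.total_docs, summary.per_spider, summary.errors)
```

The Runner closes the database and emits the summary in its `finally` block, so reporters are notified even when a spider fails.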

+ 7 - 0
src/claudia/spiders/__init__.py

@@ -0,0 +1,7 @@
+"""Spiders public API exports for Claudia."""
+
+from .base import BaseSpider
+from .example import ExampleSpider
+from .http import HTTPSpider, HTTPConfig
+
+__all__ = ["BaseSpider", "ExampleSpider", "HTTPSpider", "HTTPConfig"]

+ 40 - 0
src/claudia/spiders/base.py

@@ -0,0 +1,40 @@
+"""Spider abstraction layer (BaseSpider) for Claudia."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterable, List
+
+from claudia.core import Document
+
+
+class BaseSpider(ABC):
+    """爬虫抽象基类:输入 url,子类自定义 payload 与解析。"""
+
+    name: str = "base"
+
+    @abstractmethod
+    def build_payload(self, url: str) -> Dict[str, Any]:
+        """根据 url 生成请求 payload。"""
+
+    @abstractmethod
+    def parse(self, url: str, content: str, payload: Dict[str, Any]) -> List[Document]:
+        """解析内容为 Document 列表。"""
+
+    def fetch(self, url: str, payload: Dict[str, Any]) -> str:
+        """默认的抓取实现(示例:返回一个假内容,避免真实网络)。子类可覆盖。"""
+        return f"<html><title>Dummy</title><body>url={url};payload={payload}</body></html>"
+
+    def make_document(self, url: str, payload: Dict[str, Any], data: Dict[str, Any]) -> Document:
+        """构造一个标准 Document,供子类在 `parse` 中复用,减少重复代码。"""
+        return Document(id=None, url=url, payload=payload, data=data)
+
+    def run(self, urls: Iterable[str]) -> List[Document]:
+        """Execute the spider for all given URLs and return aggregated documents."""
+        results: List[Document] = []
+        for url in urls:
+            payload = self.build_payload(url)
+            content = self.fetch(url, payload)
+            docs = self.parse(url, content, payload)
+            results.extend(docs)
+        return results

+ 21 - 0
src/claudia/spiders/example.py

@@ -0,0 +1,21 @@
+"""Example spider demonstrating payload building and parsing without real network."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from claudia.core import Document
+from .base import BaseSpider
+
+
+class ExampleSpider(BaseSpider):
+    """示例爬虫:不发起真实网络请求,演示 payload 构造与解析流程。"""
+
+    name = "example"
+
+    def build_payload(self, url: str) -> Dict[str, Any]:
+        return {"q": "demo", "source": "example"}
+
+    def parse(self, url: str, content: str, payload: Dict[str, Any]) -> List[Document]:
+        # Simple example: record the content length and a short preview in data
+        return [self.make_document(url, payload, {"length": len(content), "preview": content[:40]})]

+ 82 - 0
src/claudia/spiders/http.py

@@ -0,0 +1,82 @@
+"""HTTP-based spider using requests with retry and timeouts."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Mapping, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from claudia.core import Document
+from .base import BaseSpider
+
+
+@dataclass
+class HTTPConfig:
+    """HTTP 请求爬虫的配置。
+
+    字段说明:
+    - method: 请求方法,支持 'GET' 或 'POST'。
+    - headers: 会话级请求头,将在初始化时合并到 Session。
+    - timeout: 单次请求的超时时间(秒)。
+    - max_retries: 遇到可重试状态码时的最大重试次数。
+    - backoff_factor: 重试退避因子,配合 urllib3 的 Retry 使用。
+    """
+
+    method: str = "GET"  # or POST
+    headers: Mapping[str, str] | None = None
+    timeout: int = 10
+    max_retries: int = 2
+    backoff_factor: float = 0.5
+
+
+class HTTPSpider(BaseSpider):
+    """基于 requests 的通用 HTTP 爬虫。
+
+    功能特性:
+    - 支持 GET/POST 两种方法。
+    - 支持会话级请求头合并、请求超时与失败重试(500/502/503/504)。
+    - 通过 `build_payload` 生成查询或请求体,`parse` 产出 `Document` 列表。
+
+    属性:
+    - config: `HTTPConfig` 实例,控制方法、头、超时与重试等。
+    - _session: `requests.Session`,已挂载带 `Retry` 的 `HTTPAdapter`。
+
+    用法:
+    - 覆写 `parse` 以根据响应内容构造领域数据;必要时覆写 `build_payload`。
+    - 通过 `HTTPConfig(method='POST')` 切换为 POST 请求,`headers` 用于统一附加头部。
+    """
+
+    name = "http"
+
+    def __init__(self, config: Optional[HTTPConfig] = None) -> None:
+        self.config = config or HTTPConfig()
+        self._session = requests.Session()
+        retry = Retry(
+            total=self.config.max_retries,
+            backoff_factor=self.config.backoff_factor,
+            status_forcelist=(500, 502, 503, 504),
+            allowed_methods=("GET", "POST"),
+        )
+        adapter = HTTPAdapter(max_retries=retry)
+        self._session.mount("http://", adapter)
+        self._session.mount("https://", adapter)
+        if self.config.headers:
+            self._session.headers.update(dict(self.config.headers))
+
+    def build_payload(self, url: str) -> Dict[str, Any]:
+        return {}
+
+    def fetch(self, url: str, payload: Dict[str, Any]) -> str:
+        if self.config.method.upper() == "POST":
+            resp = self._session.post(url, json=payload or None, timeout=self.config.timeout)
+        else:
+            resp = self._session.get(url, params=payload or None, timeout=self.config.timeout)
+        resp.raise_for_status()
+        return resp.text
+
+    def parse(self, url: str, content: str, payload: Dict[str, Any]) -> List[Document]:
+        # Example: simply record the content length
+        return [self.make_document(url, payload, {"length": len(content)})]
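
As the HTTPSpider docstring suggests, a concrete spider mainly overrides `parse`. A hedged sketch (the `TitleSpider` class, its crude `<title>` extraction, and the header values are illustrative, not part of the commit):

```python
from typing import Any, Dict, List

from claudia.core import Document
from claudia.spiders import HTTPConfig, HTTPSpider


class TitleSpider(HTTPSpider):
    """Illustrative subclass: pulls a crude <title> out of the fetched HTML."""

    name = "title"

    def parse(self, url: str, content: str, payload: Dict[str, Any]) -> List[Document]:
        start, end = content.find("<title>"), content.find("</title>")
        title = content[start + 7 : end] if start != -1 and end != -1 else ""
        return [self.make_document(url, payload, {"title": title})]


spider = TitleSpider(HTTPConfig(method="GET", timeout=10, headers={"User-Agent": "claudia/0.1"}))
# docs = spider.run(["https://example.com"])  # would perform real HTTP GETs
```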

+ 1 - 0
tests/__init__.py

@@ -0,0 +1 @@
+"""Test package marker for pylint compatibility."""

+ 3 - 0
tests/conftest.py

@@ -1,8 +1,11 @@
+"""Pytest configuration helpers for path setup to include src/ on sys.path."""
+
 import sys
 from pathlib import Path
 
 
 def _ensure_src_on_syspath():
+    """Prepend the repository src/ path onto sys.path for import resolution."""
     root = Path(__file__).resolve().parents[1]
     src = root / "src"
     src_str = str(src)

+ 39 - 0
tests/test_http_spider.py

@@ -0,0 +1,39 @@
+"""HTTPSpider integration test with mocked network calls."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+import types
+
+from claudia.spiders import HTTPSpider, HTTPConfig
+from claudia.db import MemoryDB
+from claudia.scheduler import Runner
+from claudia.reporter import MemoryReporter
+
+
+def test_http_spider_parse_and_runner_integration():
+    """Mock HTTPSpider.fetch to avoid real network and validate pipeline flow."""
+    spider = HTTPSpider(HTTPConfig(method="GET"))
+
+    def fake_fetch(_url: str, _payload: Dict[str, Any]) -> str:
+        return "<html><body>hello</body></html>"
+
+    # Monkeypatch the instance method
+    spider.fetch = types.MethodType(lambda self, url, payload: fake_fetch(url, payload), spider)
+
+    db = MemoryDB()
+    reporter = MemoryReporter()
+    runner = Runner(db=db, reporters=[reporter])
+
+    urls = ["https://example.com/a", "https://example.com/b"]
+    summary = runner.run({spider: urls})
+
+    assert summary.total_docs == 2
+    assert summary.per_spider.get(spider.name) == 2
+    assert not summary.errors
+
+    # Reporter events were recorded
+    assert any(e.startswith("start:") for e in reporter.events)
+    assert any(e.startswith("success:") for e in reporter.events)
+    assert any(e.startswith("summary:") for e in reporter.events)

+ 4 - 0
tests/test_imports.py

@@ -1,12 +1,16 @@
+"""Pylint-friendly smoke tests for package imports and entry point."""
+
 import importlib
 
 
 def test_import_package():
+    """Module can be imported and contains a version."""
     mod = importlib.import_module("claudia")
     assert hasattr(mod, "__version__")
 
 
 def test_entry_point_main_returns_zero():
+    """Entry point function `main` exists and returns zero."""
     entry = importlib.import_module("claudia.__main__")
     assert callable(entry.main)
     assert entry.main() == 0

+ 32 - 0
tests/test_pipeline.py

@@ -0,0 +1,32 @@
+"""End-to-end pipeline test for Claudia runner and components."""
+
+from __future__ import annotations
+
+from claudia.db import MemoryDB
+from claudia.reporter import MemoryReporter
+from claudia.scheduler import Runner
+from claudia.spiders import ExampleSpider
+
+
+def test_pipeline_end_to_end():
+    """Run an example spider through the runner and assert outcomes."""
+    spider = ExampleSpider()
+    db = MemoryDB()
+    reporter = MemoryReporter()
+    runner = Runner(db=db, reporters=[reporter])
+
+    urls = [
+        "https://example.com/a",
+        "https://example.com/b",
+    ]
+    summary = runner.run({spider: urls})
+
+    # Inserted count should equal parsed output (one document per URL)
+    assert summary.total_docs == 2
+    assert summary.per_spider.get(spider.name) == 2
+    assert not summary.errors
+
+    # The reporter should record at least start/success/summary events
+    assert any(e.startswith("start:") for e in reporter.events)
+    assert any(e.startswith("success:") for e in reporter.events)
+    assert any(e.startswith("summary:") for e in reporter.events)

+ 42 - 0
tests/test_runner_errors.py

@@ -0,0 +1,42 @@
+"""Runner error handling test: reporters receive error notifications and summary collects errors."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from claudia.core import Document
+from claudia.spiders import BaseSpider
+from claudia.db import MemoryDB
+from claudia.scheduler import Runner
+from claudia.reporter import MemoryReporter
+
+
+class BoomSpider(BaseSpider):
+    """Spider that intentionally raises to exercise Runner error handling."""
+
+    name = "boom"
+
+    def build_payload(self, url: str) -> Dict[str, Any]:
+        return {}
+
+    def parse(
+        self, url: str, content: str, payload: Dict[str, Any]
+    ) -> List[Document]:  # pragma: no cover
+        return []
+
+    def fetch(self, url: str, payload: Dict[str, Any]) -> str:  # pragma: no cover
+        raise RuntimeError("boom!")
+
+
+def test_runner_handles_errors_and_reports():
+    """Runner should record errors and reporters should receive error notifications."""
+    spider = BoomSpider()
+    db = MemoryDB()
+    reporter = MemoryReporter()
+    runner = Runner(db=db, reporters=[reporter])
+
+    summary = runner.run({spider: ["https://example.com/a"]})
+
+    assert summary.total_docs == 0
+    assert summary.errors and any("boom" in e for e in summary.errors)
+    assert any(e.startswith("error:") for e in reporter.events)