Browse Source

Add a DailyFileReporter implementation that supports per-day logging with archiving, and update the example scripts to demonstrate the integration.

admin 2 months ago
parent
commit ff2f1d2be1
5 changed files with 393 additions and 6 deletions
1. .gitignore (+3 / -0)
2. README.md (+7 / -1)
3. scripts/reporter_demo.py (+54 / -0)
4. scripts/test_get_league_match_list.py (+94 / -5)
5. src/databank/reporter/daily_file.py (+235 / -0)

+ 3 - 0
.gitignore

@@ -58,3 +58,6 @@ docs/_build/
 # PyBuilder
 target/
 
+# Local logs (generated by DailyFileReporter)
+logs/
+

+ 7 - 1
README.md

@@ -7,6 +7,7 @@ Policy: scripts are executable, examples are stubs
 	- `python scripts/seed_leagues_mongo.py`
 	- `python scripts/seed_seasons_mongo.py`
 	- `python scripts/test_get_league_match_list.py`
+	- `python scripts/reporter_demo.py`
 
 This repository is a pure abstract skeleton intended to define stable contracts
 for a multi-spider data pipeline. It deliberately contains only abstract/base
@@ -97,4 +98,9 @@ with bootstrap.db_session(db, boot) as conn:
 		# Use abstract methods such as conn.insert_many([...]) here
 		pass
 ```
-The orchestration layer above introduces no concrete driver or backend; it depends only on the `BaseDB` contract, making it easy to reuse in your own implementation.
+The orchestration layer above introduces no concrete driver or backend; it depends only on the `BaseDB` contract, making it easy to reuse in your own implementation.
+
+Reporter notes (DailyFileReporter)
+- Archive layout: `{log_dir}/YYYY/MM/report_{YYYY-MM-DD}.log`
+- Timezone: defaults to `UTC+8` (also accepts `utc`, `local`, or `utc+/-H[.m]`)
+- Example: run `python scripts/reporter_demo.py`, then inspect the log files under `./logs/<year>/<month>/`.
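As a rough illustration of the archive layout described above (a sketch only; `expected_log_path` is a hypothetical helper, not part of the reporter's API):

```python
from datetime import datetime, timedelta, timezone
from pathlib import Path

def expected_log_path(log_dir: str, now: datetime) -> Path:
    # Mirror the documented layout: {log_dir}/YYYY/MM/report_{YYYY-MM-DD}.log
    return (
        Path(log_dir)
        / now.strftime("%Y")
        / now.strftime("%m")
        / f"report_{now:%Y-%m-%d}.log"
    )

# The default timezone is UTC+8, so compute "today" in that offset.
now = datetime.now(timezone(timedelta(hours=8)))
print(expected_log_path("./logs", now))  # e.g. logs/2025/09/report_2025-09-24.log
```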

+ 54 - 0
scripts/reporter_demo.py

@@ -0,0 +1,54 @@
+"""Reporter demo script.
+
+This script demonstrates how to use DailyFileReporter with UTC+8 timezone
+and year/month archived log directories. It simulates a short run with
+start/success/error/summary notifications.
+
+Usage:
+  python ./scripts/reporter_demo.py
+or, from PowerShell:
+  pwsh -NoProfile -Command "python ./scripts/reporter_demo.py"
+"""
+
+from __future__ import annotations
+
+import random
+import time
+from pathlib import Path
+
+from databank.reporter.daily_file import DailyFileReporter
+from databank.core.models import RunSummary
+
+
+def main() -> None:
+    """Run the reporter demo."""
+    rep = DailyFileReporter(log_dir=str(Path.cwd() / "logs"), timezone="utc+8")
+
+    # Simulate a run
+    spider_name = "demoSpider"
+    seeds = ["tokenA", "tokenB", "tokenC"]
+    rep.notify_start(spider_name, seeds)
+
+    # simulate successes
+    docs = 0
+    for _ in range(2):
+        cnt = random.randint(1, 5)
+        docs += cnt
+        rep.notify_success(spider_name, cnt)
+        time.sleep(0.2)
+
+    # simulate an error
+    rep.notify_error(spider_name, "Simulated network timeout")
+
+    # Final summary
+    summary = RunSummary()
+    summary.total_docs = docs
+    summary.per_spider[spider_name] = docs
+    rep.notify_summary(summary)
+
+    print("Reporter demo finished.")
+    print("Logs are written under:", Path.cwd() / "logs")
+
+
+if __name__ == "__main__":
+    main()

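For reference, given the formatting in `src/databank/reporter/daily_file.py` (shown below), each notification appends a small plain-text block to `logs/YYYY/MM/report_{YYYY-MM-DD}.log`; roughly as follows (timestamps and counts illustrative):

```
[2025-09-24T10:00:00+0800] [START]
spider=demoSpider
urls=['tokenA', 'tokenB', 'tokenC']

[2025-09-24T10:00:01+0800] [SUCCESS]
spider=demoSpider
persisted=3
```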
+ 94 - 5
scripts/test_get_league_match_list.py

@@ -11,9 +11,14 @@ from __future__ import annotations
 
 import os
 from collections import defaultdict
+from datetime import UTC, datetime
+from time import perf_counter
 
 from databank.db import MongoDB
+from databank.db.base import InsertError
 from databank.spiders.get_league_match_list import GetLeagueMatchListSpider
+from databank.reporter.daily_file import DailyFileReporter
+from databank.core.models import RunSummary
 
 
 def pick_tokens(max_tokens: int = 3) -> list[str]:
@@ -44,15 +49,99 @@ def pick_tokens(max_tokens: int = 3) -> list[str]:
 
 
 def main() -> None:
-    """Run the demo and print a compact summary to stdout."""
+    """Run the demo with reporter integration and print a compact summary."""
     spider = GetLeagueMatchListSpider()
-    urls = pick_tokens()
-    docs = spider.run(urls)
+    reporter = DailyFileReporter(timezone="utc+8")
 
-    print(f"Fetched {len(docs)} documents in total.")
+    # Prepare summary for duration/error tracking
+    summary = RunSummary()
+    try:
+        urls = pick_tokens()
+    except Exception as exc:  # pylint: disable=broad-except
+        # Record error and finalize summary
+        reporter.notify_error(spider.name, f"pick_tokens failed: {exc}")
+        summary.errors.append(str(exc))
+        summary.finished_at = datetime.now(UTC)
+        reporter.notify_summary(summary)
+        print("pick_tokens failed:", exc)
+        return
+
+    reporter.notify_start(spider.name, urls)
+
+    # DB connection for persistence; success count is based on DB insert result
+    uri = os.getenv("DATABANK_DB_URI", "mongodb://localhost:27017")
+    name = os.getenv("DATABANK_DB_NAME", "databank")
+    db = MongoDB(uri=uri, name=name)
+    db.connect()
+
+    all_docs = []
+    parsed_success_total = 0
+    persisted_total = 0
+    error_messages: list[str] = []
+    error_docs_total = 0
+    total_time_s_accum = 0.0
+
+    try:
+        for url in urls:
+            t0 = perf_counter()
+            docs = spider.run([url])
+            dt = perf_counter() - t0
+
+            total_time_s_accum += dt
+
+            all_docs.extend(docs)
+            url_success = [d for d in docs if d.kind != "error"]
+            url_errors = [d for d in docs if d.kind == "error"]
+
+            parsed_success_total += len(url_success)
+            error_docs_total += len(url_errors)
+
+            # Persist success docs and report using DB result
+            if url_success:
+                try:
+                    inserted = db.insert_many(url_success)
+                except InsertError as exc:
+                    msg = f"insert_many failed: {exc}"
+                    reporter.notify_error(spider.name, msg)
+                    error_messages.append(msg)
+                    inserted = 0
+                reporter.notify_success(spider.name, inserted)
+                persisted_total += inserted
+
+            # Report errors per doc
+            for err in url_errors:
+                reason = err.data.get("reason")
+                detail = err.data.get("detail")
+                msg = f"{reason}: {detail}" if detail else str(reason)
+                reporter.notify_error(spider.name, msg)
+                error_messages.append(msg)
+    finally:
+        db.close()
+
+    total_time_s = float(total_time_s_accum)
+    avg_time_s = (total_time_s / len(urls)) if urls else 0.0
+
+    # Final summary
+    summary.total_docs = persisted_total
+    summary.per_spider[spider.name] = persisted_total
+    # Attach concise metrics to summary.errors for visibility in the log
+    summary.errors.append(
+        f"metrics: attempted_urls={len(urls)} parsed_success={parsed_success_total} "
+        f"persisted={persisted_total} error_docs={error_docs_total} "
+        f"url_time_total_s={total_time_s:.3f} url_time_avg_s={avg_time_s:.3f}"
+    )
+    summary.errors.extend(error_messages)
+    summary.finished_at = datetime.now(UTC)
+    reporter.notify_summary(summary)
+
+    print(
+        f"Fetched {len(all_docs)} documents, parsed_success={parsed_success_total}, "
+        f"persisted={persisted_total}, error_docs={error_docs_total}, "
+        f"url_time_total_s={total_time_s:.3f}, url_time_avg_s={avg_time_s:.3f}."
+    )
 
     per_token = defaultdict(list)
-    for d in docs:
+    for d in all_docs:
         per_token[d.data.get("token", "unknown")].append(d)
 
     for token, items in per_token.items():

+ 235 - 0
src/databank/reporter/daily_file.py

@@ -0,0 +1,235 @@
+"""Concrete reporter that writes daily TXT logs to local files.
+
+Features:
+- Year/Month archive directories: logs are stored under ``{log_dir}/YYYY/MM/``.
+- One file per day within the month directory, e.g., ``report_2025-09-24.log``.
+- Logs start URLs, success count, error messages, and a final summary.
+- Simple plain-text format, append-only, safe for repeated calls.
+
+Configuration options (kwargs to constructor or ``configure``):
+- ``log_dir``: Base directory to store logs (default: ``logs`` under project root).
+- ``filename_pattern``: Pattern with ``{date}`` placeholder (default: ``report_{date}.log``).
+- ``timezone``: ``"utc"``, ``"local"`` or ``"utc+8"`` (default: ``"utc+8"``).
+
+Example usage:
+    from databank.reporter.daily_file import DailyFileReporter
+    from databank.core.models import RunSummary
+
+    rep = DailyFileReporter(log_dir="./logs", timezone="utc+8")
+    rep.notify_start("getLeagueMatchList", ["2079|2016-2017|1"])
+    rep.notify_success("getLeagueMatchList", 42)
+    rep.notify_error("getLeagueMatchList", "Timeout while fetching")
+
+    summary = RunSummary()
+    summary.total_docs = 42
+    summary.per_spider["getLeagueMatchList"] = 42
+    rep.notify_summary(summary)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone, timedelta
+import logging
+from pathlib import Path
+from typing import Iterable, Optional
+
+try:
+    from .base import BaseReporter
+    from ..core.models import RunSummary
+except ImportError as _exc:  # direct-run friendliness
+    # If executed directly (not as a package module), relative imports fail.
+    # Provide a helpful message and exit cleanly instead of a stack trace.
+    if __name__ == "__main__":
+        print(
+            "This module defines DailyFileReporter and isn't meant to be executed directly.\n"
+            "Run the demo script instead:\n"
+            "  python c:\\Python\\databank\\scripts\\reporter_demo.py\n"
+        )
+        raise SystemExit(0)
+    raise
+
+
+@dataclass
+class DailyFileReporterOptions:
+    """Options for DailyFileReporter."""
+
+    log_dir: str = "logs"
+    filename_pattern: str = "report_{date}.log"
+    timezone: str = "utc+8"  # "utc", "local", or explicit offset like "utc+8"
+
+
+class DailyFileReporter(BaseReporter):
+    """Reporter that appends human-readable logs to daily files."""
+
+    def __init__(
+        self,
+        *,
+        logger: Optional[logging.Logger] = None,
+        **options: object,
+    ) -> None:
+        """Initialize reporter with optional logger and options."""
+        super().__init__(logger=logger, **options)
+        self._opts = self._build_options(options)
+        self._ensure_dir()
+
+    # ---- BaseReporter API ----
+    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:
+        lines = [
+            self._prefix("START"),
+            f"spider={spider_name}",
+            f"urls={list(urls)}",
+            "",
+        ]
+        self._write_lines(lines)
+
+    def notify_success(self, spider_name: str, count: int) -> None:
+        lines = [
+            self._prefix("SUCCESS"),
+            f"spider={spider_name}",
+            f"persisted={count}",
+            "",
+        ]
+        self._write_lines(lines)
+
+    def notify_error(self, spider_name: str, error: str) -> None:
+        lines = [
+            self._prefix("ERROR"),
+            f"spider={spider_name}",
+            f"message={error}",
+            "",
+        ]
+        self._write_lines(lines)
+
+    def notify_summary(self, summary: RunSummary) -> None:
+        # Derive simple duration seconds if finished_at is provided.
+        duration_s: float | None = None
+        if summary.finished_at is not None:
+            duration_s = (summary.finished_at - summary.started_at).total_seconds()
+        lines = [
+            self._prefix("SUMMARY"),
+            f"total_docs={summary.total_docs}",
+            f"per_spider={dict(summary.per_spider)}",
+            f"errors={list(summary.errors)}",
+            f"duration_s={duration_s if duration_s is not None else 'n/a'}",
+            "",
+        ]
+        self._write_lines(lines)
+
+    # ---- Helpers ----
+    def _build_options(self, options: dict[str, object]) -> DailyFileReporterOptions:
+        log_dir_opt = str(options.get("log_dir", DailyFileReporterOptions.log_dir))
+        filename_pattern = str(
+            options.get("filename_pattern", DailyFileReporterOptions.filename_pattern)
+        )
+        timezone_opt = str(options.get("timezone", DailyFileReporterOptions.timezone))
+        tz_lower = timezone_opt.lower()
+        # Accept "utc", "local", and explicit offsets like "utc+8"/"utc-8" (also decimals)
+        if not (
+            tz_lower == "utc"
+            or tz_lower == "local"
+            or tz_lower.startswith("utc+")
+            or tz_lower.startswith("utc-")
+        ):
+            tz_lower = "utc"
+        # Ensure log_dir stays inside project root
+        resolved_log_dir = self._resolve_log_dir(Path(log_dir_opt))
+        return DailyFileReporterOptions(
+            log_dir=str(resolved_log_dir),
+            filename_pattern=filename_pattern,
+            timezone=tz_lower,
+        )
+
+    def _project_root(self) -> Path:
+        """Locate project root by searching upwards for pyproject.toml or .git.
+
+        Falls back to the directory two levels up from this file (repo root
+        pattern: src/databank/...), and finally to current working directory.
+        """
+        here = Path(__file__).resolve()
+        for ancestor in here.parents:
+            if (ancestor / "pyproject.toml").exists() or (ancestor / ".git").exists():
+                return ancestor
+        # Fallback heuristics
+        try:
+            return here.parents[3]
+        except IndexError:  # pragma: no cover - defensive
+            return Path.cwd()
+
+    def _resolve_log_dir(self, configured: Path) -> Path:
+        """Resolve and constrain the log directory to be inside project root.
+
+        - Relative paths are resolved against project root.
+        - Absolute paths outside the project root are redirected to <root>/logs
+          with a warning.
+        """
+        root = self._project_root()
+        if not configured.is_absolute():
+            return root / configured
+        try:
+            # If this succeeds, it's inside the root
+            _ = configured.relative_to(root)
+            return configured
+        except ValueError:
+            fallback = root / "logs"
+            if self.logger:
+                self.logger.warning(
+                    "log_dir '%s' is outside project root; using '%s' instead",
+                    configured,
+                    fallback,
+                )
+            return fallback
+
+    def _ensure_dir(self) -> None:
+        Path(self._opts.log_dir).mkdir(parents=True, exist_ok=True)
+
+    def _tzinfo(self):
+        tz = self._opts.timezone.lower()
+        if tz == "local":
+            return datetime.now().astimezone().tzinfo
+        if tz == "utc":
+            return timezone.utc
+        if tz.startswith("utc+"):
+            try:
+                hours = float(tz.split("+", 1)[1])
+                return timezone(timedelta(hours=hours))
+            except ValueError:  # pragma: no cover - defensive
+                return timezone.utc
+        if tz.startswith("utc-"):
+            try:
+                hours = float(tz.split("-", 1)[1])
+                return timezone(-timedelta(hours=hours))
+            except ValueError:  # pragma: no cover - defensive
+                return timezone.utc
+        return timezone.utc
+
+    def _now(self) -> datetime:
+        return datetime.now(self._tzinfo())
+
+    def _date_str(self) -> str:
+        dt = self._now()
+        return dt.strftime("%Y-%m-%d")
+
+    def _prefix(self, level: str) -> str:
+        ts = self._now().strftime("%Y-%m-%dT%H:%M:%S%z")
+        return f"[{ts}] [{level}]"
+
+    def _log_path(self) -> Path:
+        dt = self._now()
+        year_dir = dt.strftime("%Y")
+        month_dir = dt.strftime("%m")
+        date_part = self._date_str()
+        filename = self._opts.filename_pattern.format(date=date_part)
+        return Path(self._opts.log_dir) / year_dir / month_dir / filename
+
+    def _write_lines(self, lines: list[str]) -> None:
+        try:
+            path = self._log_path()
+            path.parent.mkdir(parents=True, exist_ok=True)
+            with path.open("a", encoding="utf-8") as f:
+                f.write("\n".join(lines))
+                if not lines[-1].endswith("\n"):
+                    f.write("\n")
+        except (OSError, IOError) as exc:  # pragma: no cover - disk/io dependent
+            if self.logger:
+                self.logger.exception("DailyFileReporter write failed: %s", exc)