|
|
@@ -0,0 +1,235 @@
|
|
|
+"""Concrete reporter that writes daily TXT logs to local files.
|
|
|
+
|
|
|
+Features:
|
|
|
+- Year/Month archive directories: logs are stored under ``{log_dir}/YYYY/MM/``.
|
|
|
+- One file per day within the month directory, e.g., ``report_2025-09-24.log``.
|
|
|
+- Logs start URLs, success count, error messages, and a final summary.
|
|
|
+- Simple plain-text format, append-only, safe for repeated calls.
|
|
|
+
|
|
|
+Configuration options (kwargs to constructor or ``configure``):
|
|
|
+- ``log_dir``: Base directory to store logs (default: ``logs`` under project root).
|
|
|
+- ``filename_pattern``: Pattern with ``{date}`` placeholder (default: ``report_{date}.log``).
|
|
|
+- ``timezone``: ``"utc"``, ``"local"`` or ``"utc+8"`` (default: ``"utc+8"``).
|
|
|
+
|
|
|
+Example usage:
|
|
|
+ from databank.reporter.daily_file import DailyFileReporter
|
|
|
+ from databank.core.models import RunSummary
|
|
|
+
|
|
|
+ rep = DailyFileReporter(log_dir="./logs", timezone="utc+8")
|
|
|
+ rep.notify_start("getLeagueMatchList", ["2079|2016-2017|1"])
|
|
|
+ rep.notify_success("getLeagueMatchList", 42)
|
|
|
+ rep.notify_error("getLeagueMatchList", "Timeout while fetching")
|
|
|
+
|
|
|
+ summary = RunSummary()
|
|
|
+ summary.total_docs = 42
|
|
|
+ summary.per_spider["getLeagueMatchList"] = 42
|
|
|
+ rep.notify_summary(summary)
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+from dataclasses import dataclass
|
|
|
+from datetime import datetime, timezone, timedelta
|
|
|
+import logging
|
|
|
+from pathlib import Path
|
|
|
+from typing import Iterable, Optional
|
|
|
+
|
|
|
+try:
|
|
|
+ from .base import BaseReporter
|
|
|
+ from ..core.models import RunSummary
|
|
|
+except ImportError as _exc: # direct-run friendliness
|
|
|
+ # If executed directly (not as a package module), relative imports fail.
|
|
|
+ # Provide a helpful message and exit cleanly instead of a stack trace.
|
|
|
+ if __name__ == "__main__":
|
|
|
+ print(
|
|
|
+ "This module defines DailyFileReporter and isn't meant to be executed directly.\n"
|
|
|
+ "Run the demo script instead:\n"
|
|
|
+ " python c:\\Python\\databank\\scripts\\reporter_demo.py\n"
|
|
|
+ )
|
|
|
+ raise SystemExit(0)
|
|
|
+ raise
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class DailyFileReporterOptions:
    """Settings bundle consumed by ``DailyFileReporter``.

    Every field has a default, so the class can be instantiated without
    arguments; the same defaults are readable as class attributes.
    """

    # Base directory under which the log files are stored.
    log_dir: str = "logs"

    # File name template; the ``{date}`` placeholder receives the day's date.
    filename_pattern: str = "report_{date}.log"

    # One of "utc", "local", or an explicit offset such as "utc+8".
    timezone: str = "utc+8"
|
|
|
+
|
|
|
+
|
|
|
class DailyFileReporter(BaseReporter):
    """Reporter that appends human-readable records to one log file per day.

    Files are written to ``{log_dir}/YYYY/MM/{filename}``, where the file name
    is ``filename_pattern`` with ``{date}`` replaced by ``YYYY-MM-DD`` in the
    configured timezone. Each record consists of a ``[timestamp] [LEVEL]``
    header line, one ``key=value`` line per field, and a blank separator line.

    Recognised keyword options (defaults come from
    ``DailyFileReporterOptions``): ``log_dir``, ``filename_pattern`` and
    ``timezone``.
    """

    def __init__(
        self,
        *,
        logger: Optional[logging.Logger] = None,
        **options: object,
    ) -> None:
        """Initialize the reporter and create the base log directory.

        Args:
            logger: Optional logger, forwarded to ``BaseReporter``.
            **options: Reporter options; also forwarded to ``BaseReporter``.

        Raises:
            OSError: If the base log directory cannot be created.
        """
        super().__init__(logger=logger, **options)
        self._opts = self._build_options(options)
        self._ensure_dir()

    # ---- BaseReporter API ----
    def notify_start(self, spider_name: str, urls: Iterable[str]) -> None:
        """Append a START record listing the spider name and its URLs."""
        self._emit("START", [f"spider={spider_name}", f"urls={list(urls)}"])

    def notify_success(self, spider_name: str, count: int) -> None:
        """Append a SUCCESS record with the number of persisted items."""
        self._emit("SUCCESS", [f"spider={spider_name}", f"persisted={count}"])

    def notify_error(self, spider_name: str, error: str) -> None:
        """Append an ERROR record with the error message."""
        self._emit("ERROR", [f"spider={spider_name}", f"message={error}"])

    def notify_summary(self, summary: RunSummary) -> None:
        """Append a SUMMARY record with totals, errors and run duration.

        The duration is reported as ``n/a`` unless both ``finished_at`` and
        ``started_at`` are set on ``summary``.
        """
        duration_s: float | None = None
        if summary.finished_at is not None and summary.started_at is not None:
            duration_s = (summary.finished_at - summary.started_at).total_seconds()
        self._emit(
            "SUMMARY",
            [
                f"total_docs={summary.total_docs}",
                f"per_spider={dict(summary.per_spider)}",
                f"errors={list(summary.errors)}",
                f"duration_s={duration_s if duration_s is not None else 'n/a'}",
            ],
        )

    # ---- Helpers ----
    def _emit(self, level: str, fields: list[str]) -> None:
        """Write one record, using a single timestamp for header and file path.

        Taking the clock reading once keeps the header timestamp, the
        ``YYYY/MM`` directory and the dated file name consistent even when the
        call straddles midnight.
        """
        moment = self._now()
        self._write_lines([self._prefix(level, moment), *fields, ""], moment)

    def _build_options(self, options: dict[str, object]) -> DailyFileReporterOptions:
        """Turn raw keyword options into validated ``DailyFileReporterOptions``.

        Missing keys fall back to the dataclass defaults. An unusable
        ``filename_pattern`` falls back to the default pattern and an
        unrecognised ``timezone`` falls back to ``"utc"``; both cases are
        logged as warnings when a logger is available.
        """
        defaults = DailyFileReporterOptions
        log_dir_opt = str(options.get("log_dir", defaults.log_dir))
        filename_pattern = str(
            options.get("filename_pattern", defaults.filename_pattern)
        )
        tz_lower = str(options.get("timezone", defaults.timezone)).lower()

        # Probe the pattern now so that a stray placeholder or unbalanced
        # brace cannot raise from every later notify_* call.
        try:
            filename_pattern.format(date="1970-01-01")
        except (KeyError, IndexError, ValueError, AttributeError):
            if self.logger:
                self.logger.warning(
                    "filename_pattern '%s' is not usable; using '%s' instead",
                    filename_pattern,
                    defaults.filename_pattern,
                )
            filename_pattern = defaults.filename_pattern

        # Accept "utc", "local", and explicit offsets like "utc+8"/"utc-8"
        # (fractional hours are allowed).
        if tz_lower not in ("utc", "local") and self._offset_timezone(tz_lower) is None:
            if self.logger:
                self.logger.warning(
                    "timezone '%s' is not recognised; using 'utc' instead",
                    tz_lower,
                )
            tz_lower = "utc"

        # Ensure log_dir stays inside project root
        resolved_log_dir = self._resolve_log_dir(Path(log_dir_opt))
        return DailyFileReporterOptions(
            log_dir=str(resolved_log_dir),
            filename_pattern=filename_pattern,
            timezone=tz_lower,
        )

    def _project_root(self) -> Path:
        """Locate project root by searching upwards for pyproject.toml or .git.

        Falls back to ``parents[3]`` of this file's resolved path (the fourth
        ancestor directory), and finally to the current working directory when
        the path is too shallow for that.
        """
        here = Path(__file__).resolve()
        for ancestor in here.parents:
            if (ancestor / "pyproject.toml").exists() or (ancestor / ".git").exists():
                return ancestor
        # Fallback heuristics
        try:
            return here.parents[3]
        except IndexError:  # pragma: no cover - defensive
            return Path.cwd()

    def _resolve_log_dir(self, configured: Path) -> Path:
        """Resolve and constrain the log directory to be inside project root.

        - Relative paths are resolved against project root.
        - Any path (relative or absolute) that ends up outside the project
          root once ``..`` segments and symlinks are resolved is redirected to
          ``<root>/logs`` with a warning.
        """
        root = self._project_root().resolve()
        # Joining an absolute path onto ``root`` yields that absolute path, so
        # one expression covers both cases. ``resolve()`` collapses ``..`` so
        # the containment check below cannot be bypassed lexically.
        candidate = (root / configured).resolve()
        try:
            candidate.relative_to(root)
        except ValueError:
            fallback = root / "logs"
            if self.logger:
                self.logger.warning(
                    "log_dir '%s' is outside project root; using '%s' instead",
                    configured,
                    fallback,
                )
            return fallback
        return candidate

    def _ensure_dir(self) -> None:
        """Create the base log directory (and parents) if it does not exist."""
        Path(self._opts.log_dir).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _offset_timezone(spec: str) -> timezone | None:
        """Parse ``"utc+H"`` / ``"utc-H"`` into a fixed-offset timezone.

        ``H`` is a number of hours and may be fractional. Returns ``None`` when
        ``spec`` does not have that shape, the number cannot be parsed, or the
        offset is outside the range ``datetime.timezone`` accepts.
        """
        sign = spec[3:4]
        if not spec.startswith("utc") or sign not in ("+", "-"):
            return None
        try:
            delta = timedelta(hours=float(spec[4:]))
            return timezone(-delta if sign == "-" else delta)
        except (ValueError, OverflowError):
            # ValueError: unparsable number, NaN, or offset of 24h and beyond.
            # OverflowError: infinite or huge hour values rejected by timedelta.
            return None

    def _tzinfo(self):
        """Return the tzinfo for the configured timezone, defaulting to UTC."""
        tz = self._opts.timezone.lower()
        if tz == "local":
            return datetime.now().astimezone().tzinfo
        if tz == "utc":
            return timezone.utc
        parsed = self._offset_timezone(tz)
        return parsed if parsed is not None else timezone.utc

    def _now(self) -> datetime:
        """Return the current time as an aware datetime in the configured zone."""
        return datetime.now(self._tzinfo())

    def _date_str(self, when: Optional[datetime] = None) -> str:
        """Format ``when`` (default: now) as ``YYYY-MM-DD``."""
        moment = when if when is not None else self._now()
        return moment.strftime("%Y-%m-%d")

    def _prefix(self, level: str, when: Optional[datetime] = None) -> str:
        """Build the ``[timestamp] [LEVEL]`` header for ``when`` (default: now)."""
        moment = when if when is not None else self._now()
        ts = moment.strftime("%Y-%m-%dT%H:%M:%S%z")
        return f"[{ts}] [{level}]"

    def _log_path(self, when: Optional[datetime] = None) -> Path:
        """Return ``{log_dir}/YYYY/MM/{filename}`` for ``when`` (default: now).

        Directory parts and the dated file name are all derived from the same
        datetime so they cannot disagree across a day boundary.
        """
        moment = when if when is not None else self._now()
        filename = self._opts.filename_pattern.format(date=self._date_str(moment))
        return (
            Path(self._opts.log_dir)
            / moment.strftime("%Y")
            / moment.strftime("%m")
            / filename
        )

    def _write_lines(self, lines: list[str], when: Optional[datetime] = None) -> None:
        """Append ``lines`` to the daily log file, newline-terminated.

        Args:
            lines: Lines to write, joined with ``"\\n"``. A final newline is
                added unless the last element already ends with one. An empty
                list writes nothing.
            when: Datetime used to pick the target file (default: now).

        I/O failures are logged through ``self.logger`` (when set) rather than
        raised, so reporting problems do not interrupt the caller.
        """
        if not lines:
            return
        text = "\n".join(lines)
        if not lines[-1].endswith("\n"):
            text += "\n"
        try:
            path = self._log_path(when)
            path.parent.mkdir(parents=True, exist_ok=True)
            with path.open("a", encoding="utf-8") as f:
                # One write call per record keeps the record contiguous.
                f.write(text)
        except OSError as exc:  # pragma: no cover - disk/io dependent
            if self.logger:
                self.logger.exception("DailyFileReporter write failed: %s", exc)
|