# dagent_eval/sdk/rag_eval/single_jump/report.py

"""
Report generator: aggregates recall test results and emits a structured report.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime
from .tester import RecallResult


@dataclass
class SectionStats:
    """Per-section tally of recall outcomes and averaged metrics.

    One instance is created per distinct ``section_path`` by ``build_report``,
    which copies the identifying fields from the first result seen for that
    section and then increments the counters for every result in it.
    """
    section_path: str
    doc_name: str
    file_id: str | None     # falsy value means the section counts as unmatched
    match_type: str | None
    total: int = 0          # number of questions in this section
    recalled: int = 0       # number of questions that returned recall results
    empty: int = 0          # number of questions with an empty recall
    errors: int = 0         # number of questions whose result carries an error
    avg_cosine_sim: float | None = None   # None when no sample was available
    avg_latency_ms: float | None = None   # None when no sample was available


@dataclass
class SingleJumpReport:
    """Aggregated outcome of one single-hop recall test run.

    Stores the run parameters, the global counters and rates, the per-section
    statistics and the flagged sample lists. It can render itself as a dict
    (``to_dict``), write that dict to a JSON file (``save``) or produce a
    plain-text summary (``summary_text``).
    """

    # Run parameters.
    env_url: str
    org_id: str
    qa_file: str
    top_k: int
    cross_chunk: bool
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())

    # Raw counters.
    total_questions: int = 0
    total_sections: int = 0
    matched_sections: int = 0       # sections successfully mapped to a file_id
    unmatched_sections: int = 0
    recalled_questions: int = 0     # questions that returned recall results
    empty_questions: int = 0
    error_questions: int = 0

    # Derived metrics; None means "not computed / no data".
    recall_rate: float | None = None        # recalled / total
    empty_rate: float | None = None
    section_match_rate: float | None = None
    avg_cosine_sim: float | None = None
    avg_latency_ms: float | None = None

    # Detail collections.
    section_stats: list[SectionStats] = field(default_factory=list)
    low_quality_results: list[dict] = field(default_factory=list)
    suspicious_results: list[dict] = field(default_factory=list)
    unmatched_section_list: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the report as a nested dict.

        The low-quality and suspicious sample lists are represented only by
        their lengths; the unmatched section paths are included in full under
        the top-level ``"unmatched_sections"`` key.
        """
        header_keys = (
            "env_url",
            "org_id",
            "qa_file",
            "top_k",
            "cross_chunk",
            "created_at",
        )
        summary_keys = (
            "total_questions",
            "total_sections",
            "matched_sections",
            "unmatched_sections",
            "recalled_questions",
            "empty_questions",
            "error_questions",
            "recall_rate",
            "empty_rate",
            "section_match_rate",
            "avg_cosine_sim",
            "avg_latency_ms",
        )
        section_keys = (
            "section_path",
            "doc_name",
            "file_id",
            "match_type",
            "total",
            "recalled",
            "empty",
            "errors",
            "avg_cosine_sim",
            "avg_latency_ms",
        )

        # Insertion order below defines the key order of the saved JSON.
        payload = {name: getattr(self, name) for name in header_keys}
        payload["summary"] = {name: getattr(self, name) for name in summary_keys}
        payload["section_stats"] = [
            {name: getattr(stat, name) for name in section_keys}
            for stat in self.section_stats
        ]
        payload["unmatched_sections"] = self.unmatched_section_list
        payload["low_quality_count"] = len(self.low_quality_results)
        payload["suspicious_count"] = len(self.suspicious_results)
        return payload

    def save(self, path: str):
        """Write ``to_dict()`` to *path* as indented UTF-8 JSON (non-ASCII kept as-is)."""
        with open(path, "w", encoding="utf-8") as out_file:
            json.dump(self.to_dict(), out_file, ensure_ascii=False, indent=2)

    def summary_text(self) -> str:
        """Return a multi-line, human-readable summary of the report.

        Metrics that are ``None`` are printed as ``N/A``. When there are
        unmatched sections, at most the first 10 are listed, followed by a
        total-count line if there are more.
        """

        def metric(prefix: str, value, spec: str, unit: str = "") -> str:
            # A missing metric is rendered as N/A instead of being formatted.
            if value is None:
                return prefix + "N/A"
            return prefix + format(value, spec) + unit

        rule = "=" * 60
        out = [rule, "  单跳召回测试报告", rule]
        out.append(f"  环境地址        : {self.env_url}")
        out.append(f"  总问题数        : {self.total_questions}")
        out.append(f"  总章节数        : {self.total_sections}")
        out.append(metric("  章节匹配率      : ", self.section_match_rate, ".1%"))
        out.append(metric("  召回率          : ", self.recall_rate, ".1%"))
        out.append(metric("  空召回率        : ", self.empty_rate, ".1%"))
        out.append(metric("  平均余弦相似度  : ", self.avg_cosine_sim, ".4f"))
        out.append(metric("  平均延迟        : ", self.avg_latency_ms, ".0f", "ms"))
        out.append(f"  低质量样例      : {len(self.low_quality_results)}")
        out.append(f"  可疑样例        : {len(self.suspicious_results)}")
        out.append(rule)

        unmatched = self.unmatched_section_list
        if unmatched:
            count = len(unmatched)
            out.append(f"  未匹配章节 ({count}):")
            out.extend(f"    - {name}" for name in unmatched[:10])
            if count > 10:
                out.append(f"    ... 共 {count} 个")
        return "\n".join(out)


def _rounded_mean(values: list[float], ndigits: int) -> float | None:
    """Return the mean of *values* rounded to *ndigits*, or None when empty."""
    return round(sum(values) / len(values), ndigits) if values else None


def build_report(
    results: list[RecallResult],
    env_url: str,
    org_id: str,
    qa_file: str,
    top_k: int,
    cross_chunk: bool,
    quality_info: dict | None = None,
) -> SingleJumpReport:
    """Aggregate per-question recall results into a ``SingleJumpReport``.

    Args:
        results: Recall results, one per question. Each item is read for
            ``section_path``, ``doc_name``, ``file_id``, ``match_type``,
            ``error``, ``is_empty``, ``best_cosine_sim`` and ``latency_ms``.
        env_url: Copied verbatim into the report.
        org_id: Copied verbatim into the report.
        qa_file: Copied verbatim into the report.
        top_k: Copied verbatim into the report.
        cross_chunk: Copied verbatim into the report.
        quality_info: Optional dict whose ``"low_quality"`` and
            ``"suspicious"`` entries are iterables of result objects to be
            listed in the report. Missing keys are treated as empty.

    Returns:
        The populated report. Rates stay ``None`` when their denominator is
        zero, and averages stay ``None`` when there are no samples.
    """
    report = SingleJumpReport(
        env_url=env_url,
        org_id=org_id,
        qa_file=qa_file,
        top_k=top_k,
        cross_chunk=cross_chunk,
    )

    # Group by section in a single pass, collecting the metric samples of
    # each section alongside its counters so `results` is not rescanned once
    # per section afterwards.
    # NOTE(review): sections are keyed by section_path alone; results from
    # different documents sharing a path would be merged — confirm intended.
    section_map: dict[str, SectionStats] = {}
    section_sims: dict[str, list[float]] = {}
    section_lats: dict[str, list[float]] = {}
    for r in results:
        key = r.section_path
        s = section_map.get(key)
        if s is None:
            # Identifying fields come from the first result seen for the section.
            s = section_map[key] = SectionStats(
                section_path=r.section_path,
                doc_name=r.doc_name,
                file_id=r.file_id,
                match_type=r.match_type,
            )
            section_sims[key] = []
            section_lats[key] = []
        s.total += 1
        if r.error:
            s.errors += 1
        elif r.is_empty:
            s.empty += 1
        else:
            s.recalled += 1
            # Section averages only consider successfully recalled questions.
            if r.best_cosine_sim is not None:
                section_sims[key].append(r.best_cosine_sim)
            # Truthiness check: a latency of None or 0 is not counted as a sample.
            if r.latency_ms:
                section_lats[key].append(r.latency_ms)

    # Per-section averages.
    for key, s in section_map.items():
        s.avg_cosine_sim = _rounded_mean(section_sims[key], 4)
        s.avg_latency_ms = _rounded_mean(section_lats[key], 1)

    report.section_stats = list(section_map.values())
    report.total_sections = len(section_map)
    # A section counts as matched when it carries a truthy file_id.
    report.matched_sections = sum(1 for s in report.section_stats if s.file_id)
    report.unmatched_sections = report.total_sections - report.matched_sections
    report.unmatched_section_list = [
        s.section_path for s in report.section_stats if not s.file_id
    ]

    # Global counters: every result landed in exactly one of the three
    # per-section buckets above, so the totals are the sums of those buckets.
    report.total_questions = len(results)
    report.recalled_questions = sum(s.recalled for s in report.section_stats)
    report.empty_questions = sum(s.empty for s in report.section_stats)
    report.error_questions = sum(s.errors for s in report.section_stats)

    if report.total_questions > 0:
        report.recall_rate = round(report.recalled_questions / report.total_questions, 4)
        report.empty_rate = round(report.empty_questions / report.total_questions, 4)
    if report.total_sections > 0:
        report.section_match_rate = round(report.matched_sections / report.total_sections, 4)

    # NOTE(review): unlike the per-section averages, the global averages are
    # taken over all results, including errored and empty ones — confirm
    # whether this difference is intended.
    all_sims = [r.best_cosine_sim for r in results if r.best_cosine_sim is not None]
    all_lats = [r.latency_ms for r in results if r.latency_ms]
    report.avg_cosine_sim = _rounded_mean(all_sims, 4)
    report.avg_latency_ms = _rounded_mean(all_lats, 1)

    if quality_info:
        report.low_quality_results = [
            {"section": r.section_path, "qid": r.qid, "question": r.question, "sim": r.best_cosine_sim}
            for r in quality_info.get("low_quality", [])
        ]
        report.suspicious_results = [
            {"section": r.section_path, "qid": r.qid, "question": r.question,
             "expected_file": r.file_id, "retrieved_files": r.retrieved_file_ids}
            for r in quality_info.get("suspicious", [])
        ]

    return report