dagent_eval/server/service/dedup.py

# -*- coding: utf-8 -*-
"""
Question deduplication service.

使用 正则归一化 + 向量余弦相似度 两阶段查重：
1) 正则归一化：去除标点/空白/常见中文疑问助词后字符串完全相等，判为重复（sim=1.0）。
2) 向量相似度：对归一化后仍不同的问题，批量 embedding + 计算 cosine；
   >= similarity_threshold 判为重复。

相比 LLM 查重：更快、更便宜、结果确定，且可批量。
"""
import asyncio
import re
from typing import Callable, Optional

import numpy as np


# 空白 + ASCII/中英文全角标点
_PUNCT_RE = re.compile(
    r'[\s　 -⁯　-〿＀-￯'
    r'\-_=+*&^%$#@!\\/?.,;:\'"`~<>()\[\]{}]+'
)
# 结尾的疑问助词和语气词
_TAIL_PARTICLE_RE = re.compile(r'(?:吗|呢|啊|呀|哪|么|嘛|吧)+[?？。!！]*$')
# 开头的礼貌/引导词
_LEADING_ASK_RE = re.compile(r'^(?:请问一下|请问|问一下|那么|然后|所以)')


def _normalize(text: str) -> str:
    """问题文本的规范形式（用于正则查重）。"""
    if not text:
        return ""
    s = text.strip().lower()
    s = _LEADING_ASK_RE.sub("", s)
    s = _TAIL_PARTICLE_RE.sub("", s)
    s = _PUNCT_RE.sub("", s)
    return s


def _regex_duplicate_id(
    new_question: str,
    existing_questions: list[tuple[str, str]],
) -> Optional[str]:
    """规范化后的字符串与已有问题完全相等则判重，返回该已有问题 id。"""
    norm_new = _normalize(new_question)
    if not norm_new:
        return None
    for qid, existing_q in existing_questions:
        if _normalize(existing_q) == norm_new:
            return qid
    return None


async def _embed_texts(
    embed_client,
    model: str,
    texts: list[str],
    batch_size: int = 64,
) -> list[np.ndarray]:
    """批量 embedding，返回 L2 归一化后的向量列表（顺序与输入一致）。"""
    if not texts:
        return []
    out: list[np.ndarray] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = await embed_client.embeddings.create(model=model, input=batch)
        for item in resp.data:
            v = np.asarray(item.embedding, dtype=np.float32)
            n = np.linalg.norm(v)
            if n > 0:
                v = v / n
            out.append(v)
    return out


async def deduplicate_questions_by_chunk(
    new_questions_by_chunk: dict[str, list[dict]],  # {chunk_id: [{id, question, ...}]}
    existing_questions_by_chunk: dict[str, list[tuple[str, str]]],  # {chunk_id: [(id, question)]}
    embed_client,
    embed_model: str,
    similarity_threshold: float = 0.85,
    max_parallel_chunks: int = 5,
    stop_check: Optional[Callable[[], bool]] = None,
    pause_check: Optional[Callable[[], bool]] = None,  # New: check if paused
    on_progress: Optional[Callable] = None,  # async callback(done, total)
) -> dict[str, tuple[Optional[str], float]]:
    """
    按切片并行查重。

    对每个切片：
      - 先用正则归一化做精确查重（新 vs 已有，新 vs 新同批）。
      - 剩余的问题批量 embedding，逐一与已有问题、该批内更早的问题计算 cosine，
        取最大值；>= threshold 判重。

    Returns:
        {new_question_id: (dup_of_id_or_None, similarity)}
    """
    chunk_sem = asyncio.Semaphore(max_parallel_chunks)
    results: dict[str, tuple[Optional[str], float]] = {}
    stopped = False
    done_count = 0
    total = sum(len(qs) for qs in new_questions_by_chunk.values())
    progress_lock = asyncio.Lock()

    async def bump_progress(n: int):
        nonlocal done_count
        async with progress_lock:
            done_count += n
            if on_progress:
                await on_progress(done_count, total)

    async def dedup_one_chunk(chunk_id: str, new_questions: list[dict]):
        nonlocal stopped
        if stopped or (stop_check and stop_check()):
            stopped = True
            return

        # Check pause before starting chunk
        if pause_check and await pause_check():
            stopped = True
            return

        existing = existing_questions_by_chunk.get(chunk_id, [])

        async with chunk_sem:
            if stopped or (stop_check and stop_check()):
                stopped = True
                return

            # Check pause again after acquiring semaphore
            if pause_check and await pause_check():
                stopped = True
                return

            # ── Step 1: 正则归一化查重 ─────────────────────────────────
            seen_norm: dict[str, str] = {}   # 归一化后的字符串 -> 首次出现该形式的新问题 id
            remaining: list[dict] = []

            for q in new_questions:
                # 与已有问题做规范化比对
                ex_id = _regex_duplicate_id(q["question"], existing)
                if ex_id:
                    results[q["id"]] = (ex_id, 1.0)
                    continue
                # 与同批次更早的新问题比对
                norm = _normalize(q["question"])
                if norm and norm in seen_norm:
                    results[q["id"]] = (seen_norm[norm], 1.0)
                    continue
                if norm:
                    seen_norm[norm] = q["id"]
                remaining.append(q)

            # ── Step 2: 向量相似度查重 ─────────────────────────────────
            if remaining:
                try:
                    new_texts = [q["question"] for q in remaining]
                    new_ids = [q["id"] for q in remaining]
                    existing_texts = [q for _, q in existing]
                    existing_ids = [qid for qid, _ in existing]

                    all_vecs = await _embed_texts(
                        embed_client, embed_model, new_texts + existing_texts
                    )
                    new_vecs = all_vecs[:len(new_texts)]
                    ex_vecs = all_vecs[len(new_texts):]

                    for i, nv in enumerate(new_vecs):
                        best_id: Optional[str] = None
                        best_sim = 0.0
                        # vs 已有问题
                        for ex_id, ev in zip(existing_ids, ex_vecs):
                            sim = float(np.dot(nv, ev))
                            if sim > best_sim:
                                best_sim = sim
                                best_id = ex_id
                        # vs 同批次更早的新问题（捕获批内近似重复）
                        for j in range(i):
                            sim = float(np.dot(nv, new_vecs[j]))
                            if sim > best_sim:
                                best_sim = sim
                                best_id = new_ids[j]

                        if best_id is not None and best_sim >= similarity_threshold:
                            results[new_ids[i]] = (best_id, round(best_sim, 4))
                        else:
                            results[new_ids[i]] = (None, 0.0)
                except Exception as e:
                    print(f"[WARN] Vector dedup failed for chunk {chunk_id}: {e}")
                    for q in remaining:
                        results.setdefault(q["id"], (None, 0.0))

        await bump_progress(len(new_questions))

    tasks = [
        dedup_one_chunk(chunk_id, questions)
        for chunk_id, questions in new_questions_by_chunk.items()
    ]
    await asyncio.gather(*tasks)
    return results