evaluation for rag systems
This commit is contained in:
377
generate_documents_to_test.py
Normal file
377
generate_documents_to_test.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
|
||||
import requests
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
||||
except Exception: # pragma: no cover
|
||||
PyPDFLoader = None
|
||||
TextLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredWordDocumentLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredPowerPointLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredExcelLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredExcelLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredODTLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredODTLoader = None
|
||||
|
||||
# All paths are resolved relative to this script's own location so the tool
# can be run from any working directory.
ROOT = Path(__file__).resolve().parent
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
# NOTE(review): LLAMAINDEX_DIR is not referenced anywhere in this script —
# confirm whether it is still needed.
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
# JSON list of YaDisk remote paths considered as test-document candidates.
YADISK_JSON = ROOT / "yadisk_files.json"
# Markdown report written by main().
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
|
||||
|
||||
|
||||
def safe_stem_from_remote(remote_path: str) -> str:
    """Return the file stem of *remote_path* sanitized for use in filenames.

    Every character that is not alphanumeric, ``-`` or ``_`` is replaced
    with ``_``.  Falls back to ``"file"`` when the path has no usable stem.
    """
    raw_stem = Path(Path(remote_path).name).stem
    if not raw_stem:
        raw_stem = "file"
    sanitized: list[str] = []
    for ch in raw_stem:
        if ch.isalnum() or ch == "-" or ch == "_":
            sanitized.append(ch)
        else:
            sanitized.append("_")
    return "".join(sanitized)
|
||||
|
||||
|
||||
def llama_prefect_filename(remote_path: str) -> str:
    """Reproduce the deterministic local filename used by the LlamaIndex
    ingestion flow for a downloaded YaDisk file.

    The result is ``<sanitized stem>_<first 10 hex chars of the md5 of the
    full remote path><original suffix>``.
    """
    basename = Path(remote_path).name or "downloaded_file"
    extension = Path(basename).suffix
    short_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10]
    return f"{safe_stem_from_remote(remote_path)}_{short_hash}{extension}"
|
||||
|
||||
|
||||
def get_loader(local_path: str):
    """Return a LangChain document loader suited to *local_path*'s extension.

    Returns ``None`` when the extension is unsupported or the corresponding
    loader failed to import at module load time (the module-level try/except
    guards set the loader name to ``None`` in that case).

    Unstructured-based loaders are configured for high-resolution parsing of
    Russian-language documents.
    """
    ext = Path(local_path).suffix.lower()
    # Shared configuration for every Unstructured-based loader; passed as
    # keyword arguments instead of the original repeated `**{...}` unpacking.
    unstructured_kwargs = {"strategy": "hi_res", "languages": ["rus"]}
    if ext == ".pdf" and PyPDFLoader is not None:
        return PyPDFLoader(local_path)
    if ext in {".doc", ".docx"} and UnstructuredWordDocumentLoader is not None:
        return UnstructuredWordDocumentLoader(local_path, **unstructured_kwargs)
    if ext == ".pptx" and UnstructuredPowerPointLoader is not None:
        return UnstructuredPowerPointLoader(local_path, **unstructured_kwargs)
    if ext in {".xls", ".xlsx"} and UnstructuredExcelLoader is not None:
        return UnstructuredExcelLoader(local_path, **unstructured_kwargs)
    if ext == ".odt" and UnstructuredODTLoader is not None:
        return UnstructuredODTLoader(local_path, **unstructured_kwargs)
    if ext in {".txt", ".md"} and TextLoader is not None:
        return TextLoader(local_path, encoding="utf-8")
    return None
|
||||
|
||||
|
||||
def supported_loader_extensions() -> set[str]:
    """Return the set of file extensions for which a document loader was
    successfully imported at module load time."""
    loader_table = (
        (PyPDFLoader, (".pdf",)),
        (UnstructuredWordDocumentLoader, (".doc", ".docx")),
        (UnstructuredPowerPointLoader, (".pptx",)),
        (UnstructuredExcelLoader, (".xls", ".xlsx")),
        (UnstructuredODTLoader, (".odt",)),
        (TextLoader, (".txt", ".md")),
    )
    available: set[str] = set()
    for loader, extensions in loader_table:
        if loader is not None:
            available.update(extensions)
    return available
|
||||
|
||||
|
||||
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_langchain`` collection and return every
    distinct source file path found in the point payload metadata.

    Paths are taken from ``metadata.file_path`` with ``metadata.source`` as a
    fallback; non-string and empty values are ignored.
    """
    found: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_langchain",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            meta = (point.payload or {}).get("metadata") or {}
            file_path = meta.get("file_path") or meta.get("source")
            if isinstance(file_path, str) and file_path:
                found.add(file_path)
        # A None cursor means Qdrant has no further pages.
        if cursor is None:
            break
    return found
|
||||
|
||||
|
||||
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_llamaindex`` collection and return every
    distinct ``filename`` payload value.

    Non-string and empty values are ignored.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_llamaindex",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            filename = (point.payload or {}).get("filename")
            if isinstance(filename, str) and filename:
                seen.add(filename)
        # A None cursor means Qdrant has no further pages.
        if cursor is None:
            break
    return seen
|
||||
|
||||
|
||||
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of *matches* that is non-blank after stripping,
    or *fallback* when every entry is blank (or the list is empty)."""
    stripped_candidates = (candidate.strip() for candidate in matches)
    return next((candidate for candidate in stripped_candidates if candidate), fallback)
|
||||
|
||||
|
||||
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Build five themed groups of Russian RAG-evaluation questions from a
    document's extracted text.

    Mines the first 15,000 characters for years, numeric dates, numbers,
    quoted spans and organization-like mentions, then anchors question
    templates on the first hit of each kind, with generic fallbacks when
    nothing was found.  Section names mirror common RAG evaluation themes.
    """
    # Collapse all whitespace runs so the regexes operate on one flat stream.
    text = " ".join((text or "").split())
    text_preview = text[:15000]
    years = sorted(
        {
            int(m)
            for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)
            if 1900 <= int(m) <= 2199
        }
    )
    # Numeric dates such as dd.mm.yyyy or yyyy-mm-dd ("." "/" "-" separators).
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    # Spans wrapped in straight or guillemet quotes, 4-120 characters long.
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    # Russian organization/government keywords plus up to 80 chars of context.
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )

    # Each question has a fact-anchored variant and a generic fallback.
    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )
    # Prefer a quoted span, then an org-like mention, then the bare filename.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]
    # (The original also built an unused `entity_q` string here; removed.)

    # The .replace chains reword "in the document" to "in the materials",
    # since questions are asked against the whole corpus, not one file.
    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            year_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            date_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            num_q.replace("в документе", "в материалах").replace("документе", "материалах"),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
|
||||
|
||||
|
||||
def extract_document_text(docs: list[Any]) -> str:
    """Concatenate the text of loaded document objects, newline-separated and
    capped at 25,000 characters.

    Accepts both LangChain documents (``page_content``) and LlamaIndex-style
    objects (``text``); non-string and blank chunks are skipped.  Collection
    stops early once the accumulated text exceeds the cap.
    """
    chunks: list[str] = []
    # Running value of len(" ".join(chunks)): sum of chunk lengths plus one
    # separator per gap.  Tracking it incrementally avoids re-joining the
    # whole list on every iteration (the original check was O(n^2) overall).
    joined_len = 0
    for doc in docs:
        content = getattr(doc, "page_content", None)
        if content is None:
            content = getattr(doc, "text", None)
        if isinstance(content, str) and content.strip():
            stripped = content.strip()
            chunks.append(stripped)
            joined_len += len(stripped) + (1 if joined_len else 0)
            if joined_len > 25000:
                break
    return "\n".join(chunks)[:25000]
|
||||
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download *remote_path* from Yandex Disk to *local_path*.

    First resolves a temporary download href via the Disk REST API, then
    streams the file to disk in 1 MiB chunks so large documents do not have
    to fit in memory (the original buffered the whole body).

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
    """
    headers = {"Authorization": f"OAuth {token}"}
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=30,
    )
    response.raise_for_status()
    href = response.json()["href"]
    # stream=True defers the body download; with a `with` block the
    # connection is released even if writing fails mid-transfer.
    with requests.get(href, timeout=180, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in file_response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
|
||||
|
||||
|
||||
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download one YaDisk file to a temp location, extract its text with the
    matching loader, and always remove the temp file afterwards.

    Returns an empty string when no loader supports the file's extension.
    """
    # NamedTemporaryFile is used only to reserve a unique path with the
    # right suffix; the actual content is written by download_yadisk_file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(remote_path).suffix) as tmp:
        temp_path = tmp.name
    try:
        download_yadisk_file(remote_path, token, temp_path)
        loader = get_loader(temp_path)
        if loader is None:
            return ""
        return extract_document_text(loader.load())
    finally:
        # The temp file was created with delete=False, so clean it up here.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
def main() -> int:
    """Select 100 YaDisk documents present in both Qdrant collections and
    write a markdown file of per-document RAG evaluation questions.

    Returns 0 on success; raises RuntimeError when configuration is missing
    or fewer than 100 usable documents are found.
    """
    # Configuration (Qdrant host/port, YaDisk token) comes from the
    # langchain service's .env file.
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")

    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [str(x) for x in raw_paths if isinstance(x, str)]

    # Keep only paths with a loadable extension that use the YaDisk scheme.
    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]

    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)

    # A candidate must be indexed in BOTH collections: langchain stores the
    # raw remote path, llamaindex stores a derived deterministic filename.
    candidates = []
    for path in filtered_by_ext:
        if path not in langchain_paths:
            continue
        if llama_prefect_filename(path) not in llama_filenames:
            continue
        candidates.append(path)

    # Fixed seed so the same sample is drawn on every run.
    random.seed(42)
    random.shuffle(candidates)
    if len(candidates) < 100:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need 100"
        )

    # Try candidates in shuffled order until 100 documents yield usable text;
    # download/parse failures and empty extractions are skipped.
    rows: list[dict[str, Any]] = []
    attempts = 0
    for remote_path in candidates:
        if len(rows) >= 100:
            break
        attempts += 1
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            print(f" -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print(" -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/100] prepared questions for {remote_path}")

    if len(rows) < 100:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )

    # Render the markdown report: a fixed preamble followed by one section
    # per document with its grouped questions.
    lines: list[str] = []
    lines.append("# DOCUMENTS_TO_TEST")
    lines.append("")
    lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:")
    lines.append("- `documents_langchain` (Qdrant)")
    lines.append("- `documents_llamaindex` (Qdrant)")
    lines.append("")
    lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):")
    lines.append("- Response relevance / entity-fact recall")
    lines.append("- Numerical and temporal precision")
    lines.append("- Context precision")
    lines.append("- Faithfulness / non-hallucination")
    lines.append("- Reasoning / synthesis")
    lines.append("")
    lines.append(
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._"
    )
    lines.append("")

    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            for q in qs:
                lines.append(f"- {q}")
            lines.append("")

    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user