Evaluation for RAG systems

This commit is contained in:
2026-03-11 22:30:02 +03:00
parent 5721bad117
commit 6c953a327f
11 changed files with 31897 additions and 1 deletions

View File

@@ -0,0 +1,377 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import json
import os
import random
import re
import tempfile
from collections import defaultdict
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
import requests
try:
from langchain_community.document_loaders import PyPDFLoader, TextLoader
except Exception: # pragma: no cover
PyPDFLoader = None
TextLoader = None
try:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
except Exception: # pragma: no cover
UnstructuredWordDocumentLoader = None
try:
from langchain_community.document_loaders import UnstructuredPowerPointLoader
except Exception: # pragma: no cover
UnstructuredPowerPointLoader = None
try:
from langchain_community.document_loaders import UnstructuredExcelLoader
except Exception: # pragma: no cover
UnstructuredExcelLoader = None
try:
from langchain_community.document_loaders import UnstructuredODTLoader
except Exception: # pragma: no cover
UnstructuredODTLoader = None
# Paths are anchored at this script's own directory.
ROOT = Path(__file__).resolve().parent
# Per-framework RAG service directories; the langchain one holds the .env read in main().
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
# Input: JSON list of YaDisk remote paths.
YADISK_JSON = ROOT / "yadisk_files.json"
# Output: generated markdown question set.
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
def safe_stem_from_remote(remote_path: str) -> str:
    """Return the filename stem of *remote_path*, sanitized for local use.

    Every character that is not alphanumeric, '-' or '_' is replaced with
    '_'; an empty stem falls back to "file".
    """
    raw_stem = Path(Path(remote_path).name).stem or "file"
    sanitized: list[str] = []
    for char in raw_stem:
        sanitized.append(char if char.isalnum() or char in "-_" else "_")
    return "".join(sanitized)
def llama_prefect_filename(remote_path: str) -> str:
    """Reproduce the local filename used by the llamaindex ingestion flow.

    Format: sanitized stem + '_' + first 10 hex chars of the md5 of the full
    remote path + the original file extension.
    """
    basename = Path(remote_path).name or "downloaded_file"
    extension = Path(basename).suffix
    short_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10]
    return f"{safe_stem_from_remote(remote_path)}_{short_hash}{extension}"
def get_loader(local_path: str):
    """Pick a langchain document loader for *local_path* by file extension.

    Returns None when the extension is unsupported or the matching loader
    class failed to import at module load time.
    """
    unstructured_kwargs = {"strategy": "hi_res", "languages": ["rus"]}
    extension = Path(local_path).suffix.lower()
    if extension == ".pdf":
        if PyPDFLoader is not None:
            return PyPDFLoader(local_path)
    elif extension in {".doc", ".docx"}:
        if UnstructuredWordDocumentLoader is not None:
            return UnstructuredWordDocumentLoader(local_path, **unstructured_kwargs)
    elif extension == ".pptx":
        if UnstructuredPowerPointLoader is not None:
            return UnstructuredPowerPointLoader(local_path, **unstructured_kwargs)
    elif extension in {".xls", ".xlsx"}:
        if UnstructuredExcelLoader is not None:
            return UnstructuredExcelLoader(local_path, **unstructured_kwargs)
    elif extension == ".odt":
        if UnstructuredODTLoader is not None:
            return UnstructuredODTLoader(local_path, **unstructured_kwargs)
    elif extension in {".txt", ".md"}:
        if TextLoader is not None:
            return TextLoader(local_path, encoding="utf-8")
    return None
def supported_loader_extensions() -> set[str]:
    """Return the set of file extensions get_loader can currently handle.

    Only extensions whose loader class imported successfully are included.
    """
    loader_to_exts = [
        (PyPDFLoader, {".pdf"}),
        (UnstructuredWordDocumentLoader, {".doc", ".docx"}),
        (UnstructuredPowerPointLoader, {".pptx"}),
        (UnstructuredExcelLoader, {".xls", ".xlsx"}),
        (UnstructuredODTLoader, {".odt"}),
        (TextLoader, {".txt", ".md"}),
    ]
    available: set[str] = set()
    for loader_cls, extensions in loader_to_exts:
        if loader_cls is not None:
            available |= extensions
    return available
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_langchain`` collection and gather every
    payload ``metadata.file_path`` (falling back to ``metadata.source``).
    """
    collected: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_langchain",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            metadata = (point.payload or {}).get("metadata") or {}
            file_path = metadata.get("file_path") or metadata.get("source")
            if isinstance(file_path, str) and file_path:
                collected.add(file_path)
        # A None cursor signals the final page.
        if cursor is None:
            break
    return collected
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_llamaindex`` collection and gather every
    non-empty string ``filename`` payload field.
    """
    collected: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_llamaindex",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            filename = (point.payload or {}).get("filename")
            if isinstance(filename, str) and filename:
                collected.add(filename)
        # A None cursor signals the final page.
        if cursor is None:
            break
    return collected
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of *matches* that is non-empty after stripping.

    Falls back to *fallback* when no such entry exists.
    """
    stripped = (candidate.strip() for candidate in matches)
    return next((candidate for candidate in stripped if candidate), fallback)
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Generate per-document evaluation questions grouped by RAG metric theme.

    Mines the first 15k characters of *text* for years, dates, numbers,
    quoted phrases and organisation-like mentions, then fills Russian
    question templates with them.  Returns a mapping of
    {section title: [question, ...]} with five fixed sections.
    """
    # Collapse all whitespace so the regexes see a single-line view of the text.
    text = " ".join((text or "").split())
    text_preview = text[:15000]
    # Years 1900-2199; findall returns the alternation group text, hence int(m).
    years = sorted(
        {
            int(m)
            for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)
            if 1900 <= int(m) <= 2199
        }
    )
    # Numeric dates in d.m.y or y-m-d style with '.', '/' or '-' separators.
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    # Any standalone number of two or more digits.
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    # Phrases inside straight quotes or Russian guillemets, 4-120 chars long.
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    # Russian legal-entity/government keywords plus up to 80 trailing chars.
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )
    # Each question has a concrete variant (anchored to an extracted value)
    # and a generic fallback used when nothing was extracted.
    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )
    # Prefer a quoted phrase, then an org mention, then the bare filename.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    # Derive a human-readable topic hint from the filename stem.
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]
    entity_q = f"Что в документе говорится про «{entity}»?"
    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            # Rephrase single-document wording to corpus-level wording.
            year_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            date_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            num_q.replace("в документе", "в материалах").replace("документе", "материалах"),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
def extract_document_text(docs: list[Any]) -> str:
    """Join the non-empty text of *docs* into one newline-separated string.

    Each doc may carry its content in ``page_content`` (langchain) or
    ``text`` (llamaindex).  Collection stops once the accumulated text
    exceeds 25k chars, and the result is capped at 25k chars.
    """
    collected: list[str] = []
    for document in docs:
        body = getattr(document, "page_content", None)
        if body is None:
            body = getattr(document, "text", None)
        if not (isinstance(body, str) and body.strip()):
            continue
        collected.append(body.strip())
        if len(" ".join(collected)) > 25000:
            break
    return "\n".join(collected)[:25000]
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download a Yandex Disk file at *remote_path* to *local_path*.

    First resolves a temporary download href via the YaDisk REST API using
    OAuth *token*, then fetches the bytes.  Raises ``requests.HTTPError``
    when either request fails.
    """
    auth_header = {"Authorization": f"OAuth {token}"}
    meta_response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=auth_header,
        params={"path": remote_path},
        timeout=30,
    )
    meta_response.raise_for_status()
    download_url = meta_response.json()["href"]
    # The href download itself may be large, hence the longer timeout.
    payload = requests.get(download_url, timeout=180)
    payload.raise_for_status()
    with open(local_path, "wb") as out:
        out.write(payload.content)
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download *remote_path* to a temp file and extract its text.

    Returns "" when no loader supports the file's extension.  The temp file
    is always removed, even on download or parsing errors.
    """
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=Path(remote_path).suffix
    ) as tmp_file:
        tmp_path = tmp_file.name
    try:
        download_yadisk_file(remote_path, token, tmp_path)
        loader = get_loader(tmp_path)
        if loader is None:
            return ""
        return extract_document_text(loader.load())
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
def main() -> int:
    """Build DOCUMENTS_TO_TEST.md from 100 verified YaDisk documents.

    Selects documents present in BOTH Qdrant collections, downloads each
    until 100 yield usable text, generates themed evaluation questions and
    writes the markdown report.  Returns 0 on success; raises RuntimeError
    on missing configuration or an insufficient document count.
    """
    # Qdrant/YaDisk settings come from the langchain service's .env file.
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")
    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [str(x) for x in raw_paths if isinstance(x, str)]
    # Keep only disk:/ paths whose extension has an importable loader.
    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]
    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)
    # A candidate must be indexed in BOTH collections; the llamaindex
    # collection stores the hashed local filename, not the remote path.
    candidates = []
    for path in filtered_by_ext:
        if path not in langchain_paths:
            continue
        if llama_prefect_filename(path) not in llama_filenames:
            continue
        candidates.append(path)
    # Fixed seed keeps the 100-document sample reproducible between runs.
    random.seed(42)
    random.shuffle(candidates)
    if len(candidates) < 100:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need 100"
        )
    rows: list[dict[str, Any]] = []
    attempts = 0
    # Walk the shuffled candidates until 100 yield non-empty text; individual
    # download/parse failures are skipped rather than fatal.
    for remote_path in candidates:
        if len(rows) >= 100:
            break
        attempts += 1
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            print(f" -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print(" -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/100] prepared questions for {remote_path}")
    if len(rows) < 100:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )
    # Render the markdown report: a fixed header followed by one section per
    # document with its themed question lists.
    lines: list[str] = []
    lines.append("# DOCUMENTS_TO_TEST")
    lines.append("")
    lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:")
    lines.append("- `documents_langchain` (Qdrant)")
    lines.append("- `documents_llamaindex` (Qdrant)")
    lines.append("")
    lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):")
    lines.append("- Response relevance / entity-fact recall")
    lines.append("- Numerical and temporal precision")
    lines.append("- Context precision")
    lines.append("- Faithfulness / non-hallucination")
    lines.append("- Reasoning / synthesis")
    lines.append("")
    lines.append(
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._"
    )
    lines.append("")
    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            for q in qs:
                lines.append(f"- {q}")
            lines.append("")
    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    raise SystemExit(main())