evaluation for rag systems
This commit is contained in:
377
generate_documents_to_test.py
Normal file
377
generate_documents_to_test.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
|
||||
import requests
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
||||
except Exception: # pragma: no cover
|
||||
PyPDFLoader = None
|
||||
TextLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredWordDocumentLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredPowerPointLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredExcelLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredExcelLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredODTLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredODTLoader = None
|
||||
|
||||
# All paths are resolved relative to this script's own location so the tool
# can be run from any working directory.
ROOT = Path(__file__).resolve().parent
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
# NOTE(review): LLAMAINDEX_DIR is not referenced anywhere in this script —
# confirm whether it is still needed.
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
# JSON list of YaDisk remote paths considered as test-document candidates.
YADISK_JSON = ROOT / "yadisk_files.json"
# Markdown report written by main().
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
|
||||
|
||||
|
||||
def safe_stem_from_remote(remote_path: str) -> str:
    """Return the file stem of *remote_path* sanitized for use in filenames.

    Every character that is not alphanumeric, ``-`` or ``_`` is replaced
    with ``_``.  Falls back to ``"file"`` when the path has no usable stem.
    """
    raw_stem = Path(Path(remote_path).name).stem
    if not raw_stem:
        raw_stem = "file"
    sanitized: list[str] = []
    for ch in raw_stem:
        if ch.isalnum() or ch == "-" or ch == "_":
            sanitized.append(ch)
        else:
            sanitized.append("_")
    return "".join(sanitized)
|
||||
|
||||
|
||||
def llama_prefect_filename(remote_path: str) -> str:
    """Reproduce the deterministic local filename used by the LlamaIndex
    ingestion flow for a downloaded YaDisk file.

    The result is ``<sanitized stem>_<first 10 hex chars of the md5 of the
    full remote path><original suffix>``.
    """
    basename = Path(remote_path).name or "downloaded_file"
    extension = Path(basename).suffix
    short_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10]
    return f"{safe_stem_from_remote(remote_path)}_{short_hash}{extension}"
|
||||
|
||||
|
||||
def get_loader(local_path: str):
    """Return a LangChain document loader suited to *local_path*'s extension.

    Returns ``None`` when the extension is unsupported or the corresponding
    loader failed to import at module load time (the module-level try/except
    guards set the loader name to ``None`` in that case).

    Unstructured-based loaders are configured for high-resolution parsing of
    Russian-language documents.
    """
    ext = Path(local_path).suffix.lower()
    # Shared configuration for every Unstructured-based loader; passed as
    # keyword arguments instead of the original repeated `**{...}` unpacking.
    unstructured_kwargs = {"strategy": "hi_res", "languages": ["rus"]}
    if ext == ".pdf" and PyPDFLoader is not None:
        return PyPDFLoader(local_path)
    if ext in {".doc", ".docx"} and UnstructuredWordDocumentLoader is not None:
        return UnstructuredWordDocumentLoader(local_path, **unstructured_kwargs)
    if ext == ".pptx" and UnstructuredPowerPointLoader is not None:
        return UnstructuredPowerPointLoader(local_path, **unstructured_kwargs)
    if ext in {".xls", ".xlsx"} and UnstructuredExcelLoader is not None:
        return UnstructuredExcelLoader(local_path, **unstructured_kwargs)
    if ext == ".odt" and UnstructuredODTLoader is not None:
        return UnstructuredODTLoader(local_path, **unstructured_kwargs)
    if ext in {".txt", ".md"} and TextLoader is not None:
        return TextLoader(local_path, encoding="utf-8")
    return None
|
||||
|
||||
|
||||
def supported_loader_extensions() -> set[str]:
    """Return the set of file extensions for which a document loader was
    successfully imported at module load time."""
    loader_table = (
        (PyPDFLoader, (".pdf",)),
        (UnstructuredWordDocumentLoader, (".doc", ".docx")),
        (UnstructuredPowerPointLoader, (".pptx",)),
        (UnstructuredExcelLoader, (".xls", ".xlsx")),
        (UnstructuredODTLoader, (".odt",)),
        (TextLoader, (".txt", ".md")),
    )
    available: set[str] = set()
    for loader, extensions in loader_table:
        if loader is not None:
            available.update(extensions)
    return available
|
||||
|
||||
|
||||
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_langchain`` collection and return every
    distinct source file path found in the point payload metadata.

    Paths are taken from ``metadata.file_path`` with ``metadata.source`` as a
    fallback; non-string and empty values are ignored.
    """
    found: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_langchain",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            meta = (point.payload or {}).get("metadata") or {}
            file_path = meta.get("file_path") or meta.get("source")
            if isinstance(file_path, str) and file_path:
                found.add(file_path)
        # A None cursor means Qdrant has no further pages.
        if cursor is None:
            break
    return found
|
||||
|
||||
|
||||
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Scroll the whole ``documents_llamaindex`` collection and return every
    distinct ``filename`` payload value.

    Non-string and empty values are ignored.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_llamaindex",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            filename = (point.payload or {}).get("filename")
            if isinstance(filename, str) and filename:
                seen.add(filename)
        # A None cursor means Qdrant has no further pages.
        if cursor is None:
            break
    return seen
|
||||
|
||||
|
||||
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of *matches* that is non-blank after stripping,
    or *fallback* when every entry is blank (or the list is empty)."""
    stripped_candidates = (candidate.strip() for candidate in matches)
    return next((candidate for candidate in stripped_candidates if candidate), fallback)
|
||||
|
||||
|
||||
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Build five themed groups of Russian RAG-evaluation questions from a
    document's extracted text.

    Mines the first 15,000 characters for years, numeric dates, numbers,
    quoted spans and organization-like mentions, then anchors question
    templates on the first hit of each kind, with generic fallbacks when
    nothing was found.  Section names mirror common RAG evaluation themes.
    """
    # Collapse all whitespace runs so the regexes operate on one flat stream.
    text = " ".join((text or "").split())
    text_preview = text[:15000]
    years = sorted(
        {
            int(m)
            for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)
            if 1900 <= int(m) <= 2199
        }
    )
    # Numeric dates such as dd.mm.yyyy or yyyy-mm-dd ("." "/" "-" separators).
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    # Spans wrapped in straight or guillemet quotes, 4-120 characters long.
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    # Russian organization/government keywords plus up to 80 chars of context.
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )

    # Each question has a fact-anchored variant and a generic fallback.
    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )
    # Prefer a quoted span, then an org-like mention, then the bare filename.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]
    # (The original also built an unused `entity_q` string here; removed.)

    # The .replace chains reword "in the document" to "in the materials",
    # since questions are asked against the whole corpus, not one file.
    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            year_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            date_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            num_q.replace("в документе", "в материалах").replace("документе", "материалах"),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
|
||||
|
||||
|
||||
def extract_document_text(docs: list[Any]) -> str:
    """Concatenate the text of loaded document objects, newline-separated and
    capped at 25,000 characters.

    Accepts both LangChain documents (``page_content``) and LlamaIndex-style
    objects (``text``); non-string and blank chunks are skipped.  Collection
    stops early once the accumulated text exceeds the cap.
    """
    chunks: list[str] = []
    # Running value of len(" ".join(chunks)): sum of chunk lengths plus one
    # separator per gap.  Tracking it incrementally avoids re-joining the
    # whole list on every iteration (the original check was O(n^2) overall).
    joined_len = 0
    for doc in docs:
        content = getattr(doc, "page_content", None)
        if content is None:
            content = getattr(doc, "text", None)
        if isinstance(content, str) and content.strip():
            stripped = content.strip()
            chunks.append(stripped)
            joined_len += len(stripped) + (1 if joined_len else 0)
            if joined_len > 25000:
                break
    return "\n".join(chunks)[:25000]
|
||||
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download *remote_path* from Yandex Disk to *local_path*.

    First resolves a temporary download href via the Disk REST API, then
    streams the file to disk in 1 MiB chunks so large documents do not have
    to fit in memory (the original buffered the whole body).

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
    """
    headers = {"Authorization": f"OAuth {token}"}
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=30,
    )
    response.raise_for_status()
    href = response.json()["href"]
    # stream=True defers the body download; with a `with` block the
    # connection is released even if writing fails mid-transfer.
    with requests.get(href, timeout=180, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in file_response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
|
||||
|
||||
|
||||
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download one YaDisk file to a temp location, extract its text with the
    matching loader, and always remove the temp file afterwards.

    Returns an empty string when no loader supports the file's extension.
    """
    # NamedTemporaryFile is used only to reserve a unique path with the
    # right suffix; the actual content is written by download_yadisk_file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(remote_path).suffix) as tmp:
        temp_path = tmp.name
    try:
        download_yadisk_file(remote_path, token, temp_path)
        loader = get_loader(temp_path)
        if loader is None:
            return ""
        return extract_document_text(loader.load())
    finally:
        # The temp file was created with delete=False, so clean it up here.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
def main() -> int:
    """Select 100 YaDisk documents present in both Qdrant collections and
    write a markdown file of per-document RAG evaluation questions.

    Returns 0 on success; raises RuntimeError when configuration is missing
    or fewer than 100 usable documents are found.
    """
    # Configuration (Qdrant host/port, YaDisk token) comes from the
    # langchain service's .env file.
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")

    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [str(x) for x in raw_paths if isinstance(x, str)]

    # Keep only paths with a loadable extension that use the YaDisk scheme.
    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]

    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)

    # A candidate must be indexed in BOTH collections: langchain stores the
    # raw remote path, llamaindex stores a derived deterministic filename.
    candidates = []
    for path in filtered_by_ext:
        if path not in langchain_paths:
            continue
        if llama_prefect_filename(path) not in llama_filenames:
            continue
        candidates.append(path)

    # Fixed seed so the same sample is drawn on every run.
    random.seed(42)
    random.shuffle(candidates)
    if len(candidates) < 100:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need 100"
        )

    # Try candidates in shuffled order until 100 documents yield usable text;
    # download/parse failures and empty extractions are skipped.
    rows: list[dict[str, Any]] = []
    attempts = 0
    for remote_path in candidates:
        if len(rows) >= 100:
            break
        attempts += 1
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            print(f" -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print(" -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/100] prepared questions for {remote_path}")

    if len(rows) < 100:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )

    # Render the markdown report: a fixed preamble followed by one section
    # per document with its grouped questions.
    lines: list[str] = []
    lines.append("# DOCUMENTS_TO_TEST")
    lines.append("")
    lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:")
    lines.append("- `documents_langchain` (Qdrant)")
    lines.append("- `documents_llamaindex` (Qdrant)")
    lines.append("")
    lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):")
    lines.append("- Response relevance / entity-fact recall")
    lines.append("- Numerical and temporal precision")
    lines.append("- Context precision")
    lines.append("- Faithfulness / non-hallucination")
    lines.append("- Reasoning / synthesis")
    lines.append("")
    lines.append(
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._"
    )
    lines.append("")

    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            for q in qs:
                lines.append(f"- {q}")
            lines.append("")

    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user