evaluation for rag systems
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
data-unpacked-archives
|
||||
data-broken-archives
|
||||
.env
|
||||
|
||||
2314
DOCUMENTS_TO_TEST.md
Normal file
2314
DOCUMENTS_TO_TEST.md
Normal file
File diff suppressed because it is too large
Load Diff
28000
EVALUATION_RESULT.md
Normal file
28000
EVALUATION_RESULT.md
Normal file
File diff suppressed because it is too large
Load Diff
377
generate_documents_to_test.py
Normal file
377
generate_documents_to_test.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
|
||||
import requests
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
||||
except Exception: # pragma: no cover
|
||||
PyPDFLoader = None
|
||||
TextLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredWordDocumentLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredPowerPointLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredExcelLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredExcelLoader = None
|
||||
|
||||
try:
|
||||
from langchain_community.document_loaders import UnstructuredODTLoader
|
||||
except Exception: # pragma: no cover
|
||||
UnstructuredODTLoader = None
|
||||
|
||||
ROOT = Path(__file__).resolve().parent
|
||||
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
|
||||
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
|
||||
YADISK_JSON = ROOT / "yadisk_files.json"
|
||||
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
|
||||
|
||||
|
||||
def safe_stem_from_remote(remote_path: str) -> str:
    """Return a sanitized stem taken from the last component of ``remote_path``.

    Alphanumeric characters, ``-`` and ``_`` are kept as they are; every other
    character is replaced with ``_``. If the path yields an empty stem, the
    literal ``"file"`` is used instead.
    """
    base_name = Path(remote_path).name
    stem = Path(base_name).stem
    if not stem:
        stem = "file"

    sanitized = []
    for char in stem:
        keep = char.isalnum() or char in ("-", "_")
        sanitized.append(char if keep else "_")
    return "".join(sanitized)
|
||||
|
||||
|
||||
def llama_prefect_filename(remote_path: str) -> str:
    """Build the name ``<safe stem>_<digest><suffix>`` for a remote path.

    The digest is the first 10 hex characters of the MD5 of the full remote
    path (UTF-8 encoded); the suffix comes from the path's final component.
    """
    # NOTE(review): presumably mirrors the naming used when the LlamaIndex
    # collection was populated -- confirm against that ingestion code.
    basename = Path(remote_path).name or "downloaded_file"
    extension = Path(basename).suffix
    path_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()
    return "{}_{}{}".format(
        safe_stem_from_remote(remote_path), path_hash[:10], extension
    )
|
||||
|
||||
|
||||
def get_loader(local_path: str):
    """Return a document loader for ``local_path`` selected by file extension.

    Returns ``None`` when the extension is not handled, or when the loader
    class for that extension could not be imported (its name is ``None``).
    """
    extension = Path(local_path).suffix.lower()

    if extension == ".pdf":
        return PyPDFLoader(local_path) if PyPDFLoader is not None else None

    if extension in {".txt", ".md"}:
        if TextLoader is None:
            return None
        return TextLoader(local_path, encoding="utf-8")

    # Every Unstructured* loader is constructed with the same options.
    unstructured_options = {"strategy": "hi_res", "languages": ["rus"]}
    unstructured_by_extension = {
        ".doc": UnstructuredWordDocumentLoader,
        ".docx": UnstructuredWordDocumentLoader,
        ".pptx": UnstructuredPowerPointLoader,
        ".xls": UnstructuredExcelLoader,
        ".xlsx": UnstructuredExcelLoader,
        ".odt": UnstructuredODTLoader,
    }
    loader_cls = unstructured_by_extension.get(extension)
    if loader_cls is None:
        return None
    return loader_cls(local_path, **unstructured_options)
|
||||
|
||||
|
||||
def supported_loader_extensions() -> set[str]:
    """Return the file extensions whose loader class was imported successfully."""
    loader_extensions = (
        (PyPDFLoader, (".pdf",)),
        (UnstructuredWordDocumentLoader, (".doc", ".docx")),
        (UnstructuredPowerPointLoader, (".pptx",)),
        (UnstructuredExcelLoader, (".xls", ".xlsx")),
        (UnstructuredODTLoader, (".odt",)),
        (TextLoader, (".txt", ".md")),
    )
    supported: set[str] = set()
    for loader_cls, extensions in loader_extensions:
        # A loader name is None when its optional import failed.
        if loader_cls is not None:
            supported.update(extensions)
    return supported
|
||||
|
||||
|
||||
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Collect the file paths stored in the ``documents_langchain`` collection.

    Scrolls the collection in pages of 1000 points (payload only, no vectors)
    and, for each point, takes ``metadata.file_path`` or, failing that,
    ``metadata.source``. Only non-empty string values are kept.
    """
    found: set[str] = set()
    next_offset = None
    has_more = True
    while has_more:
        batch, next_offset = client.scroll(
            collection_name="documents_langchain",
            offset=next_offset,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        for point in batch or ():
            metadata = (point.payload or {}).get("metadata") or {}
            file_path = metadata.get("file_path") or metadata.get("source")
            if isinstance(file_path, str) and file_path:
                found.add(file_path)
        # Stop on an empty page or when no further offset is returned.
        has_more = bool(batch) and next_offset is not None
    return found
|
||||
|
||||
|
||||
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Collect the ``filename`` payload values of the ``documents_llamaindex`` collection.

    Scrolls the collection in pages of 1000 points (payload only, no vectors)
    and keeps every non-empty string found under the ``filename`` payload key.
    """
    filenames: set[str] = set()
    next_offset = None
    has_more = True
    while has_more:
        batch, next_offset = client.scroll(
            collection_name="documents_llamaindex",
            offset=next_offset,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        for point in batch or ():
            filename = (point.payload or {}).get("filename")
            if isinstance(filename, str) and filename:
                filenames.add(filename)
        # Stop on an empty page or when no further offset is returned.
        has_more = bool(batch) and next_offset is not None
    return filenames
|
||||
|
||||
|
||||
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of ``matches`` that is non-empty after stripping.

    The returned value is the stripped string. ``fallback`` is returned when
    no entry qualifies. (Despite the name, no uniqueness check is performed.)
    """
    stripped = (candidate.strip() for candidate in matches)
    return next((value for value in stripped if value), fallback)
|
||||
|
||||
|
||||
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Generate templated evaluation questions for one document.

    The text is whitespace-normalized and only its first 15000 characters are
    scanned for years, dates, multi-digit numbers, quoted phrases and
    organisation-like phrases. The first hit of each kind (the smallest year
    for years) is embedded into the question templates; generic wording is
    used when nothing was found.

    Args:
        remote_path: Remote path of the document; its file name is the
            fallback entity and its stem supplies the topic hint.
        text: Extracted document text (``None`` or empty is tolerated).

    Returns:
        A mapping from evaluation-section title to that section's questions.
    """
    text = " ".join((text or "").split())
    text_preview = text[:15000]

    # The pattern itself restricts matches to 1900-2199, so no extra range
    # filter is needed.
    years = sorted(
        {int(m) for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)}
    )
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )

    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )

    # Prefer a quoted phrase, then an organisation-like phrase, then the file name.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]

    def _to_corpus_wording(question: str) -> str:
        # Re-word "the document" as "the materials". A single replace is
        # enough: it also turns "в документе" into "в материалах".
        return question.replace("документе", "материалах")

    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            _to_corpus_wording(year_q),
            _to_corpus_wording(date_q),
            _to_corpus_wording(num_q),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
|
||||
|
||||
|
||||
def extract_document_text(docs: list[Any], max_chars: int = 25000) -> str:
    """Concatenate the text of loaded documents, capped at ``max_chars``.

    For each document the ``page_content`` attribute is used, falling back to
    ``text`` when ``page_content`` is missing or ``None``. Blank or non-string
    content is skipped. Collection stops once the gathered text exceeds
    ``max_chars``.

    Args:
        docs: Objects exposing ``page_content`` or ``text``.
        max_chars: Maximum length of the returned string.

    Returns:
        The stripped chunks joined by newlines, truncated to ``max_chars``.
    """
    chunks: list[str] = []
    # Running length of the chunks joined by a one-character separator,
    # tracked incrementally instead of re-joining all chunks on every pass.
    joined_length = -1
    for doc in docs:
        content = getattr(doc, "page_content", None)
        if content is None:
            content = getattr(doc, "text", None)
        if isinstance(content, str) and content.strip():
            piece = content.strip()
            chunks.append(piece)
            joined_length += len(piece) + 1
        if joined_length > max_chars:
            break
    return "\n".join(chunks)[:max_chars]
|
||||
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download a Yandex Disk file to ``local_path``.

    First asks the Yandex Disk REST API for a download link for
    ``remote_path`` (30 s timeout), then fetches that link (180 s timeout)
    and writes the body to ``local_path``.

    Raises:
        requests.HTTPError: If either request returns an error status.
        KeyError: If the link response carries no ``href`` field.
    """
    headers = {"Authorization": f"OAuth {token}"}
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=30,
    )
    response.raise_for_status()
    href = response.json()["href"]

    # Stream to disk in chunks so large files are never held fully in memory.
    with requests.get(href, timeout=180, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
|
||||
|
||||
|
||||
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download a Yandex Disk file to a temp file and return its extracted text.

    Returns an empty string when no loader is available for the file's
    extension. The temporary file is removed whether or not loading succeeds.
    """
    extension = Path(remote_path).suffix
    # delete=False: only the reserved name is needed; cleanup happens below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
        temp_path = handle.name

    try:
        download_yadisk_file(remote_path, token, temp_path)
        loader = get_loader(temp_path)
        return extract_document_text(loader.load()) if loader is not None else ""
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
def main(target_count: int = 100) -> int:
    """Build ``DOCUMENTS_TO_TEST.md`` from documents present in both Qdrant collections.

    Steps:
      1. Read settings from the LangChain service ``.env`` file.
      2. Load the Yandex Disk path list and keep ``disk:/`` paths whose
         extension has an available loader.
      3. Keep only paths found in both Qdrant collections.
      4. Shuffle deterministically, then download and read documents until
         ``target_count`` of them yielded text, building questions for each.
      5. Write the Markdown dataset.

    Args:
        target_count: Number of documents to include in the dataset.

    Returns:
        ``0`` on success.

    Raises:
        RuntimeError: On missing settings, a malformed path list, or too few
            usable documents.
    """
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")

    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [x for x in raw_paths if isinstance(x, str)]

    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]

    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)

    # A candidate must be indexed by both RAG systems.
    candidates = [
        path
        for path in filtered_by_ext
        if path in langchain_paths and llama_prefect_filename(path) in llama_filenames
    ]

    # A private, fixed-seed generator gives the same order as seeding the
    # module-level generator with 42, without touching global random state.
    random.Random(42).shuffle(candidates)
    if len(candidates) < target_count:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need {target_count}"
        )

    rows: list[dict[str, Any]] = []
    for attempts, remote_path in enumerate(candidates, start=1):
        if len(rows) >= target_count:
            break
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            # Best effort: one unreadable document must not abort the run.
            print(f"  -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print("  -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/{target_count}] prepared questions for {remote_path}")

    if len(rows) < target_count:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )

    lines: list[str] = [
        "# DOCUMENTS_TO_TEST",
        "",
        f"This dataset contains {target_count} YaDisk documents that were verified as present in both:",
        "- `documents_langchain` (Qdrant)",
        "- `documents_llamaindex` (Qdrant)",
        "",
        "Question sections are aligned with common RAG evaluation themes (retrieval + generation):",
        "- Response relevance / entity-fact recall",
        "- Numerical and temporal precision",
        "- Context precision",
        "- Faithfulness / non-hallucination",
        "- Reasoning / synthesis",
        "",
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._",
        "",
    ]

    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            lines.extend(f"- {q}" for q in qs)
            lines.append("")

    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
782
rag_evaluation.py
Normal file
782
rag_evaluation.py
Normal file
@@ -0,0 +1,782 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RAG Evaluation Script
|
||||
|
||||
Evaluates two RAG systems (LangChain and LlamaIndex) using OpenAI-compatible LLM
|
||||
for scoring, with Yandex Disk integration for document verification.
|
||||
|
||||
Usage:
|
||||
python rag_evaluation.py 1:10 # Evaluate questions 1 to 10
|
||||
python rag_evaluation.py 5:20 # Evaluate questions 5 to 20
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import requests
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
# OpenAI-compatible LLM settings
|
||||
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
|
||||
OPENAI_CHAT_URL = os.getenv("OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1")
|
||||
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")
|
||||
|
||||
# RAG system URLs
|
||||
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
|
||||
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"
|
||||
|
||||
# Yandex Disk
|
||||
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")
|
||||
|
||||
# File paths
|
||||
INPUT_MD = Path(__file__).parent / "DOCUMENTS_TO_TEST.md"
|
||||
OUTPUT_MD = Path(__file__).parent / "EVALUATION_RESULT.md"
|
||||
|
||||
# Timeouts
|
||||
RAG_TIMEOUT = 120 # seconds
|
||||
LLM_TIMEOUT = 60 # seconds
|
||||
YADISK_TIMEOUT = 60 # seconds
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class QuestionItem:
    """A single evaluation question plus the answers and scores recorded for it."""

    # Title of the "### ..." section the question was listed under.
    section: str
    # The question text, without the leading "- " list marker.
    question: str
    # Number shown in the report heading for this question.
    question_number: int = 0

    # Answers returned by each RAG system.
    langchain_answer: str = ""
    llamaindex_answer: str = ""

    # Scores assigned to each system's answer.
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0

    # Verdict label; the summary counts "LangChain", "LlamaIndex" and "Tie".
    winner: str = "Tie"
    # Explanation printed next to the verdict in the report.
    rationale: str = ""
    # Error text, if any, recorded for this question.
    error: str = ""
|
||||
|
||||
|
||||
@dataclass
class DocumentItem:
    """A document entry parsed from the questions file."""

    # The full "## ..." heading line of the document block (stripped).
    header: str
    # Path taken from the first backtick-quoted span of the heading, or "".
    path: str
    # Ordered (section title, questions) pairs; each instance gets its own list.
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Document Parser
|
||||
# =============================================================================
|
||||
|
||||
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Separate the markdown into preamble lines and per-document blocks.

    A document block begins at every line starting with ``"## "`` and runs
    until the next such line. Lines before the first block form the preamble.

    Returns:
        ``(preamble_lines, blocks)`` where each block is one newline-joined
        string that starts with its ``"## "`` heading.
    """
    preamble: list[str] = []
    blocks: list[list[str]] = []

    for raw_line in md_text.splitlines():
        if raw_line.startswith("## "):
            blocks.append([raw_line])
        elif blocks:
            blocks[-1].append(raw_line)
        else:
            preamble.append(raw_line)

    return preamble, ["\n".join(block_lines) for block_lines in blocks]
|
||||
|
||||
|
||||
def parse_document_block(block: str) -> DocumentItem:
    """Parse one document block into a ``DocumentItem``.

    The first line is the heading; the document path is the first
    backtick-quoted span in it (``""`` if there is none). Lines starting with
    ``"### "`` open a section and lines starting with ``"- "`` add a question
    to the current section.

    Args:
        block: Text of one block, starting with its ``"## "`` heading.

    Returns:
        The parsed document. An empty ``block`` yields an item with an empty
        header, empty path and no sections.
    """
    lines = block.splitlines()
    if not lines:
        # No heading line to read; indexing lines[0] would raise IndexError.
        return DocumentItem(header="", path="", sections=[])

    header = lines[0].strip()

    # Extract file path from backticks
    m = re.search(r"`([^`]+)`", header)
    doc_path = m.group(1) if m else ""

    sections: list[tuple[str, list[QuestionItem]]] = []
    current_section = ""
    current_questions: list[QuestionItem] = []

    for line in lines[1:]:
        if line.startswith("### "):
            if current_section:
                sections.append((current_section, current_questions))
            current_section = line[4:].strip()
            # NOTE: questions listed before the first section heading are
            # discarded here, because no named section exists to hold them.
            current_questions = []
        elif line.startswith("- "):
            q = line[2:].strip()
            if q:
                current_questions.append(
                    QuestionItem(section=current_section, question=q)
                )

    if current_section:
        sections.append((current_section, current_questions))

    return DocumentItem(header=header, path=doc_path, sections=sections)
|
||||
|
||||
|
||||
def parse_all_documents(md_path: Path) -> list[DocumentItem]:
    """Read the markdown file at ``md_path`` (UTF-8) and parse every document block in it."""
    markdown = md_path.read_text(encoding="utf-8")
    # The preamble before the first document heading is not needed here.
    blocks = split_documents(markdown)[1]
    parsed: list[DocumentItem] = []
    for block in blocks:
        parsed.append(parse_document_block(block))
    return parsed
|
||||
|
||||
|
||||
def flatten_questions(docs: list[DocumentItem]) -> list[tuple[DocumentItem, QuestionItem]]:
    """Return every question paired with its owning document.

    Pairs are ordered by document, then section, then question. No index is
    stored in the pairs; a question's position in the list is its index.
    """
    return [
        (document, item)
        for document in docs
        for _section_name, section_questions in document.sections
        for item in section_questions
    ]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RAG API Clients
|
||||
# =============================================================================
|
||||
|
||||
def call_langchain(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send ``query`` to the LangChain RAG endpoint and return its answer text.

    Args:
        query: Question to ask.
        timeout: Request timeout in seconds.

    Returns:
        The stripped ``response`` field of the JSON reply, ``""`` when the
        field is missing or null, or ``"ERROR: <details>"`` when the request
        or decoding fails. This function does not raise.
    """
    payload = {"query": query}
    try:
        r = requests.post(LANGCHAIN_URL, json=payload, timeout=timeout)
        r.raise_for_status()
        answer = r.json().get("response")
        # A JSON null must not turn into the literal string "None".
        return "" if answer is None else str(answer).strip()
    except Exception as e:
        # Deliberate best effort: the failure is reported as the answer text.
        return f"ERROR: {e}"
|
||||
|
||||
|
||||
def call_llamaindex(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send ``query`` to the LlamaIndex RAG endpoint and return its answer text.

    Args:
        query: Question to ask.
        timeout: Request timeout in seconds.

    Returns:
        The stripped ``response`` field of the JSON reply, ``""`` when the
        field is missing or null, or ``"ERROR: <details>"`` when the request
        or decoding fails. This function does not raise.
    """
    payload = {"query": query}
    try:
        r = requests.post(LLAMAINDEX_URL, json=payload, timeout=timeout)
        r.raise_for_status()
        answer = r.json().get("response")
        # A JSON null must not turn into the literal string "None".
        return "" if answer is None else str(answer).strip()
    except Exception as e:
        # Deliberate best effort: the failure is reported as the answer text.
        return f"ERROR: {e}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Yandex Disk Integration
|
||||
# =============================================================================
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str, timeout: int = YADISK_TIMEOUT) -> None:
    """Download a file from Yandex Disk to a local path.

    Args:
        remote_path: Path of the file on Yandex Disk.
        token: OAuth token sent in the ``Authorization`` header.
        local_path: Destination file; it is created or overwritten.
        timeout: Timeout in seconds for the link request; the file download
            itself uses twice this value.

    Raises:
        requests.HTTPError: If either request returns an error status.
        KeyError: If the link response carries no ``href`` field.
    """
    headers = {"Authorization": f"OAuth {token}"}

    # Get download URL
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=timeout,
    )
    response.raise_for_status()
    href = response.json()["href"]

    # Download the file, streaming it to disk in chunks so large files are
    # never held fully in memory.
    with requests.get(href, timeout=timeout * 2, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a local file, choosing the reader by extension.

    Text-like formats are read as UTF-8 with undecodable bytes ignored.
    Word, PDF and Excel files are read with python-docx, PyPDF2 and openpyxl
    respectively, each imported lazily. For those formats a missing library
    or a read failure is reported as a bracketed message in the return value
    rather than raised. Any other extension is read as text.

    Args:
        file_path: Path to the local file.

    Returns:
        The extracted text, or a bracketed diagnostic message.
    """
    ext = Path(file_path).suffix.lower()

    # Text-based formats
    if ext in [".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"]:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    # Binary office formats: use the matching library when it is installed.
    elif ext in [".docx", ".doc"]:
        # NOTE(review): python-docx targets .docx; legacy .doc files will most
        # likely end up in the read-error branch -- confirm if .doc matters.
        try:
            from docx import Document
            doc = Document(file_path)
            return "\n".join([p.text for p in doc.paragraphs])
        except ImportError:
            return f"[DOCX file: {file_path}] - python-docx not installed"
        except Exception as e:
            return f"[DOCX read error: {e}]"

    elif ext == ".pdf":
        try:
            import PyPDF2
            text_parts = []
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    text_parts.append(page.extract_text() or "")
            return "\n".join(text_parts)
        except ImportError:
            return f"[PDF file: {file_path}] - PyPDF2 not installed"
        except Exception as e:
            return f"[PDF read error: {e}]"

    elif ext in [".xlsx", ".xls"]:
        try:
            from openpyxl import load_workbook
            wb = load_workbook(file_path, read_only=True)
            try:
                texts = []
                for sheet in wb.worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        texts.append("\t".join(str(c) if c is not None else "" for c in row))
            finally:
                # Read-only workbooks keep the file open until closed
                # explicitly; release it before the caller deletes the file.
                wb.close()
            return "\n".join(texts)
        except ImportError:
            return f"[XLSX file: {file_path}] - openpyxl not installed"
        except Exception as e:
            return f"[XLSX read error: {e}]"

    else:
        # Try to read as text
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        except Exception:
            return f"[Binary file: {file_path}]"
|
||||
|
||||
|
||||
def fetch_document_content(remote_path: str, token: str) -> str:
    """Return the text of a Yandex Disk file, or a bracketed message on failure.

    Without a token a fixed message is returned and nothing is downloaded.
    Otherwise the file is downloaded to a temporary file, its text is
    extracted, and the temporary file is removed. Download or extraction
    errors are reported in the return value instead of being raised.
    """
    if not token:
        return "[Yandex Disk token not provided]"

    # Drop a leading "disk:/" scheme from the path, if there is one.
    # NOTE(review): this also removes the leading slash -- confirm that the
    # download API accepts paths in this form.
    if remote_path.startswith("disk:/"):
        clean_path = remote_path[6:]
    else:
        clean_path = remote_path

    extension = Path(clean_path).suffix
    # delete=False: only the reserved name is needed; cleanup happens below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
        temp_path = handle.name

    try:
        download_yadisk_file(clean_path, token, temp_path)
        extracted = extract_text_from_file(temp_path)
    except Exception as e:
        return f"[Yandex Disk download error: {e}]"
    else:
        return extracted
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OpenAI-compatible LLM Evaluator
|
||||
# =============================================================================
|
||||
|
||||
def create_evaluation_prompt(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    max_document_chars: int = 8000,
) -> str:
    """Create the judge prompt used to score one RAG answer.

    Args:
        question: The user question that was asked.
        rag_response: The answer produced by the RAG system.
        document_content: Text of the source document; only its first
            ``max_document_chars`` characters go into the prompt.
        section_type: Section title selecting the evaluation criteria;
            unknown titles fall back to generic criteria.
        max_document_chars: Maximum number of document characters embedded.

    Returns:
        The complete prompt text, asking for a JSON-only reply.
    """

    # Section-specific evaluation criteria
    section_criteria = {
        "Entity/Fact Recall (Response Relevance)": """
Критерии оценки:
- Насколько точно ответ извлекает факты и сущности из документа
- Соответствует ли ответ на вопрос о ключевых участниках и их ролях
- Полнота извлечения фактов из контекста
""",
        "Numerical & Temporal Precision": """
Критерии оценки:
- Точность извлечения дат, лет, числовых значений
- Соответствие чисел в ответе числам в документе
- Правильность временных привязок событий
""",
        "Context Precision (Evidence-anchored)": """
Критерии оценки:
- Насколько хорошо ответ идентифицирует релевантные фрагменты
- Умение отличать релевантные фрагменты от нерелевантных
- Обоснованность выбора контекста
""",
        "Faithfulness / Non-hallucination": """
Критерии оценки:
- Отсутствие выдуманной информации
- Ответ основан только на предоставленном контексте
- Корректное указание на отсутствие информации, если её нет
""",
        "Reasoning & Synthesis": """
Критерии оценки:
- Качество синтеза информации из нескольких фрагментов
- Логичность выводов
- Указание на ограничения, риски или условия
""",
    }

    criteria = section_criteria.get(section_type, """
Критерии оценки:
- Релевантность ответа вопросу
- Точность фактов
- Отсутствие галлюцинаций
- Полнота ответа
""")

    # Limit the document length to keep the prompt within the model context.
    # The truncation is done here rather than inside the f-string so that no
    # code comment can end up in the prompt text.
    document_excerpt = (document_content or "")[:max_document_chars]

    prompt = f"""Ты — эксперт по оценке качества RAG-систем (Retrieval-Augmented Generation).

Твоя задача: оценить качество ответа RAG-системы на вопрос пользователя, сравнив его с содержимым исходного документа.

## Вопрос пользователя:
{question}

## Ответ RAG-системы:
{rag_response}

## Содержимое исходного документа:
{document_excerpt}

{criteria}

## Формат ответа:
Верни ответ ТОЛЬКО в формате JSON:
{{
"score": <число от 0.0 до 1.0>,
"rationale": "<краткое обоснование оценки на русском языке>",
"strengths": ["<сильные стороны>"],
"weaknesses": ["<слабые стороны>"],
"hallucination_detected": <true/false>,
"missing_info": ["<отсутствующая важная информация>"]
}}

Оценка:
- 1.0: Идеальный ответ, полностью точный и полный
- 0.8-0.9: Очень хороший ответ с незначительными неточностями
- 0.6-0.7: Хороший ответ, но есть некоторые проблемы
- 0.4-0.5: Удовлетворительный ответ с существенными проблемами
- 0.2-0.3: Плохой ответ, много ошибок или неполный
- 0.0-0.1: Ответ неверный или содержит галлюцинации
"""
    return prompt
|
||||
|
||||
|
||||
def _extract_json_object(content: str) -> dict[str, Any] | None:
    """Return the first JSON object in ``content``, preferring one with a ``score`` key."""
    decoder = json.JSONDecoder()
    first_object = None
    start = content.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            if "score" in candidate:
                return candidate
            if first_object is None:
                first_object = candidate
        start = content.find("{", start + 1)
    return first_object


def evaluate_with_llm(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    model: str = OPENAI_CHAT_MODEL,
    api_url: str = OPENAI_CHAT_URL,
    api_key: str = OPENAI_CHAT_KEY
) -> dict[str, Any]:
    """Evaluate a RAG response using the OpenAI-compatible LLM.

    Args:
        question: The user question.
        rag_response: The answer to be judged.
        document_content: Text of the source document.
        section_type: Section title selecting the evaluation criteria.
        model: Chat model name sent to the API.
        api_url: Base URL of the API; ``/chat/completions`` is appended.
        api_key: Bearer token for the API.

    Returns:
        A dict that always has ``score`` and ``rationale``. On success it
        also has ``strengths``, ``weaknesses``, ``hallucination_detected``
        and ``missing_info``; on failure it has ``error`` instead. The score
        is 0.0 for a missing key or a request failure and 0.5 when the reply
        is empty or cannot be parsed. Successful scores are clamped to
        the range 0.0-1.0. This function does not raise for API or parsing
        problems.
    """

    if not api_key:
        return {
            "score": 0.0,
            "rationale": "API key not provided",
            "error": "Missing API key"
        }

    prompt = create_evaluation_prompt(question, rag_response, document_content, section_type)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Ты — эксперт по оценке качества RAG-систем. Отвечай ТОЛЬКО в формате JSON."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(
            # Tolerate a base URL configured with a trailing slash.
            api_url.rstrip("/") + "/chat/completions",
            headers=headers,
            json=payload,
            timeout=LLM_TIMEOUT
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException.
        return {
            "score": 0.0,
            "rationale": f"LLM API error: {e}",
            "error": str(e)
        }

    # Safely extract content; any unexpected response shape counts as empty.
    try:
        content = result["choices"][0]["message"]["content"]
    except (AttributeError, IndexError, KeyError, TypeError):
        content = None

    if not content or not isinstance(content, str):
        return {
            "score": 0.5,
            "rationale": "LLM returned empty or malformed response",
            "error": "Empty content in LLM response"
        }

    # Parse JSON response. The decoder scans for a complete JSON object, so
    # braces inside string values or text around the object do not break it.
    evaluation = _extract_json_object(content)
    if evaluation is None:
        return {
            "score": 0.5,
            "rationale": f"Failed to parse LLM response: {content[:200]}",
            "error": "No JSON object found in LLM response"
        }

    try:
        score = float(evaluation.get("score", 0.0))
        if score != score:  # NaN is not a usable score
            raise ValueError("score is NaN")
    except (TypeError, ValueError) as e:
        return {
            "score": 0.5,
            "rationale": f"Failed to parse LLM response: {content[:200]}",
            "error": str(e)
        }

    return {
        # Keep the score inside the range the prompt asks for.
        "score": min(1.0, max(0.0, score)),
        "rationale": evaluation.get("rationale", "") or "",
        "strengths": evaluation.get("strengths", []),
        "weaknesses": evaluation.get("weaknesses", []),
        "hallucination_detected": evaluation.get("hallucination_detected", False),
        "missing_info": evaluation.get("missing_info", [])
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Results Output
|
||||
# =============================================================================
|
||||
|
||||
def truncate_text(text: str, max_len: int = 1500) -> str:
    """Strip ``text`` and cut it to ``max_len`` characters for display.

    When a cut is made, ``"... [truncated]"`` is appended. ``None`` or empty
    input yields ``""``.
    """
    cleaned = text.strip() if text else ""
    marker = "... [truncated]" if len(cleaned) > max_len else ""
    return cleaned[:max_len] + marker
|
||||
|
||||
|
||||
def format_question_result(q: QuestionItem, doc_path: str) -> str:
    """Render one question's answers, scores and verdict as a markdown fragment.

    Both answers are truncated for display and placed in fenced code blocks.
    ``doc_path`` is accepted but does not appear in the rendered output.
    """

    def fenced(answer: str) -> list[str]:
        # A fenced block holding the truncated answer, plus a blank line.
        return ["```", truncate_text(answer), "```", ""]

    rendered = [
        f"#### Вопрос #{q.question_number}",
        f"**Вопрос:** {q.question}",
        "",
        f"**Секция:** {q.section}",
        "",
    ]
    rendered += ["**Ответ LangChain:**", *fenced(q.langchain_answer)]
    rendered += ["**Ответ LlamaIndex:**", *fenced(q.llamaindex_answer)]
    rendered += [
        "**Результаты оценки:**",
        f"- LangChain Score: {q.langchain_score:.2f}",
        f"- LlamaIndex Score: {q.llamaindex_score:.2f}",
        f"- Победитель: **{q.winner}**",
        "",
        f"**Обоснование:** {q.rationale}",
        "",
        "---",
        "",
    ]
    return "\n".join(rendered)
|
||||
|
||||
|
||||
# Renders one document for the results markdown: the document header, the
# file path in backticks and then either the per-section question results
# (via format_question_result) or a "not processed yet" placeholder.
# NOTE(review): leading indentation is not visible in this copy of the file,
# so it cannot be determined here whether the final lines.append("") belongs
# to the else branch only or runs for both branches - confirm against the
# real source before restructuring this function.
def format_document_results(doc: DocumentItem, with_results: bool = True) -> str:
|
||||
"""Format document results for the output markdown."""
|
||||
lines = [
|
||||
doc.header,
|
||||
"",
|
||||
f"**Путь к файлу:** `{doc.path}`",
|
||||
""
|
||||
]
|
||||
|
||||
# with_results=True: one "### <section>" heading per section, followed by
# the formatted result of every question in that section.
if with_results:
|
||||
for section_name, questions in doc.sections:
|
||||
lines.append(f"### {section_name}")
|
||||
lines.append("")
|
||||
for q in questions:
|
||||
lines.append(format_question_result(q, doc.path))
|
||||
# with_results=False: only a placeholder line is emitted for the document.
else:
|
||||
lines.append("_Результаты ещё не обработаны._")
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_summary(all_questions: list[QuestionItem], batch_info: dict) -> str:
    """Build the markdown summary section for a set of evaluated questions.

    Args:
        all_questions: Evaluated questions; each provides ``winner``
            (``"LangChain"``, ``"LlamaIndex"`` or ``"Tie"``),
            ``langchain_score`` and ``llamaindex_score``.
        batch_info: Optional ``"from"`` / ``"to"`` question numbers; when a
            key is missing, ``1`` and ``len(all_questions)`` are shown.

    Returns:
        Markdown text with the win counts, the average scores (three
        decimals) and the overall ranking. A system is ranked first only when
        its average exceeds the other one by more than 0.05; otherwise the
        ranking is "Ничья".
    """
    total = len(all_questions)
    divisor = max(1, total)  # an empty list yields 0.000 averages, not an error

    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    for item in all_questions:
        wins[item.winner] += 1

    avg_lc = sum(item.langchain_score for item in all_questions) / divisor
    avg_li = sum(item.llamaindex_score for item in all_questions) / divisor

    ranking = (
        "LangChain" if avg_lc > avg_li + 0.05
        else "LlamaIndex" if avg_li > avg_lc + 0.05
        else "Ничья"
    )

    first_q = batch_info.get("from", 1)
    last_q = batch_info.get("to", total)

    return "\n".join((
        "## Итоговая сводка",
        "",
        f"- Всего вопросов оценено: {total}",
        f"- Диапазон вопросов: {first_q} - {last_q}",
        "",
        "### Победители по вопросам:",
        f"- LangChain: {wins['LangChain']}",
        f"- LlamaIndex: {wins['LlamaIndex']}",
        f"- Ничья: {wins['Tie']}",
        "",
        "### Средние оценки:",
        f"- LangChain: {avg_lc:.3f}",
        f"- LlamaIndex: {avg_li:.3f}",
        "",
        f"### Итоговый рейтинг: **{ranking}**",
        "",
        "_Методика оценки: LLM-оценка на основе сравнения с содержимым документов из Yandex Disk._",
        "",
        "---",
        "",
    ))
|
||||
|
||||
|
||||
def write_results(
    header_lines: list[str],
    docs: list[DocumentItem],
    all_questions: list[QuestionItem],
    batch_info: dict,
    output_path: Path
) -> None:
    """Write the evaluation report to *output_path* as UTF-8 markdown.

    The report consists of the original header lines, a title, the generation
    timestamp, the summary produced by ``format_summary`` and one section per
    document produced by ``format_document_results``.

    Args:
        header_lines: Lines copied verbatim to the top of the report.
        docs: Every parsed document, in output order.
        all_questions: The questions that were actually evaluated.
        batch_info: Passed through to ``format_summary``.
        output_path: Destination file; it is overwritten.
    """
    from datetime import datetime  # local import keeps this block self-contained

    # The report is stamped with the moment it is written. The previous
    # implementation printed the raw modification time of the *old* output
    # file, or "N/A" on the first run.
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    output_parts: list[str] = [
        *header_lines,
        "",
        "# Результаты оценки RAG-систем",
        "",
        f"Дата генерации: {generated_at}",
        "",
        format_summary(all_questions, batch_info),
    ]

    # Identity-based lookup: only documents that own at least one evaluated
    # question get detailed results; the others get the "not processed yet"
    # placeholder instead of default scores and empty answers.
    evaluated_ids = {id(q) for q in all_questions}
    for doc in docs:
        has_results = any(
            id(q) in evaluated_ids
            for _, questions in doc.sections
            for q in questions
        )
        output_parts.append(format_document_results(doc, with_results=has_results))

    output_path.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Evaluation Loop
|
||||
# =============================================================================
|
||||
|
||||
def _ask_rag(label: str, ask, question: str, timeout: int) -> str:
    """Query one RAG backend, print its progress line and return the answer.

    A failed HTTP request is reported on the console and returned as an
    ``"ERROR: ..."`` string, so one unreachable backend does not abort the
    whole run and discard the results collected so far.
    """
    import time  # local import: this file's import header is outside this block

    print(f" -> {label}...", end=" ", flush=True)
    started = time.time()
    try:
        answer = ask(question, timeout=timeout)
    except requests.RequestException as exc:
        print(f"ERROR ({exc})")
        return f"ERROR: {exc}"
    print(f"OK ({time.time() - started:.1f}s)")
    return answer


def run_evaluation(
    from_q: int,
    to_q: int,
    timeout_rag: int = RAG_TIMEOUT,
    timeout_llm: int = LLM_TIMEOUT
) -> None:
    """Run the evaluation for the specified question range.

    For every question whose 1-based position lies in ``from_q..to_q`` (after
    clamping to the available questions) both RAG systems are queried, the
    source document is fetched, each answer is scored by ``evaluate_with_llm``
    and a winner is chosen. The report is then written to ``OUTPUT_MD``.

    Args:
        from_q: First question number (values below 1 are raised to 1).
        to_q: Last question number (capped at the number of questions).
        timeout_rag: Timeout passed to both RAG calls.
        timeout_llm: Accepted for the CLI.
            NOTE(review): it is not forwarded anywhere in this function;
            confirm whether ``evaluate_with_llm`` should receive it.
    """
    print(f"Загрузка документов из {INPUT_MD}...")
    docs = parse_all_documents(INPUT_MD)
    all_flat = flatten_questions(docs)

    total_questions = len(all_flat)
    print(f"Всего вопросов найдено: {total_questions}")

    # Clamp the requested range to the questions that actually exist.
    from_q = max(1, from_q)
    to_q = min(total_questions, to_q)

    if from_q > to_q:
        print(f"Ошибка: диапазон {from_q}:{to_q} некорректен")
        return

    print(f"Оценка вопросов с {from_q} по {to_q}...")

    # Keep the original header so it can be copied into the report.
    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, _ = split_documents(raw)

    # Collected directly while iterating, so the report never depends on the
    # question_number values of questions that were not processed.
    evaluated_questions: list[QuestionItem] = []

    for q_index, (doc, q) in enumerate(all_flat, start=1):
        if q_index < from_q:
            continue
        if q_index > to_q:
            break

        q.question_number = q_index
        evaluated_questions.append(q)
        print(f"\n[{q_index}/{total_questions}] {q.question[:80]}...")

        # Call both RAG systems.
        q.langchain_answer = _ask_rag("LangChain", call_langchain, q.question, timeout_rag)
        q.llamaindex_answer = _ask_rag("LlamaIndex", call_llamaindex, q.question, timeout_rag)

        # Download the reference document content from Yandex Disk.
        print(f" -> Загрузка документа из Yandex Disk...", end=" ", flush=True)
        if doc.path:
            doc_content = fetch_document_content(doc.path, YADISK_TOKEN)
            print(f"OK ({len(doc_content)} символов)")
        else:
            doc_content = "[Путь к документу не найден]"
            print("SKIP (нет пути)")

        # Score the LangChain answer.
        print(f" -> Оценка LangChain...", end=" ", flush=True)
        lc_eval = evaluate_with_llm(
            q.question, q.langchain_answer, doc_content, q.section
        )
        q.langchain_score = lc_eval.get("score", 0.0)
        lc_rationale = lc_eval.get("rationale", "")
        print(f"Score: {q.langchain_score:.2f}")

        # Score the LlamaIndex answer.
        print(f" -> Оценка LlamaIndex...", end=" ", flush=True)
        li_eval = evaluate_with_llm(
            q.question, q.llamaindex_answer, doc_content, q.section
        )
        q.llamaindex_score = li_eval.get("score", 0.0)
        li_rationale = li_eval.get("rationale", "")
        print(f"Score: {q.llamaindex_score:.2f}")

        # Scores closer than 0.05 count as a tie.
        score_diff = abs(q.langchain_score - q.llamaindex_score)
        if score_diff < 0.05:
            q.winner = "Tie"
        elif q.langchain_score > q.llamaindex_score:
            q.winner = "LangChain"
        else:
            q.winner = "LlamaIndex"

        q.rationale = f"LC: {lc_rationale} | LI: {li_rationale}"

    print(f"\nЗапись результатов в {OUTPUT_MD}...")
    batch_info = {"from": from_q, "to": to_q}

    write_results(header_lines, docs, evaluated_questions, batch_info, OUTPUT_MD)
    print(f"Готово! Результаты сохранены в {OUTPUT_MD}")
|
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line and run the evaluation.

    The positional ``range`` argument must be exactly ``from:to`` with two
    non-negative integers and ``from <= to``.

    Returns:
        ``0`` after the evaluation was started, ``1`` when the range argument
        is malformed or reversed.
    """
    parser = argparse.ArgumentParser(
        description="Оценка RAG-систем с использованием LLM и Yandex Disk"
    )
    parser.add_argument(
        "range",
        type=str,
        help="Диапазон вопросов для оценки в формате 'from:to' (например, 1:10)"
    )
    parser.add_argument(
        "--timeout-rag",
        type=int,
        default=RAG_TIMEOUT,
        help=f"Таймаут для RAG API (по умолчанию {RAG_TIMEOUT}s)"
    )
    parser.add_argument(
        "--timeout-llm",
        type=int,
        default=LLM_TIMEOUT,
        help=f"Таймаут для LLM API (по умолчанию {LLM_TIMEOUT}s)"
    )

    args = parser.parse_args()

    # fullmatch rejects trailing garbage such as "1:10abc", which the
    # previous prefix-only re.match silently accepted.
    range_match = re.fullmatch(r"(\d+):(\d+)", args.range.strip())
    if not range_match:
        print("Ошибка: диапазон должен быть в формате 'from:to' (например, 1:10)")
        return 1

    from_q, to_q = (int(part) for part in range_match.groups())

    if from_q > to_q:
        print("Ошибка: 'from' должно быть меньше или равно 'to'")
        return 1

    # Missing credentials only produce warnings; the run still proceeds.
    if not OPENAI_CHAT_KEY:
        print("Предупреждение: OPENAI_CHAT_KEY не установлен. Оценка LLM будет пропущена.")
    if not YADISK_TOKEN:
        print("Предупреждение: YADISK_TOKEN не установлен. Загрузка документов будет пропущена.")

    run_evaluation(from_q, to_q, args.timeout_rag, args.timeout_llm)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
353
run_rag_batch_eval.py
Normal file
353
run_rag_batch_eval.py
Normal file
@@ -0,0 +1,353 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
# Local endpoints of the two RAG services under comparison.
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Input and output live next to this script. Resolving them from __file__
# instead of a developer-specific absolute path keeps the script runnable from
# any checkout location.
_SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_MD = _SCRIPT_DIR / "DOCUMENTS_TO_TEST.md"
OUTPUT_MD = _SCRIPT_DIR / "RAG_EVALUATION.md"
|
||||
|
||||
|
||||
# Russian function words and question boilerplate that tokenize() drops
# before question/answer token overlap is computed.
STOPWORDS_RU = set(
    "что кто как какой какая какие ли в на по и или для из с о об а не к "
    "до от это есть если какому каком году материалах базы найди".split()
)
|
||||
|
||||
|
||||
@dataclass
class QuestionItem:
    """One test question together with both RAG answers and their scoring."""

    # Where the question comes from.
    section: str
    question: str

    # Raw answers returned by the two systems (empty until queried).
    langchain_answer: str = ""
    llamaindex_answer: str = ""

    # Heuristic scores assigned to the answers.
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0

    # "LangChain", "LlamaIndex" or "Tie", plus the text explaining the verdict.
    winner: str = "Tie"
    rationale: str = ""
|
||||
|
||||
|
||||
@dataclass
class DocumentItem:
    """A document entry from the input markdown and the questions asked about it."""

    # The "## ..." heading line of the document block.
    header: str
    # Path taken from the first backtick-quoted span of the header ("" if none).
    path: str
    # (section name, questions) pairs in file order; a fresh list per instance.
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)
|
||||
|
||||
|
||||
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split the input markdown into a preamble and per-document blocks.

    A new block starts at every line beginning with ``"## "``; the following
    lines belong to it until the next such line.

    Returns:
        ``(header, blocks)``: *header* holds the lines that precede the first
        ``"## "`` line, and each element of *blocks* is one document block
        re-joined with newlines.
    """
    preamble: list[str] = []
    blocks: list[list[str]] = []

    for row in md_text.splitlines():
        if row.startswith("## "):
            blocks.append([row])
        elif blocks:
            blocks[-1].append(row)
        else:
            preamble.append(row)

    return preamble, ["\n".join(block) for block in blocks]
|
||||
|
||||
|
||||
def parse_document_block(block: str) -> DocumentItem:
    """Parse one document block into a ``DocumentItem``.

    The first line is the header; the document path is the first
    backtick-quoted span found in it (``""`` when there is none). Lines
    starting with ``"### "`` open a section and lines starting with ``"- "``
    add a question to the current section.

    Bullets that appear before the first section heading are not kept,
    because a section is stored only when its name is non-empty.

    Args:
        block: Text of one document block; an empty string yields an empty
            ``DocumentItem`` instead of raising ``IndexError``.
    """
    lines = block.splitlines()
    if not lines:
        return DocumentItem(header="", path="", sections=[])

    header = lines[0].strip()
    m = re.search(r"`([^`]+)`", header)
    doc_path = m.group(1) if m else ""

    sections: list[tuple[str, list[QuestionItem]]] = []
    current_section = ""
    current_questions: list[QuestionItem] = []

    for line in lines[1:]:
        if line.startswith("### "):
            # Close the section collected so far before opening the next one.
            if current_section:
                sections.append((current_section, current_questions))
            current_section = line[4:].strip()
            current_questions = []
        elif line.startswith("- "):
            q = line[2:].strip()
            if q:
                current_questions.append(
                    QuestionItem(section=current_section, question=q)
                )

    if current_section:
        sections.append((current_section, current_questions))

    return DocumentItem(header=header, path=doc_path, sections=sections)
|
||||
|
||||
|
||||
# Latin letters, Cyrillic letters, digits and underscore. "Ё"/"ё" are listed
# explicitly because they lie outside the А-я code point range; without them
# a word such as "самолёт" was split into "самол" and "т".
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9_]+")


def tokenize(text: str) -> list[str]:
    """Return the lower-cased content words of *text*.

    Tokens are maximal runs of letters, digits and underscores. Tokens of one
    or two characters and tokens listed in ``STOPWORDS_RU`` are dropped.
    Order and duplicates are preserved.
    """
    return [
        token
        for token in _TOKEN_RE.findall(text.lower())
        if len(token) > 2 and token not in STOPWORDS_RU
    ]
|
||||
|
||||
|
||||
def score_answer(question: str, answer: str) -> tuple[float, dict[str, float]]:
    """Score an answer with a content-agnostic heuristic rubric.

    Components, each in ``[0, 1]``:

    * ``overlap``: share of question tokens that also occur in the answer.
    * ``len``: answer length relative to 500 characters, scaled by 0.85 for
      answers longer than 2800 characters.
    * ``specificity``: numbers (0.08 each) and capitalised Cyrillic word
      groups (0.05 each), capped at 1.
    * ``structure``: 0.5 for a bullet/numbered list plus up to 0.5 for
      sentence punctuation.

    The total is ``0.38*overlap + 0.26*len + 0.20*specificity +
    0.16*structure``, minus 0.6 when the answer mentions an error, a failure
    or an exception, clamped to ``[0, 1]``.

    Returns:
        ``(total, parts)`` where *parts* has the keys ``"len"``,
        ``"overlap"``, ``"specificity"`` and ``"structure"``. An empty or
        whitespace-only answer scores ``0.0`` with all parts ``0.0``.
    """
    answer = (answer or "").strip()
    if not answer:
        return 0.0, {"len": 0.0, "overlap": 0.0, "specificity": 0.0, "structure": 0.0}

    q_tokens = set(tokenize(question))
    a_token_set = set(tokenize(answer))
    # max(1, ...) also covers a question without usable tokens (overlap 0.0).
    overlap = len(q_tokens & a_token_set) / max(1, len(q_tokens))

    length_score = min(1.0, len(answer) / 500.0)
    if len(answer) > 2800:
        length_score *= 0.85

    numbers = len(re.findall(r"\b\d+(?:[.,]\d+)?\b", answer))
    cyr_names = len(re.findall(r"[А-ЯЁ][а-яё]{2,}(?:\s+[А-ЯЁ][а-яё]{2,}){0,2}", answer))
    specificity = min(1.0, (numbers * 0.08) + (cyr_names * 0.05))

    bullet_like = 1.0 if re.search(r"(^|\n)\s*(?:\d+\.|-)\s+", answer) else 0.0
    sentence_count = len(re.findall(r"[.!?]", answer))
    structure = min(1.0, bullet_like * 0.5 + min(0.5, sentence_count / 6.0))

    # "ошибк" and "исключени" are word stems, so they need \w* before the
    # closing word boundary. With a bare \b the inflected forms that actually
    # occur ("ошибка", "исключение", ...) never matched.
    refusal_penalty = 0.0
    if re.search(
        r"\b(?:ошибк\w*|error|не удалось|failed|исключени\w*|exception)\b",
        answer.lower(),
    ):
        refusal_penalty = 0.6

    total = (
        (0.38 * overlap)
        + (0.26 * length_score)
        + (0.20 * specificity)
        + (0.16 * structure)
        - refusal_penalty
    )
    total = max(0.0, min(1.0, total))
    return total, {
        "len": length_score,
        "overlap": overlap,
        "specificity": specificity,
        "structure": structure,
    }
|
||||
|
||||
|
||||
def compare_answers(
    question: str, lc_answer: str, li_answer: str
) -> tuple[str, float, float, str]:
    """Score both answers with ``score_answer`` and pick a winner.

    Returns:
        ``(winner, lc_score, li_score, rationale)``. *winner* is ``"Tie"``
        when the scores differ by less than 0.04, otherwise the name of the
        higher-scoring system. *rationale* lists the rubric components of
        both answers with two decimals.
    """

    def describe(parts: dict[str, float]) -> str:
        # Rubric components in the fixed order used by the report.
        return (
            f"overlap={parts['overlap']:.2f}, len={parts['len']:.2f}, "
            f"spec={parts['specificity']:.2f}, struct={parts['structure']:.2f}"
        )

    lc_score, lc_parts = score_answer(question, lc_answer)
    li_score, li_parts = score_answer(question, li_answer)

    gap = lc_score - li_score
    if abs(gap) < 0.04:
        winner = "Tie"
    else:
        winner = "LangChain" if gap > 0 else "LlamaIndex"

    rationale = f"LC({describe(lc_parts)}) vs LI({describe(li_parts)})"
    return winner, lc_score, li_score, rationale
|
||||
|
||||
|
||||
def call_langchain(query: str, timeout: int) -> str:
    """POST *query* to the LangChain endpoint and return its answer text.

    Args:
        query: Question sent as ``{"query": ...}``.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        The stripped ``"response"`` field of the JSON reply, or ``""`` when
        the field is missing or ``null``.

    Raises:
        requests.HTTPError: For a non-success HTTP status.
    """
    reply = requests.post(LANGCHAIN_URL, json={"query": query}, timeout=timeout)
    reply.raise_for_status()
    answer = reply.json().get("response")
    # A JSON null must not turn into the literal text "None", which would
    # then be scored like a real (four-character) answer.
    return "" if answer is None else str(answer).strip()
|
||||
|
||||
|
||||
def call_llamaindex(query: str, timeout: int) -> str:
    """POST *query* to the LlamaIndex endpoint and return its answer text.

    Args:
        query: Question sent as ``{"query": ..., "mode": "agent"}``.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        The stripped ``"response"`` field of the JSON reply, or ``""`` when
        the field is missing or ``null``.

    Raises:
        requests.HTTPError: For a non-success HTTP status.
    """
    reply = requests.post(
        LLAMAINDEX_URL, json={"query": query, "mode": "agent"}, timeout=timeout
    )
    reply.raise_for_status()
    answer = reply.json().get("response")
    # A JSON null must not turn into the literal text "None", which would
    # then be scored like a real (four-character) answer.
    return "" if answer is None else str(answer).strip()
|
||||
|
||||
|
||||
def truncate(text: str, max_len: int = 1400) -> str:
    """Return *text* stripped and limited to *max_len* characters.

    Falsy input counts as an empty string. Text longer than *max_len* is cut
    to its first *max_len* characters and suffixed with ``"... [truncated]"``.
    """
    body = (text or "").strip()
    if len(body) > max_len:
        return body[:max_len] + "... [truncated]"
    return body
|
||||
|
||||
|
||||
def format_batch_summary(
    batch_docs: list[DocumentItem],
    batch_idx: int,
    docs_in_batch: int,
) -> str:
    """Build the markdown summary for one processed batch.

    Args:
        batch_docs: Documents of the batch; every question in their sections
            is counted.
        batch_idx: Batch number shown in the heading.
        docs_in_batch: Document count shown in the summary.

    Returns:
        Markdown with the win counts, the average scores (three decimals) and
        the batch ranking. A system is ranked first only when its average
        exceeds the other one by more than 0.01; otherwise the ranking is
        ``Tie``.
    """
    asked = [q for doc in batch_docs for _, qs in doc.sections for q in qs]

    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    for q in asked:
        wins[q.winner] += 1

    divisor = max(1, len(asked))  # an empty batch reports 0.000 averages
    avg_lc = sum(q.langchain_score for q in asked) / divisor
    avg_li = sum(q.llamaindex_score for q in asked) / divisor

    if avg_lc > avg_li + 0.01:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.01:
        ranking = "LlamaIndex"
    else:
        ranking = "Tie"

    return "\n".join((
        f"## Batch {batch_idx} Summary",
        "",
        f"- Documents processed in this batch: {docs_in_batch}",
        f"- Questions processed in this batch: {len(asked)}",
        f"- LangChain wins: {wins['LangChain']}",
        f"- LlamaIndex wins: {wins['LlamaIndex']}",
        f"- Ties: {wins['Tie']}",
        f"- Average score LangChain: {avg_lc:.3f}",
        f"- Average score LlamaIndex: {avg_li:.3f}",
        f"- Final ranking for this batch: {ranking}",
        "",
        "_Scoring note: relative heuristic rubric (query overlap, informativeness, specificity, structure), "
        "used only for side-by-side ranking in this batch._",
        "",
    ))
|
||||
|
||||
|
||||
# Renders one document as markdown: the header line, then every section
# heading followed by its questions as "- " bullets. When with_results is
# true, each question is followed by both answers (shortened by truncate),
# a result line with the winner and both scores (three decimals) and the
# rationale. When with_results is false, a "not processed yet" status line is
# emitted instead.
# NOTE(review): the status text always says "Batch 1"; this function receives
# no batch index, so the text is the same for every batch.
# NOTE(review): leading indentation is not visible in this copy of the file,
# so the nesting level of the lines.append("") after the rationale line and
# of the "if not with_results" block cannot be determined here - confirm
# against the real source before restructuring this function.
def render_document_with_results(doc: DocumentItem, with_results: bool) -> str:
|
||||
lines = [doc.header, ""]
|
||||
for section_name, questions in doc.sections:
|
||||
lines.append(f"### {section_name}")
|
||||
for q in questions:
|
||||
lines.append(f"- {q.question}")
|
||||
if with_results:
|
||||
lines.append("")
|
||||
lines.append(" - `LangChain Answer`:")
|
||||
lines.append(f" {truncate(q.langchain_answer)}")
|
||||
lines.append(" - `LlamaIndex Answer`:")
|
||||
lines.append(f" {truncate(q.llamaindex_answer)}")
|
||||
lines.append(
|
||||
f" - `Result`: winner={q.winner}, "
|
||||
f"score_langchain={q.langchain_score:.3f}, score_llamaindex={q.llamaindex_score:.3f}"
|
||||
)
|
||||
lines.append(f" - `Rationale`: {q.rationale}")
|
||||
lines.append("")
|
||||
if not with_results:
|
||||
lines.append("_Batch 1 status: not processed yet._")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _query_backend(label: str, ask, question: str, timeout: int) -> str:
    """Call one RAG backend, log the outcome and return its answer text.

    Any failure is deliberately turned into an ``"ERROR: ..."`` answer instead
    of being raised, so the batch keeps going.
    """
    try:
        started = time.time()
        answer = ask(question, timeout=timeout)
        print(
            f" -> {label} OK in {time.time() - started:.1f}s "
            f"(chars={len(answer)})"
        )
        return answer
    except Exception as e:  # best-effort: one bad request must not stop the batch
        print(f" -> {label} ERROR: {e}")
        return f"ERROR: {e}"


def main() -> int:
    """Evaluate one batch of documents and write the comparison report.

    Options:
        --batch-docs: Number of documents per batch (default 10, minimum 1).
        --batch-index: 1-based batch number (default 1, minimum 1).
        --timeout: Timeout for each RAG request (default 120).

    Every question of the selected documents is sent to both RAG services,
    the answers are compared with ``compare_answers`` and the report for all
    documents is written to ``OUTPUT_MD``.

    Returns:
        ``0`` on completion.

    Raises:
        RuntimeError: When the batch starts beyond the available documents.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-docs", type=int, default=10)
    parser.add_argument("--batch-index", type=int, default=1)
    parser.add_argument("--timeout", type=int, default=120)
    args = parser.parse_args()

    # Zero or negative values would produce an empty or wrapped-around slice
    # below and a report that looks valid but contains no results.
    if args.batch_docs < 1:
        parser.error("--batch-docs must be >= 1")
    if args.batch_index < 1:
        parser.error("--batch-index must be >= 1")

    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, doc_blocks = split_documents(raw)
    docs = [parse_document_block(b) for b in doc_blocks]

    start = (args.batch_index - 1) * args.batch_docs
    end = start + args.batch_docs
    if start >= len(docs):
        raise RuntimeError("Batch start is beyond available documents")

    batch_docs = docs[start:end]
    batch_questions = [
        q for doc in batch_docs for _, questions in doc.sections for q in questions
    ]
    total_questions = len(batch_questions)

    for q_index, q in enumerate(batch_questions, start=1):
        print(f"[{q_index:03d}/{total_questions}] {q.question}")
        q.langchain_answer = _query_backend(
            "LangChain", call_langchain, q.question, args.timeout
        )
        q.llamaindex_answer = _query_backend(
            "LlamaIndex", call_llamaindex, q.question, args.timeout
        )

        winner, lc_score, li_score, rationale = compare_answers(
            q.question, q.langchain_answer, q.llamaindex_answer
        )
        q.winner = winner
        q.langchain_score = lc_score
        q.llamaindex_score = li_score
        q.rationale = rationale

    output_parts: list[str] = []
    output_parts.extend(header_lines)
    output_parts.append("")
    output_parts.append(
        format_batch_summary(batch_docs, args.batch_index, len(batch_docs))
    )

    # Documents outside the batch are still listed, but without results.
    for i, doc in enumerate(docs):
        in_batch = start <= i < end
        output_parts.append(render_document_with_results(doc, with_results=in_batch))

    OUTPUT_MD.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
BIN
services/rag/.DS_Store
vendored
BIN
services/rag/.DS_Store
vendored
Binary file not shown.
3
services/rag/langchain/.gitignore
vendored
3
services/rag/langchain/.gitignore
vendored
@@ -216,3 +216,6 @@ __marimo__/
|
||||
.streamlit/secrets.toml
|
||||
document_tracking.db
|
||||
.env.test
|
||||
|
||||
yadisk_imported_paths.csv
|
||||
yadisk_imported_paths.json
|
||||
|
||||
@@ -125,3 +125,7 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
|
||||
- [x] Make tabbed UI with top level tabs. First tab exists and is selected. Each tab should have copy of demo ui, meaning the chat window with ability to specify the api url
|
||||
- [x] At the end of the tabs there should be button with plus sign, which will add new tab. Tabs to be called by numbers.
|
||||
- [x] There should be 3 predefined tabs opened. The first one should have the predefined API URL "https://rag.langchain.overwatch.su/api/test-query", the second "https://rag.llamaindex.overwatch.su/api/test-query", and the third "https://rag.haystack.overwatch.su/api/test-query".
|
||||
|
||||
# Phase 17 (creating json with list of documents that are supported for import)
|
||||
|
||||
- [x] Make a CLI command that takes a JSON file with a list of paths and filters them down to only those that are imported into the vector storage (the supported types can be checked in enrichment). The filtered list should be saved in the current folder as "yadisk_imported_paths.json" and as "yadisk_imported_paths.csv"; the CSV file should, of course, be formatted as CSV.
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import os
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -126,5 +128,60 @@ def chat(collection_name, model):
|
||||
click.echo(f"Error: {str(e)}")
|
||||
|
||||
|
||||
@cli.command(
    name="export-supported-paths",
    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
)
@click.argument("input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def export_supported_paths(input_json: Path):
    """Export supported document paths into yadisk_imported_paths.json and yadisk_imported_paths.csv.

    The input file must contain a JSON array. Each entry is converted to a
    string and stripped; empty entries, repeated entries and entries whose
    lower-cased file extension is not in ``enrichment.SUPPORTED_EXTENSIONS``
    are dropped. The remaining paths are written, in their original order, to
    both output files in the current working directory; the CSV has a single
    ``path`` column. Any failure is logged and echoed rather than raised.
    """
    logger.info(f"Filtering supported paths from input file: {input_json}")

    try:
        from enrichment import SUPPORTED_EXTENSIONS

        raw_data = json.loads(input_json.read_text(encoding="utf-8"))
        if not isinstance(raw_data, list):
            raise ValueError("Input JSON must contain an array of file paths")

        # A dict keeps first-seen order while discarding repeated paths.
        unique_supported: dict[str, None] = {}
        for entry in raw_data:
            candidate = str(entry).strip()
            if candidate and Path(candidate).suffix.lower() in SUPPORTED_EXTENSIONS:
                unique_supported.setdefault(candidate, None)
        filtered_paths = list(unique_supported)

        target_dir = Path.cwd()
        output_json = target_dir / "yadisk_imported_paths.json"
        output_csv = target_dir / "yadisk_imported_paths.csv"

        with output_json.open("w", encoding="utf-8") as json_handle:
            json.dump(filtered_paths, json_handle, ensure_ascii=False, indent=2)

        with output_csv.open("w", encoding="utf-8", newline="") as csv_handle:
            rows = csv.writer(csv_handle)
            rows.writerow(["path"])
            rows.writerows([supported] for supported in filtered_paths)

        click.echo(
            f"Export complete: {len(filtered_paths)} supported paths saved to {output_json.name} and {output_csv.name}"
        )
        logger.info(
            f"Exported {len(filtered_paths)} supported paths to {output_json} and {output_csv}"
        )
    except Exception as error:
        logger.error(f"Failed to export supported paths: {error}")
        click.echo(f"Error: {error}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
||||
@@ -43,13 +43,18 @@ setup_logging()
|
||||
|
||||
app = FastAPI(title="LlamaIndex RAG API", version="1.0.0")
|
||||
|
||||
origins = [
|
||||
"*",
|
||||
]
|
||||
|
||||
# Add CORS middleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # In production, configure this properly
|
||||
allow_origins=origins, # In production, configure this properly
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
allow_private_network=True,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user