783 lines
26 KiB
Python
783 lines
26 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
RAG Evaluation Script
|
|||
|
|
|
|||
|
|
Evaluates two RAG systems (LangChain and LlamaIndex) using OpenAI-compatible LLM
|
|||
|
|
for scoring, with Yandex Disk integration for document verification.
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
python rag_evaluation.py 1:10 # Evaluate questions 1 to 10
|
|||
|
|
python rag_evaluation.py 5:20 # Evaluate questions 5 to 20
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import tempfile
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any, Optional
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
|
|||
|
|
load_dotenv()
|
|||
|
|
|
|||
|
|
import requests
|
|||
|
|
|
|||
|
|
# =============================================================================
# Configuration
# =============================================================================

# OpenAI-compatible LLM settings (overridable via environment / .env file).
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
OPENAI_CHAT_URL = os.getenv("OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1")
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")  # empty key -> LLM scoring is skipped

# RAG system endpoints under test (both expose the same test-query API).
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Yandex Disk OAuth token; empty token -> document downloads are skipped.
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")

# Input questions file and output report, both next to this script.
INPUT_MD = Path(__file__).parent / "DOCUMENTS_TO_TEST.md"
OUTPUT_MD = Path(__file__).parent / "EVALUATION_RESULT.md"

# Timeouts
RAG_TIMEOUT = 120  # seconds
LLM_TIMEOUT = 60  # seconds
YADISK_TIMEOUT = 60  # seconds
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# Data Classes
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
@dataclass
class QuestionItem:
    """Represents a single question with its evaluation results.

    Created with just ``section`` and ``question`` by the parser; the
    remaining fields are filled in during the evaluation loop.
    """

    section: str  # "### ..." heading the question belongs to
    question: str  # question text (the "- ..." list entry)
    question_number: int = 0  # 1-based global index, assigned during evaluation
    langchain_answer: str = ""  # raw answer from the LangChain RAG system
    llamaindex_answer: str = ""  # raw answer from the LlamaIndex RAG system
    langchain_score: float = 0.0  # LLM judge score, expected in [0.0, 1.0]
    llamaindex_score: float = 0.0  # LLM judge score, expected in [0.0, 1.0]
    winner: str = "Tie"  # "LangChain" | "LlamaIndex" | "Tie"
    rationale: str = ""  # combined judge rationale ("LC: ... | LI: ...")
    error: str = ""  # error description, if any step failed
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class DocumentItem:
    """Represents one "## "-delimited document with its questions."""

    header: str  # full "## ..." heading line, stripped
    path: str  # Yandex Disk path extracted from backticks in the header ("" if absent)
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)  # (section name, questions)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# Document Parser
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split the markdown text into header lines and document blocks.

    Every line before the first "## " heading belongs to the header; each
    "## " heading starts a new document block that runs until the next
    heading or end of file.

    Args:
        md_text: Full contents of the markdown file.

    Returns:
        A pair ``(header_lines, doc_blocks)`` where each entry of
        ``doc_blocks`` is the re-joined text of one "## " section.
    """
    preamble: list[str] = []
    blocks: list[list[str]] = []

    for raw_line in md_text.splitlines():
        if raw_line.startswith("## "):
            # A new document starts here.
            blocks.append([raw_line])
        elif blocks:
            blocks[-1].append(raw_line)
        else:
            # No document seen yet: still in the file header.
            preamble.append(raw_line)

    return preamble, ["\n".join(b) for b in blocks]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_document_block(block: str) -> DocumentItem:
    """Parse one "## "-delimited document block into a DocumentItem.

    The first line is the document header; a path inside backticks is
    extracted from it.  "### " lines open sections and "- " lines append
    questions to the currently open section.
    """
    block_lines = block.splitlines()
    doc_header = block_lines[0].strip()

    # The document path is the first backtick-quoted span in the header.
    path_match = re.search(r"`([^`]+)`", doc_header)
    file_path = path_match.group(1) if path_match else ""

    parsed_sections: list[tuple[str, list[QuestionItem]]] = []
    section_name = ""
    section_questions: list[QuestionItem] = []

    for entry in block_lines[1:]:
        if entry.startswith("### "):
            # Flush the previous section (if any) before opening a new one.
            if section_name:
                parsed_sections.append((section_name, section_questions))
            section_name, section_questions = entry[4:].strip(), []
        elif entry.startswith("- "):
            text = entry[2:].strip()
            if text:
                section_questions.append(
                    QuestionItem(section=section_name, question=text)
                )

    # Flush the trailing section.
    if section_name:
        parsed_sections.append((section_name, section_questions))

    return DocumentItem(header=doc_header, path=file_path, sections=parsed_sections)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_all_documents(md_path: Path) -> list[DocumentItem]:
    """Read *md_path* and parse every "## "-delimited document block."""
    _, blocks = split_documents(md_path.read_text(encoding="utf-8"))
    return [parse_document_block(block) for block in blocks]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def flatten_questions(docs: list[DocumentItem]) -> list[tuple[DocumentItem, QuestionItem]]:
    """Flatten all questions from all documents into one ordered list.

    Returns (document, question) pairs in document order, preserving the
    order of sections and of questions within each section.
    """
    return [
        (doc, question)
        for doc in docs
        for _section_name, section_questions in doc.sections
        for question in section_questions
    ]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# RAG API Clients
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def call_langchain(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Call the LangChain RAG system API.

    Returns the stripped "response" field of the JSON reply, or an
    "ERROR: ..." string if the request fails for any reason.
    """
    try:
        reply = requests.post(LANGCHAIN_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:  # network/JSON failures are reported inline
        return f"ERROR: {exc}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def call_llamaindex(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Call the LlamaIndex RAG system API.

    Returns the stripped "response" field of the JSON reply, or an
    "ERROR: ..." string if the request fails for any reason.
    """
    try:
        reply = requests.post(LLAMAINDEX_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:  # network/JSON failures are reported inline
        return f"ERROR: {exc}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# Yandex Disk Integration
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def download_yadisk_file(remote_path: str, token: str, local_path: str, timeout: int = YADISK_TIMEOUT) -> None:
    """Download a file from Yandex Disk to *local_path*.

    First resolves a one-time download URL via the cloud API, then fetches
    the file body and writes it to disk.  Raises ``requests.HTTPError`` on
    API failures.
    """
    auth = {"Authorization": f"OAuth {token}"}

    # Step 1: ask the API for a temporary download href.
    meta = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=auth,
        params={"path": remote_path},
        timeout=timeout,
    )
    meta.raise_for_status()
    download_url = meta.json()["href"]

    # Step 2: fetch the file itself (double the budget: downloads are slower).
    payload = requests.get(download_url, timeout=timeout * 2)
    payload.raise_for_status()

    with open(local_path, "wb") as out:
        out.write(payload.content)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a downloaded file based on its extension.

    Plain-text formats are read directly.  ``.docx``/``.pdf``/``.xlsx``
    are parsed with optional third-party libraries (python-docx, PyPDF2,
    openpyxl); when the library is missing or the file cannot be read, a
    bracketed placeholder string is returned instead of raising.  Unknown
    extensions are read as text when possible, otherwise reported as
    binary.
    """
    suffix = Path(file_path).suffix.lower()

    # Text-based formats: read directly, ignoring undecodable bytes.
    if suffix in {".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"}:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    # Word documents (requires python-docx).
    if suffix in (".docx", ".doc"):
        try:
            from docx import Document
            parsed = Document(file_path)
            return "\n".join(paragraph.text for paragraph in parsed.paragraphs)
        except ImportError:
            return f"[DOCX file: {file_path}] - python-docx not installed"
        except Exception as exc:
            return f"[DOCX read error: {exc}]"

    # PDF documents (requires PyPDF2).
    if suffix == ".pdf":
        try:
            import PyPDF2
            with open(file_path, "rb") as fh:
                pages = PyPDF2.PdfReader(fh).pages
                extracted = [page.extract_text() or "" for page in pages]
            return "\n".join(extracted)
        except ImportError:
            return f"[PDF file: {file_path}] - PyPDF2 not installed"
        except Exception as exc:
            return f"[PDF read error: {exc}]"

    # Spreadsheets (requires openpyxl): one tab-separated line per row.
    if suffix in (".xlsx", ".xls"):
        try:
            from openpyxl import load_workbook
            workbook = load_workbook(file_path, read_only=True)
            rows: list[str] = []
            for worksheet in workbook.worksheets:
                for values in worksheet.iter_rows(values_only=True):
                    rows.append("\t".join("" if cell is None else str(cell) for cell in values))
            return "\n".join(rows)
        except ImportError:
            return f"[XLSX file: {file_path}] - openpyxl not installed"
        except Exception as exc:
            return f"[XLSX read error: {exc}]"

    # Unknown extension: try text first, report as binary on failure.
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    except Exception:
        return f"[Binary file: {file_path}]"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def fetch_document_content(remote_path: str, token: str) -> str:
    """Fetch the textual content of a Yandex Disk file.

    Downloads the file to a temporary location, extracts its text, and
    always deletes the temporary file afterwards.  Failures are reported
    as bracketed placeholder strings instead of raising.
    """
    if not token:
        return "[Yandex Disk token not provided]"

    # The API expects paths without the "disk:/" scheme prefix.
    normalized = remote_path[6:] if remote_path.startswith("disk:/") else remote_path

    # Reserve a temp file name with the same suffix (drives extension-based
    # extraction later); the handle is closed immediately.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(normalized).suffix) as tmp_file:
        tmp_name = tmp_file.name

    try:
        download_yadisk_file(normalized, token, tmp_name)
        return extract_text_from_file(tmp_name)
    except Exception as exc:
        return f"[Yandex Disk download error: {exc}]"
    finally:
        if os.path.exists(tmp_name):
            os.unlink(tmp_name)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# OpenAI-compatible LLM Evaluator
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def create_evaluation_prompt(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str
) -> str:
    """Create the judge prompt for evaluating a single RAG response.

    Picks section-specific evaluation criteria (falling back to generic
    ones for unknown sections) and embeds the question, the RAG answer and
    a length-capped excerpt of the source document.

    Args:
        question: The user question being evaluated.
        rag_response: The RAG system's answer to score.
        document_content: Full text of the reference document.
        section_type: Section heading used to select evaluation criteria.

    Returns:
        The complete prompt string (in Russian) to send to the LLM judge.
    """

    # Section-specific evaluation criteria, sent verbatim to the LLM.
    section_criteria = {
        "Entity/Fact Recall (Response Relevance)": """
Критерии оценки:
- Насколько точно ответ извлекает факты и сущности из документа
- Соответствует ли ответ на вопрос о ключевых участниках и их ролях
- Полнота извлечения фактов из контекста
""",
        "Numerical & Temporal Precision": """
Критерии оценки:
- Точность извлечения дат, лет, числовых значений
- Соответствие чисел в ответе числам в документе
- Правильность временных привязок событий
""",
        "Context Precision (Evidence-anchored)": """
Критерии оценки:
- Насколько хорошо ответ идентифицирует релевантные фрагменты
- Умение отличать релевантные фрагменты от нерелевантных
- Обоснованность выбора контекста
""",
        "Faithfulness / Non-hallucination": """
Критерии оценки:
- Отсутствие выдуманной информации
- Ответ основан только на предоставленном контексте
- Корректное указание на отсутствие информации, если её нет
""",
        "Reasoning & Synthesis": """
Критерии оценки:
- Качество синтеза информации из нескольких фрагментов
- Логичность выводов
- Указание на ограничения, риски или условия
"""
    }

    criteria = section_criteria.get(section_type, """
Критерии оценки:
- Релевантность ответа вопросу
- Точность фактов
- Отсутствие галлюцинаций
- Полнота ответа
""")

    # Cap the document excerpt so the prompt stays within the model's
    # context window.
    # BUGFIX: this note previously lived as a "# ..." pseudo-comment INSIDE
    # the f-string below, so the comment text was sent to the LLM as part
    # of the prompt.
    doc_excerpt = document_content[:8000]

    prompt = f"""Ты — эксперт по оценке качества RAG-систем (Retrieval-Augmented Generation).

Твоя задача: оценить качество ответа RAG-системы на вопрос пользователя, сравнив его с содержимым исходного документа.

## Вопрос пользователя:
{question}

## Ответ RAG-системы:
{rag_response}

## Содержимое исходного документа:
{doc_excerpt}

{criteria}

## Формат ответа:
Верни ответ ТОЛЬКО в формате JSON:
{{
    "score": <число от 0.0 до 1.0>,
    "rationale": "<краткое обоснование оценки на русском языке>",
    "strengths": ["<сильные стороны>"],
    "weaknesses": ["<слабые стороны>"],
    "hallucination_detected": <true/false>,
    "missing_info": ["<отсутствующая важная информация>"]
}}

Оценка:
- 1.0: Идеальный ответ, полностью точный и полный
- 0.8-0.9: Очень хороший ответ с незначительными неточностями
- 0.6-0.7: Хороший ответ, но есть некоторые проблемы
- 0.4-0.5: Удовлетворительный ответ с существенными проблемами
- 0.2-0.3: Плохой ответ, много ошибок или неполный
- 0.0-0.1: Ответ неверный или содержит галлюцинации
"""
    return prompt
|
|||
|
|
|
|||
|
|
|
|||
|
|
def evaluate_with_llm(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    model: str = OPENAI_CHAT_MODEL,
    api_url: str = OPENAI_CHAT_URL,
    api_key: str = OPENAI_CHAT_KEY
) -> dict[str, Any]:
    """Evaluate a RAG response using the OpenAI-compatible LLM judge.

    Returns a dict that always contains "score" (float) and "rationale";
    an "error" key is added on failure.  The score defaults to 0.0 for
    API/config errors and 0.5 when the judge's output cannot be parsed.
    """

    if not api_key:
        return {
            "score": 0.0,
            "rationale": "API key not provided",
            "error": "Missing API key"
        }

    prompt = create_evaluation_prompt(question, rag_response, document_content, section_type)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Ты — эксперт по оценке качества RAG-систем. Отвечай ТОЛЬКО в формате JSON."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        # Low temperature: we want near-deterministic, reproducible verdicts.
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(
            api_url + "/chat/completions",
            headers=headers,
            json=payload,
            timeout=LLM_TIMEOUT
        )
        response.raise_for_status()

        result = response.json()

        # Safely extract the assistant message content.
        # BUGFIX: also guard against non-dict entries (AttributeError/TypeError),
        # not just missing keys/indices.
        try:
            content = result.get("choices", [{}])[0].get("message", {}).get("content")
        except (IndexError, KeyError, AttributeError, TypeError):
            content = None

        if not content:
            return {
                "score": 0.5,
                "rationale": "LLM returned empty or malformed response",
                "error": "Empty content in LLM response"
            }

        # Parse the judge's JSON verdict (possibly embedded in extra text).
        try:
            json_match = re.search(r'\{[^{}]*\}', content, re.DOTALL)
            if json_match:
                evaluation = json.loads(json_match.group())
            else:
                evaluation = json.loads(content)

            # BUGFIX: a non-numeric "score" used to raise an uncaught
            # TypeError/ValueError from float(); it now falls into the
            # parse-failure path below.
            return {
                "score": float(evaluation.get("score", 0.0)),
                "rationale": evaluation.get("rationale", "") or "",
                "strengths": evaluation.get("strengths", []),
                "weaknesses": evaluation.get("weaknesses", []),
                "hallucination_detected": evaluation.get("hallucination_detected", False),
                "missing_info": evaluation.get("missing_info", [])
            }
        except (json.JSONDecodeError, TypeError, ValueError) as e:
            return {
                "score": 0.5,
                "rationale": f"Failed to parse LLM response: {content[:200]}",
                "error": str(e)
            }

    except requests.RequestException as e:
        return {
            "score": 0.0,
            "rationale": f"LLM API error: {e}",
            "error": str(e)
        }
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# Results Output
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def truncate_text(text: str, max_len: int = 1500) -> str:
    """Strip *text* and cut it to *max_len* chars, marking any truncation."""
    cleaned = (text or "").strip()
    return cleaned if len(cleaned) <= max_len else cleaned[:max_len] + "... [truncated]"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def format_question_result(q: QuestionItem, doc_path: str) -> str:
    """Render a single evaluated question as a markdown fragment.

    NOTE(review): *doc_path* is accepted but currently unused; kept for
    interface stability with existing callers.
    """
    return (
        f"#### Вопрос #{q.question_number}\n"
        f"**Вопрос:** {q.question}\n"
        "\n"
        f"**Секция:** {q.section}\n"
        "\n"
        "**Ответ LangChain:**\n"
        "```\n"
        f"{truncate_text(q.langchain_answer)}\n"
        "```\n"
        "\n"
        "**Ответ LlamaIndex:**\n"
        "```\n"
        f"{truncate_text(q.llamaindex_answer)}\n"
        "```\n"
        "\n"
        "**Результаты оценки:**\n"
        f"- LangChain Score: {q.langchain_score:.2f}\n"
        f"- LlamaIndex Score: {q.llamaindex_score:.2f}\n"
        f"- Победитель: **{q.winner}**\n"
        "\n"
        f"**Обоснование:** {q.rationale}\n"
        "\n"
        "---\n"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def format_document_results(doc: DocumentItem, with_results: bool = True) -> str:
    """Render one document (and optionally its per-question results) as markdown."""
    parts = [
        doc.header,
        "",
        f"**Путь к файлу:** `{doc.path}`",
        ""
    ]

    if not with_results:
        # Placeholder shown when the document has not been evaluated yet.
        parts += ["_Результаты ещё не обработаны._", ""]
    else:
        for title, section_questions in doc.sections:
            parts += [f"### {title}", ""]
            parts.extend(
                format_question_result(item, doc.path) for item in section_questions
            )

    return "\n".join(parts)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def format_summary(all_questions: list[QuestionItem], batch_info: dict) -> str:
    """Build the summary markdown: win counts, average scores, overall ranking."""
    win_counts = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    lc_scores: list[float] = []
    li_scores: list[float] = []

    for item in all_questions:
        win_counts[item.winner] += 1
        lc_scores.append(item.langchain_score)
        li_scores.append(item.llamaindex_score)

    # max(1, n) avoids division by zero for an empty batch.
    mean_lc = sum(lc_scores) / max(1, len(lc_scores))
    mean_li = sum(li_scores) / max(1, len(li_scores))

    # A system must lead by more than 0.05 on average to win overall.
    if mean_lc > mean_li + 0.05:
        overall = "LangChain"
    elif mean_li > mean_lc + 0.05:
        overall = "LlamaIndex"
    else:
        overall = "Ничья"

    summary_lines = [
        "## Итоговая сводка",
        "",
        f"- Всего вопросов оценено: {len(all_questions)}",
        f"- Диапазон вопросов: {batch_info.get('from', 1)} - {batch_info.get('to', len(all_questions))}",
        "",
        "### Победители по вопросам:",
        f"- LangChain: {win_counts['LangChain']}",
        f"- LlamaIndex: {win_counts['LlamaIndex']}",
        f"- Ничья: {win_counts['Tie']}",
        "",
        "### Средние оценки:",
        f"- LangChain: {mean_lc:.3f}",
        f"- LlamaIndex: {mean_li:.3f}",
        "",
        f"### Итоговый рейтинг: **{overall}**",
        "",
        "_Методика оценки: LLM-оценка на основе сравнения с содержимым документов из Yandex Disk._",
        "",
        "---",
        ""
    ]
    return "\n".join(summary_lines)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def write_results(
    header_lines: list[str],
    docs: list[DocumentItem],
    all_questions: list[QuestionItem],
    batch_info: dict,
    output_path: Path
) -> None:
    """Write evaluation results to a markdown file.

    Layout: the original input-file header, a generation timestamp, the
    summary block, then detailed per-document results.

    Args:
        header_lines: Header lines from the input markdown, copied through.
        docs: All parsed documents (results rendered per document).
        all_questions: The evaluated questions for the summary statistics.
        batch_info: {"from": int, "to": int} describing the evaluated range.
        output_path: Destination markdown file (overwritten).
    """
    from datetime import datetime  # local import: only needed here

    output_parts: list[str] = []

    # Header copied from the input file.
    output_parts.extend(header_lines)
    output_parts.append("")
    output_parts.append("# Результаты оценки RAG-систем")
    output_parts.append("")
    # BUGFIX: this line previously printed the *previous* output file's
    # st_mtime (a raw epoch float) or 'N/A' — not the generation time.
    output_parts.append(f"Дата генерации: {datetime.now().isoformat(timespec='seconds')}")
    output_parts.append("")

    # Summary statistics.
    output_parts.append(format_summary(all_questions, batch_info))

    # Detailed results per document.
    for doc in docs:
        output_parts.append(format_document_results(doc, with_results=True))

    output_path.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# Main Evaluation Loop
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def run_evaluation(
    from_q: int,
    to_q: int,
    timeout_rag: int = RAG_TIMEOUT,
    timeout_llm: int = LLM_TIMEOUT
) -> None:
    """Run the evaluation for the specified 1-based question range.

    For every question in [from_q, to_q]: query both RAG systems, fetch
    the source document from Yandex Disk, score each answer with the LLM
    judge, pick a winner, and finally write all results to OUTPUT_MD.

    NOTE(review): timeout_llm is accepted for interface compatibility but
    evaluate_with_llm currently uses the module-level LLM_TIMEOUT — confirm
    whether it should be threaded through.
    """
    import time  # BUGFIX: replaces the repeated __import__("time") hack

    print(f"Загрузка документов из {INPUT_MD}...")
    docs = parse_all_documents(INPUT_MD)
    all_flat = flatten_questions(docs)

    total_questions = len(all_flat)
    print(f"Всего вопросов найдено: {total_questions}")

    # Clamp the requested range to the available questions.
    from_q = max(1, from_q)
    to_q = min(total_questions, to_q)

    if from_q > to_q:
        print(f"Ошибка: диапазон {from_q}:{to_q} некорректен")
        return

    print(f"Оценка вопросов с {from_q} по {to_q}...")

    # Keep the original input-file header for the output document.
    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, _ = split_documents(raw)

    # Process each question in range (q_index is the 1-based global number).
    for q_index, (doc, q) in enumerate(all_flat, start=1):
        if q_index < from_q or q_index > to_q:
            continue

        q.question_number = q_index
        print(f"\n[{q_index}/{total_questions}] {q.question[:80]}...")

        # Query both RAG systems, reporting wall-clock latency.
        print(f" -> LangChain...", end=" ", flush=True)
        t0 = time.time()
        q.langchain_answer = call_langchain(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        print(f" -> LlamaIndex...", end=" ", flush=True)
        t0 = time.time()
        q.llamaindex_answer = call_llamaindex(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        # Fetch the reference document content for the judge.
        print(f" -> Загрузка документа из Yandex Disk...", end=" ", flush=True)
        if doc.path:
            doc_content = fetch_document_content(doc.path, YADISK_TOKEN)
            print(f"OK ({len(doc_content)} символов)")
        else:
            doc_content = "[Путь к документу не найден]"
            print("SKIP (нет пути)")

        # Judge the LangChain answer.
        print(f" -> Оценка LangChain...", end=" ", flush=True)
        lc_eval = evaluate_with_llm(
            q.question, q.langchain_answer, doc_content, q.section
        )
        q.langchain_score = lc_eval.get("score", 0.0)
        print(f"Score: {q.langchain_score:.2f}")

        # Judge the LlamaIndex answer.
        print(f" -> Оценка LlamaIndex...", end=" ", flush=True)
        li_eval = evaluate_with_llm(
            q.question, q.llamaindex_answer, doc_content, q.section
        )
        q.llamaindex_score = li_eval.get("score", 0.0)
        print(f"Score: {q.llamaindex_score:.2f}")

        # Winner: scores within 0.05 of each other count as a tie.
        if abs(q.langchain_score - q.llamaindex_score) < 0.05:
            q.winner = "Tie"
        elif q.langchain_score > q.llamaindex_score:
            q.winner = "LangChain"
        else:
            q.winner = "LlamaIndex"

        # Combine both judges' rationales into one field.
        q.rationale = f"LC: {lc_eval.get('rationale', '')} | LI: {li_eval.get('rationale', '')}"

    # Write results (only questions that were numbered, i.e. evaluated).
    print(f"\nЗапись результатов в {OUTPUT_MD}...")
    batch_info = {"from": from_q, "to": to_q}
    evaluated_questions = [
        q for _, q in all_flat
        if from_q <= q.question_number <= to_q
    ]

    write_results(header_lines, docs, evaluated_questions, batch_info, OUTPUT_MD)
    print(f"Готово! Результаты сохранены в {OUTPUT_MD}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> int:
    """Main entry point: parse CLI args, validate config, run the evaluation.

    Returns:
        Process exit code: 0 on success, 1 on argument errors.
    """
    parser = argparse.ArgumentParser(
        description="Оценка RAG-систем с использованием LLM и Yandex Disk"
    )
    parser.add_argument(
        "range",
        type=str,
        help="Диапазон вопросов для оценки в формате 'from:to' (например, 1:10)"
    )
    parser.add_argument(
        "--timeout-rag",
        type=int,
        default=RAG_TIMEOUT,
        help=f"Таймаут для RAG API (по умолчанию {RAG_TIMEOUT}s)"
    )
    parser.add_argument(
        "--timeout-llm",
        type=int,
        default=LLM_TIMEOUT,
        help=f"Таймаут для LLM API (по умолчанию {LLM_TIMEOUT}s)"
    )

    args = parser.parse_args()

    # Parse the 'from:to' range argument.
    # BUGFIX: use fullmatch — re.match accepted trailing garbage ("1:10abc").
    range_match = re.fullmatch(r"(\d+):(\d+)", args.range)
    if not range_match:
        print("Ошибка: диапазон должен быть в формате 'from:to' (например, 1:10)")
        return 1

    from_q = int(range_match.group(1))
    to_q = int(range_match.group(2))

    if from_q > to_q:
        print("Ошибка: 'from' должно быть меньше или равно 'to'")
        return 1

    # Warn about missing credentials but continue: downstream code degrades
    # gracefully (LLM scoring / document downloads are skipped).
    if not OPENAI_CHAT_KEY:
        print("Предупреждение: OPENAI_CHAT_KEY не установлен. Оценка LLM будет пропущена.")
    if not YADISK_TOKEN:
        print("Предупреждение: YADISK_TOKEN не установлен. Загрузка документов будет пропущена.")

    # Run evaluation
    run_evaluation(from_q, to_q, args.timeout_rag, args.timeout_llm)

    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: the exit code from main() becomes the process status.
if __name__ == "__main__":
    raise SystemExit(main())
|