evaluation for rag systems
This commit is contained in:
782
rag_evaluation.py
Normal file
782
rag_evaluation.py
Normal file
@@ -0,0 +1,782 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RAG Evaluation Script
|
||||
|
||||
Evaluates two RAG systems (LangChain and LlamaIndex) using OpenAI-compatible LLM
|
||||
for scoring, with Yandex Disk integration for document verification.
|
||||
|
||||
Usage:
|
||||
python rag_evaluation.py 1:10 # Evaluate questions 1 to 10
|
||||
python rag_evaluation.py 5:20 # Evaluate questions 5 to 20
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import requests
|
||||
|
||||
# =============================================================================
# Configuration
# =============================================================================

# OpenAI-compatible LLM settings: model id, API base URL (no trailing path),
# and bearer key. All overridable via environment variables / .env.
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
OPENAI_CHAT_URL = os.getenv("OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1")
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")

# RAG system URLs — the two systems under comparison, expected to be
# running locally on these ports.
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Yandex Disk OAuth token, used to download the reference documents.
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")

# File paths: questions are read from INPUT_MD, results written to OUTPUT_MD,
# both alongside this script.
INPUT_MD = Path(__file__).parent / "DOCUMENTS_TO_TEST.md"
OUTPUT_MD = Path(__file__).parent / "EVALUATION_RESULT.md"

# Timeouts for the three kinds of network calls.
RAG_TIMEOUT = 120 # seconds
LLM_TIMEOUT = 60 # seconds
YADISK_TIMEOUT = 60 # seconds
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class QuestionItem:
    """Represents a single question with its evaluation results."""
    # Name of the "### " section the question was parsed from.
    section: str
    # The question text (content of the "- " bullet).
    question: str
    # 1-based global index across all documents; 0 until assigned in a run.
    question_number: int = 0
    # Raw answer from each RAG system ("ERROR: ..." string on failure).
    langchain_answer: str = ""
    llamaindex_answer: str = ""
    # LLM-judge scores in [0.0, 1.0] for each system's answer.
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0
    # "LangChain", "LlamaIndex", or "Tie" (scores within 0.05 of each other).
    winner: str = "Tie"
    # Combined judge rationale, formatted "LC: ... | LI: ...".
    rationale: str = ""
    # Free-form error description; empty when evaluation succeeded.
    error: str = ""
||||
|
||||
|
||||
@dataclass
class DocumentItem:
    """Represents a document with its associated questions."""
    # The original "## ..." markdown header line for this document.
    header: str
    # Remote document path extracted from backticks in the header;
    # empty when the header contains no `...` span.
    path: str
    # Ordered (section name, questions) pairs parsed from "### " sections.
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)
||||
|
||||
|
||||
# =============================================================================
|
||||
# Document Parser
|
||||
# =============================================================================
|
||||
|
||||
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split markdown text into preamble lines and per-document blocks.

    Everything before the first "## " heading goes into the header list;
    each "## " heading starts a new document block. Blocks are returned
    as newline-joined strings.
    """
    header: list[str] = []
    blocks: list[list[str]] = []

    for raw_line in md_text.splitlines():
        if raw_line.startswith("## "):
            # A new document starts on this heading line.
            blocks.append([raw_line])
        elif blocks:
            # Continuation of the most recently opened document.
            blocks[-1].append(raw_line)
        else:
            # Still before the first document heading.
            header.append(raw_line)

    return header, ["\n".join(block) for block in blocks]
|
||||
|
||||
|
||||
def parse_document_block(block: str) -> DocumentItem:
    """Parse one "## ..." document block into a DocumentItem.

    The first line is the document header; a path inside backticks is
    extracted from it. "### " lines open sections and "- " lines add
    questions to the currently open section.
    """
    block_lines = block.splitlines()
    header = block_lines[0].strip()

    # The document path is the first backtick-quoted span in the header.
    path_match = re.search(r"`([^`]+)`", header)
    doc_path = path_match.group(1) if path_match else ""

    sections: list[tuple[str, list[QuestionItem]]] = []
    section_name = ""
    questions: list[QuestionItem] = []

    for line in block_lines[1:]:
        if line.startswith("### "):
            # Close the previous section (if one was open), start a new one.
            if section_name:
                sections.append((section_name, questions))
            section_name = line[4:].strip()
            questions = []
        elif line.startswith("- "):
            text = line[2:].strip()
            if text:
                questions.append(QuestionItem(section=section_name, question=text))

    # Close the final open section.
    if section_name:
        sections.append((section_name, questions))

    return DocumentItem(header=header, path=doc_path, sections=sections)
|
||||
|
||||
|
||||
def parse_all_documents(md_path: Path) -> list[DocumentItem]:
    """Read the markdown file at *md_path* and parse every document block."""
    _, blocks = split_documents(md_path.read_text(encoding="utf-8"))
    return [parse_document_block(block) for block in blocks]
|
||||
|
||||
|
||||
def flatten_questions(docs: list[DocumentItem]) -> list[tuple[DocumentItem, QuestionItem]]:
    """Flatten every question of every document into (document, question) pairs.

    Pair order follows document order, then section order, then question
    order within each section.
    """
    return [
        (doc, question)
        for doc in docs
        for _, section_questions in doc.sections
        for question in section_questions
    ]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RAG API Clients
|
||||
# =============================================================================
|
||||
|
||||
def call_langchain(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send *query* to the LangChain RAG endpoint and return its answer text.

    Any failure (network error, HTTP status, bad JSON) is reported as an
    "ERROR: ..." string rather than raised.
    """
    try:
        reply = requests.post(LANGCHAIN_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:
        return f"ERROR: {exc}"
|
||||
|
||||
|
||||
def call_llamaindex(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send *query* to the LlamaIndex RAG endpoint and return its answer text.

    Any failure (network error, HTTP status, bad JSON) is reported as an
    "ERROR: ..." string rather than raised.
    """
    try:
        reply = requests.post(LLAMAINDEX_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:
        return f"ERROR: {exc}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Yandex Disk Integration
|
||||
# =============================================================================
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str, timeout: int = YADISK_TIMEOUT) -> None:
    """Download *remote_path* from Yandex Disk into *local_path*.

    Two-step Disk API flow: first request a short-lived download href,
    then fetch the file content from it. Raises requests exceptions
    (e.g. HTTPError) on failure.
    """
    auth_headers = {"Authorization": f"OAuth {token}"}

    # Step 1: resolve the download href for the remote path.
    meta = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=auth_headers,
        params={"path": remote_path},
        timeout=timeout,
    )
    meta.raise_for_status()

    # Step 2: fetch the file itself — larger payload, so a doubled timeout.
    payload = requests.get(meta.json()["href"], timeout=timeout * 2)
    payload.raise_for_status()

    with open(local_path, "wb") as out:
        out.write(payload.content)
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a downloaded file based on its extension.

    Plain-text formats are read directly. docx/pdf/xlsx are parsed with
    optional third-party libraries (python-docx, PyPDF2, openpyxl),
    falling back to a bracketed placeholder string when the library is
    missing or parsing fails. Unknown extensions are tried as text.
    """
    ext = Path(file_path).suffix.lower()

    # Text-based formats: read as-is, tolerating bad bytes.
    if ext in {".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"}:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if ext in (".docx", ".doc"):
        try:
            from docx import Document
            paragraphs = Document(file_path).paragraphs
            return "\n".join(p.text for p in paragraphs)
        except ImportError:
            return f"[DOCX file: {file_path}] - python-docx not installed"
        except Exception as e:
            return f"[DOCX read error: {e}]"

    if ext == ".pdf":
        try:
            import PyPDF2
            with open(file_path, "rb") as fh:
                # extract_text() may return None for image-only pages.
                extracted = [page.extract_text() or "" for page in PyPDF2.PdfReader(fh).pages]
            return "\n".join(extracted)
        except ImportError:
            return f"[PDF file: {file_path}] - PyPDF2 not installed"
        except Exception as e:
            return f"[PDF read error: {e}]"

    if ext in (".xlsx", ".xls"):
        try:
            from openpyxl import load_workbook
            workbook = load_workbook(file_path, read_only=True)
            rows: list[str] = []
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    rows.append("\t".join(str(cell) if cell is not None else "" for cell in row))
            return "\n".join(rows)
        except ImportError:
            return f"[XLSX file: {file_path}] - openpyxl not installed"
        except Exception as e:
            return f"[XLSX read error: {e}]"

    # Unknown extension: attempt a plain-text read, else a placeholder.
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    except Exception:
        return f"[Binary file: {file_path}]"
|
||||
|
||||
|
||||
def fetch_document_content(remote_path: str, token: str) -> str:
    """Fetch the text content of a Yandex Disk file.

    Downloads the file to a temporary local path, extracts its text, and
    always removes the temporary file afterwards. Failures are reported
    as bracketed placeholder strings rather than raised.
    """
    if not token:
        return "[Yandex Disk token not provided]"

    # The Disk API expects paths without the "disk:/" scheme prefix.
    clean_path = remote_path.removeprefix("disk:/")

    # Reserve a temp file name with the right suffix; closed immediately
    # so download_yadisk_file can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(clean_path).suffix) as tmp:
        local_path = tmp.name

    try:
        download_yadisk_file(clean_path, token, local_path)
        return extract_text_from_file(local_path)
    except Exception as e:
        return f"[Yandex Disk download error: {e}]"
    finally:
        # Best-effort cleanup of the temp file in every outcome.
        if os.path.exists(local_path):
            os.unlink(local_path)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OpenAI-compatible LLM Evaluator
|
||||
# =============================================================================
|
||||
|
||||
def create_evaluation_prompt(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str
) -> str:
    """Create a prompt for the LLM to evaluate a RAG response.

    Args:
        question: The user question being evaluated.
        rag_response: The RAG system's answer to the question.
        document_content: Ground-truth document text (truncated to 8000
            characters inside the prompt).
        section_type: Section name selecting section-specific criteria;
            unknown names fall back to generic criteria.

    Returns:
        The full judging prompt (in Russian) demanding a JSON-only answer.
    """

    # Section-specific evaluation criteria, keyed by the "### " section
    # names used in the input markdown; values are inserted verbatim.
    section_criteria = {
        "Entity/Fact Recall (Response Relevance)": """
Критерии оценки:
- Насколько точно ответ извлекает факты и сущности из документа
- Соответствует ли ответ на вопрос о ключевых участниках и их ролях
- Полнота извлечения фактов из контекста
""",
        "Numerical & Temporal Precision": """
Критерии оценки:
- Точность извлечения дат, лет, числовых значений
- Соответствие чисел в ответе числам в документе
- Правильность временных привязок событий
""",
        "Context Precision (Evidence-anchored)": """
Критерии оценки:
- Насколько хорошо ответ идентифицирует релевантные фрагменты
- Умение отличать релевантные фрагменты от нерелевантных
- Обоснованность выбора контекста
""",
        "Faithfulness / Non-hallucination": """
Критерии оценки:
- Отсутствие выдуманной информации
- Ответ основан только на предоставленном контексте
- Корректное указание на отсутствие информации, если её нет
""",
        "Reasoning & Synthesis": """
Критерии оценки:
- Качество синтеза информации из нескольких фрагментов
- Логичность выводов
- Указание на ограничения, риски или условия
"""
    }

    # Generic fallback criteria for section names not listed above.
    criteria = section_criteria.get(section_type, """
Критерии оценки:
- Релевантность ответа вопросу
- Точность фактов
- Отсутствие галлюцинаций
- Полнота ответа
""")

    # NOTE(review): the " # Ограничиваем длину для контекста" text below is
    # INSIDE the f-string literal, so it is emitted into the prompt itself —
    # it is not a Python comment. Document content is capped at 8000 chars.
    prompt = f"""Ты — эксперт по оценке качества RAG-систем (Retrieval-Augmented Generation).

Твоя задача: оценить качество ответа RAG-системы на вопрос пользователя, сравнив его с содержимым исходного документа.

## Вопрос пользователя:
{question}

## Ответ RAG-системы:
{rag_response}

## Содержимое исходного документа:
{document_content[:8000]} # Ограничиваем длину для контекста

{criteria}

## Формат ответа:
Верни ответ ТОЛЬКО в формате JSON:
{{
"score": <число от 0.0 до 1.0>,
"rationale": "<краткое обоснование оценки на русском языке>",
"strengths": ["<сильные стороны>"],
"weaknesses": ["<слабые стороны>"],
"hallucination_detected": <true/false>,
"missing_info": ["<отсутствующая важная информация>"]
}}

Оценка:
- 1.0: Идеальный ответ, полностью точный и полный
- 0.8-0.9: Очень хороший ответ с незначительными неточностями
- 0.6-0.7: Хороший ответ, но есть некоторые проблемы
- 0.4-0.5: Удовлетворительный ответ с существенными проблемами
- 0.2-0.3: Плохой ответ, много ошибок или неполный
- 0.0-0.1: Ответ неверный или содержит галлюцинации
"""
    return prompt
|
||||
|
||||
|
||||
def evaluate_with_llm(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    model: str = OPENAI_CHAT_MODEL,
    api_url: str = OPENAI_CHAT_URL,
    api_key: str = OPENAI_CHAT_KEY
) -> dict[str, Any]:
    """Evaluate a RAG response using the OpenAI-compatible LLM.

    Builds the judging prompt, calls the chat-completions endpoint, and
    parses the JSON verdict out of the model's reply.

    Args:
        question: The user question that was asked.
        rag_response: The RAG system's answer to score.
        document_content: Reference document text used as ground truth.
        section_type: Section name selecting the evaluation criteria.
        model: Chat model identifier.
        api_url: Base URL of the OpenAI-compatible API (no trailing path).
        api_key: Bearer token; when empty, evaluation is skipped.

    Returns:
        A dict with at least "score" and "rationale". On failure the dict
        also carries an "error" key: score 0.0 for hard failures (no key,
        request error), 0.5 when the LLM answered but the verdict could
        not be extracted or parsed.
    """

    if not api_key:
        # No credentials: report a zero score instead of raising.
        return {
            "score": 0.0,
            "rationale": "API key not provided",
            "error": "Missing API key"
        }

    prompt = create_evaluation_prompt(question, rag_response, document_content, section_type)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Low temperature for reproducible judging; small completion budget
    # since only a short JSON verdict is expected back.
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Ты — эксперт по оценке качества RAG-систем. Отвечай ТОЛЬКО в формате JSON."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(
            api_url + "/chat/completions",
            headers=headers,
            json=payload,
            timeout=LLM_TIMEOUT
        )
        response.raise_for_status()

        result = response.json()

        # Safely extract the assistant message content; an empty "choices"
        # list raises IndexError, which is treated as missing content.
        try:
            content = result.get("choices", [{}])[0].get("message", {}).get("content")
        except (IndexError, KeyError):
            content = None

        if not content:
            return {
                "score": 0.5,
                "rationale": "LLM returned empty or malformed response",
                "error": "Empty content in LLM response"
            }

        # Parse the JSON verdict out of the reply text.
        try:
            # Prefer the first brace-delimited span, since the model may
            # wrap the JSON in prose. NOTE(review): this pattern does not
            # handle nested braces; it relies on the verdict being flat.
            json_match = re.search(r'\{[^{}]*\}', content, re.DOTALL)
            if json_match:
                evaluation = json.loads(json_match.group())
            else:
                evaluation = json.loads(content)

            return {
                "score": float(evaluation.get("score", 0.0)),
                "rationale": evaluation.get("rationale", "") or "",
                "strengths": evaluation.get("strengths", []),
                "weaknesses": evaluation.get("weaknesses", []),
                "hallucination_detected": evaluation.get("hallucination_detected", False),
                "missing_info": evaluation.get("missing_info", [])
            }
        except json.JSONDecodeError as e:
            # Verdict text was not valid JSON: neutral score with context.
            return {
                "score": 0.5,
                "rationale": f"Failed to parse LLM response: {content[:200]}",
                "error": str(e)
            }

    except requests.RequestException as e:
        # Network/HTTP failure: hard zero score.
        return {
            "score": 0.0,
            "rationale": f"LLM API error: {e}",
            "error": str(e)
        }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Results Output
|
||||
# =============================================================================
|
||||
|
||||
def truncate_text(text: str, max_len: int = 1500) -> str:
    """Strip *text* and cap it at *max_len* characters, marking truncation.

    None is treated as an empty string.
    """
    cleaned = (text or "").strip()
    return cleaned if len(cleaned) <= max_len else cleaned[:max_len] + "... [truncated]"
|
||||
|
||||
|
||||
def format_question_result(q: QuestionItem, doc_path: str) -> str:
    """Format a single question result for the output markdown.

    Args:
        q: The evaluated question (answers, scores, winner, rationale).
        doc_path: Document path; accepted but currently unused in the
            rendered output.

    Returns:
        A markdown fragment ending with a "---" separator line.
    """
    # Answers are wrapped in fenced code blocks and truncated for display.
    lines = [
        f"#### Вопрос #{q.question_number}",
        f"**Вопрос:** {q.question}",
        "",
        f"**Секция:** {q.section}",
        "",
        "**Ответ LangChain:**",
        f"```",
        truncate_text(q.langchain_answer),
        "```",
        "",
        "**Ответ LlamaIndex:**",
        f"```",
        truncate_text(q.llamaindex_answer),
        "```",
        "",
        "**Результаты оценки:**",
        f"- LangChain Score: {q.langchain_score:.2f}",
        f"- LlamaIndex Score: {q.llamaindex_score:.2f}",
        f"- Победитель: **{q.winner}**",
        "",
        f"**Обоснование:** {q.rationale}",
        "",
        "---",
        ""
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_document_results(doc: DocumentItem, with_results: bool = True) -> str:
    """Render a document (and optionally its per-question results) as markdown.

    When *with_results* is False, a "not processed yet" placeholder is
    emitted instead of the per-section question results.
    """
    parts = [
        doc.header,
        "",
        f"**Путь к файлу:** `{doc.path}`",
        ""
    ]

    if not with_results:
        parts.append("_Результаты ещё не обработаны._")
        parts.append("")
        return "\n".join(parts)

    for section_name, section_questions in doc.sections:
        parts.append(f"### {section_name}")
        parts.append("")
        for question in section_questions:
            parts.append(format_question_result(question, doc.path))

    return "\n".join(parts)
|
||||
|
||||
|
||||
def format_summary(all_questions: list[QuestionItem], batch_info: dict) -> str:
    """Build the markdown summary: win counts, mean scores, overall ranking.

    Raises KeyError if a question's winner is not one of the three
    expected labels ("LangChain", "LlamaIndex", "Tie").
    """
    scores_lc = [item.langchain_score for item in all_questions]
    scores_li = [item.llamaindex_score for item in all_questions]

    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    for item in all_questions:
        wins[item.winner] += 1

    # max(1, ...) guards against division by zero on an empty batch.
    avg_lc = sum(scores_lc) / max(1, len(scores_lc))
    avg_li = sum(scores_li) / max(1, len(scores_li))

    # A system must lead by more than 0.05 on average to win overall.
    if avg_lc > avg_li + 0.05:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.05:
        ranking = "LlamaIndex"
    else:
        ranking = "Ничья"

    summary = [
        "## Итоговая сводка",
        "",
        f"- Всего вопросов оценено: {len(all_questions)}",
        f"- Диапазон вопросов: {batch_info.get('from', 1)} - {batch_info.get('to', len(all_questions))}",
        "",
        "### Победители по вопросам:",
        f"- LangChain: {wins['LangChain']}",
        f"- LlamaIndex: {wins['LlamaIndex']}",
        f"- Ничья: {wins['Tie']}",
        "",
        "### Средние оценки:",
        f"- LangChain: {avg_lc:.3f}",
        f"- LlamaIndex: {avg_li:.3f}",
        "",
        f"### Итоговый рейтинг: **{ranking}**",
        "",
        "_Методика оценки: LLM-оценка на основе сравнения с содержимым документов из Yandex Disk._",
        "",
        "---",
        ""
    ]
    return "\n".join(summary)
|
||||
|
||||
|
||||
def write_results(
    header_lines: list[str],
    docs: list[DocumentItem],
    all_questions: list[QuestionItem],
    batch_info: dict,
    output_path: Path
) -> None:
    """Write evaluation results to markdown file.

    Layout: the original input-file header, a title, the generation
    timestamp, the aggregate summary, then per-document detailed results.

    Args:
        header_lines: Preamble lines copied from the input markdown.
        docs: Parsed documents whose questions carry evaluation results.
        all_questions: The evaluated questions (used for the summary).
        batch_info: {"from": int, "to": int} range actually evaluated.
        output_path: Destination markdown file (overwritten).
    """
    from datetime import datetime  # local import: only needed here

    output_parts: list[str] = []

    # Header copied verbatim from the input file.
    output_parts.extend(header_lines)
    output_parts.append("")
    output_parts.append("# Результаты оценки RAG-систем")
    output_parts.append("")
    # BUG FIX: this previously printed the *previous* output file's mtime
    # (a raw epoch float) or 'N/A' — never the actual generation moment.
    # Use the current local time instead.
    output_parts.append(f"Дата генерации: {datetime.now().isoformat(timespec='seconds')}")
    output_parts.append("")

    # Aggregate summary across the evaluated batch.
    output_parts.append(format_summary(all_questions, batch_info))

    # Detailed results per document.
    for doc in docs:
        output_parts.append(format_document_results(doc, with_results=True))

    output_path.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Evaluation Loop
|
||||
# =============================================================================
|
||||
|
||||
def run_evaluation(
    from_q: int,
    to_q: int,
    timeout_rag: int = RAG_TIMEOUT,
    timeout_llm: int = LLM_TIMEOUT
) -> None:
    """Run the evaluation for the 1-based question range [from_q, to_q].

    For each question in range: query both RAG systems, download the
    source document from Yandex Disk, score both answers with the LLM
    judge, pick a winner, and finally write all results to OUTPUT_MD.

    Args:
        from_q: First question number (clamped to >= 1).
        to_q: Last question number (clamped to the total question count).
        timeout_rag: Per-request timeout for the RAG endpoints.
        timeout_llm: Kept for interface compatibility; the LLM judge
            currently uses the module-level LLM_TIMEOUT instead.
    """
    import time  # idiom fix: replaces the previous __import__("time") hack

    print(f"Загрузка документов из {INPUT_MD}...")
    docs = parse_all_documents(INPUT_MD)
    all_flat = flatten_questions(docs)

    total_questions = len(all_flat)
    print(f"Всего вопросов найдено: {total_questions}")

    # Clamp the requested range to the available questions.
    from_q = max(1, from_q)
    to_q = min(total_questions, to_q)

    if from_q > to_q:
        print(f"Ошибка: диапазон {from_q}:{to_q} некорректен")
        return

    print(f"Оценка вопросов с {from_q} по {to_q}...")

    # Keep the original input-file header for the output report.
    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, _ = split_documents(raw)

    # Process each question, numbering them 1-based across all documents.
    for q_index, (doc, q) in enumerate(all_flat, start=1):
        if q_index < from_q or q_index > to_q:
            continue

        q.question_number = q_index
        print(f"\n[{q_index}/{total_questions}] {q.question[:80]}...")

        # Call both RAG systems, timing each call for the progress output.
        print(f" -> LangChain...", end=" ", flush=True)
        t0 = time.time()
        q.langchain_answer = call_langchain(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        print(f" -> LlamaIndex...", end=" ", flush=True)
        t0 = time.time()
        q.llamaindex_answer = call_llamaindex(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        # Download the reference document content from Yandex Disk.
        print(f" -> Загрузка документа из Yandex Disk...", end=" ", flush=True)
        if doc.path:
            doc_content = fetch_document_content(doc.path, YADISK_TOKEN)
            print(f"OK ({len(doc_content)} символов)")
        else:
            doc_content = "[Путь к документу не найден]"
            print("SKIP (нет пути)")

        # Score the LangChain answer with the LLM judge.
        print(f" -> Оценка LangChain...", end=" ", flush=True)
        lc_eval = evaluate_with_llm(
            q.question, q.langchain_answer, doc_content, q.section
        )
        q.langchain_score = lc_eval.get("score", 0.0)
        lc_rationale = lc_eval.get("rationale", "")
        print(f"Score: {q.langchain_score:.2f}")

        # Score the LlamaIndex answer with the LLM judge.
        print(f" -> Оценка LlamaIndex...", end=" ", flush=True)
        li_eval = evaluate_with_llm(
            q.question, q.llamaindex_answer, doc_content, q.section
        )
        q.llamaindex_score = li_eval.get("score", 0.0)
        li_rationale = li_eval.get("rationale", "")
        print(f"Score: {q.llamaindex_score:.2f}")

        # Winner: a score margin below 0.05 counts as a tie.
        score_diff = abs(q.langchain_score - q.llamaindex_score)
        if score_diff < 0.05:
            q.winner = "Tie"
        elif q.langchain_score > q.llamaindex_score:
            q.winner = "LangChain"
        else:
            q.winner = "LlamaIndex"

        # Combine both judges' rationales into one field.
        q.rationale = f"LC: {lc_rationale} | LI: {li_rationale}"

    # Write results for the evaluated batch.
    print(f"\nЗапись результатов в {OUTPUT_MD}...")
    batch_info = {"from": from_q, "to": to_q}

    # Questions whose number was assigned during this run's range.
    # (Unprocessed questions keep question_number == 0, outside the range.)
    evaluated_questions = [
        q for _, q in all_flat
        if q.question_number in range(from_q, to_q + 1)
    ]

    write_results(header_lines, docs, evaluated_questions, batch_info, OUTPUT_MD)
    print(f"Готово! Результаты сохранены в {OUTPUT_MD}")
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse the 'from:to' range argument and run the evaluation.

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Оценка RAG-систем с использованием LLM и Yandex Disk"
    )
    parser.add_argument(
        "range",
        type=str,
        help="Диапазон вопросов для оценки в формате 'from:to' (например, 1:10)"
    )
    parser.add_argument(
        "--timeout-rag",
        type=int,
        default=RAG_TIMEOUT,
        help=f"Таймаут для RAG API (по умолчанию {RAG_TIMEOUT}s)"
    )
    parser.add_argument(
        "--timeout-llm",
        type=int,
        default=LLM_TIMEOUT,
        help=f"Таймаут для LLM API (по умолчанию {LLM_TIMEOUT}s)"
    )

    args = parser.parse_args()

    # Parse the range argument. BUG FIX: fullmatch (rather than match)
    # rejects trailing garbage such as "1:10abc" instead of silently
    # ignoring it.
    range_match = re.fullmatch(r"(\d+):(\d+)", args.range)
    if not range_match:
        print("Ошибка: диапазон должен быть в формате 'from:to' (например, 1:10)")
        return 1

    from_q = int(range_match.group(1))
    to_q = int(range_match.group(2))

    if from_q > to_q:
        print("Ошибка: 'from' должно быть меньше или равно 'to'")
        return 1

    # Warn about missing credentials; evaluation degrades gracefully
    # (zero scores / placeholder document text) rather than failing.
    if not OPENAI_CHAT_KEY:
        print("Предупреждение: OPENAI_CHAT_KEY не установлен. Оценка LLM будет пропущена.")
    if not YADISK_TOKEN:
        print("Предупреждение: YADISK_TOKEN не установлен. Загрузка документов будет пропущена.")

    # Run evaluation over the validated range.
    run_evaluation(from_q, to_q, args.timeout_rag, args.timeout_llm)

    return 0
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user