2026-03-11 22:30:02 +03:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
2026-03-13 08:20:18 +03:00
|
|
|
|
RAG evaluation script (file-batch mode).
|
|
|
|
|
|
|
|
|
|
|
|
Key behavior:
|
|
|
|
|
|
- Step = one document file (all its questions), not one question.
|
|
|
|
|
|
- Pre-download/caching in ./tmp/rag-evaluation (skip if already downloaded).
|
|
|
|
|
|
- Sequential API calls only (LangChain then LlamaIndex).
|
|
|
|
|
|
- Pairwise answer evaluation (both systems in one judge prompt).
|
|
|
|
|
|
- JSON output with append/overwrite support for batch runs and re-runs.
|
2026-03-11 22:30:02 +03:00
|
|
|
|
"""
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
2026-03-11 22:30:02 +03:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
import datetime as dt
import hashlib
import json
import os
import re
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
import requests
|
|
|
|
|
|
except ImportError as e: # pragma: no cover
|
|
|
|
|
|
raise SystemExit(
|
|
|
|
|
|
"Missing dependency: requests. Run with your project venv "
|
|
|
|
|
|
"(for example services/rag/langchain/venv/bin/python rag_evaluation.py ...)"
|
|
|
|
|
|
) from e
|
2026-03-11 22:30:02 +03:00
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
load_dotenv()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
|
|
|
|
|
# Configuration
|
|
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
LANGCHAIN_URL = os.getenv("LANGCHAIN_URL", "http://localhost:8331/api/test-query")
|
|
|
|
|
|
LLAMAINDEX_URL = os.getenv("LLAMAINDEX_URL", "http://localhost:8334/api/test-query")
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# OpenAI-compatible evaluator endpoint. You can point this at OpenAI-compatible providers.
|
|
|
|
|
|
OPENAI_CHAT_URL = os.getenv(
|
|
|
|
|
|
"OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1"
|
|
|
|
|
|
)
|
|
|
|
|
|
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")
|
|
|
|
|
|
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
BASE_DIR = Path(__file__).resolve().parent
|
|
|
|
|
|
INPUT_MD = BASE_DIR / "DOCUMENTS_TO_TEST.md"
|
|
|
|
|
|
OUTPUT_JSON = BASE_DIR / "EVALUATION_RESULT.json"
|
|
|
|
|
|
TMP_DIR = BASE_DIR / "tmp" / "rag-evaluation"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
RAG_TIMEOUT = int(os.getenv("RAG_TIMEOUT", "120"))
|
|
|
|
|
|
EVAL_TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "90"))
|
|
|
|
|
|
YADISK_META_TIMEOUT = int(os.getenv("YADISK_META_TIMEOUT", "30"))
|
|
|
|
|
|
YADISK_DOWNLOAD_TIMEOUT = int(os.getenv("YADISK_DOWNLOAD_TIMEOUT", "180"))
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Data structures
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
2026-03-11 22:30:02 +03:00
|
|
|
|
@dataclass
|
2026-03-13 08:20:18 +03:00
|
|
|
|
class QuestionResult:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
section: str
|
|
|
|
|
|
question: str
|
|
|
|
|
|
langchain_answer: str = ""
|
|
|
|
|
|
llamaindex_answer: str = ""
|
|
|
|
|
|
langchain_score: float = 0.0
|
|
|
|
|
|
llamaindex_score: float = 0.0
|
|
|
|
|
|
winner: str = "Tie"
|
|
|
|
|
|
rationale: str = ""
|
2026-03-13 08:20:18 +03:00
|
|
|
|
evaluator_model: str = ""
|
|
|
|
|
|
evaluated_at: str = ""
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
2026-03-13 08:20:18 +03:00
|
|
|
|
class DocumentEvaluation:
|
|
|
|
|
|
index: int
|
2026-03-11 22:30:02 +03:00
|
|
|
|
path: str
|
2026-03-13 08:20:18 +03:00
|
|
|
|
cache_file: str = ""
|
|
|
|
|
|
cache_status: str = ""
|
|
|
|
|
|
questions: list[QuestionResult] = field(default_factory=list)
|
|
|
|
|
|
started_at: str = ""
|
|
|
|
|
|
finished_at: str = ""
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Markdown parsing
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
2026-03-11 22:30:02 +03:00
|
|
|
|
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
|
|
|
|
|
|
lines = md_text.splitlines()
|
|
|
|
|
|
header: list[str] = []
|
|
|
|
|
|
docs: list[list[str]] = []
|
|
|
|
|
|
current: list[str] | None = None
|
|
|
|
|
|
for line in lines:
|
|
|
|
|
|
if line.startswith("## "):
|
|
|
|
|
|
if current is not None:
|
|
|
|
|
|
docs.append(current)
|
|
|
|
|
|
current = [line]
|
|
|
|
|
|
else:
|
|
|
|
|
|
if current is None:
|
|
|
|
|
|
header.append(line)
|
|
|
|
|
|
else:
|
|
|
|
|
|
current.append(line)
|
|
|
|
|
|
if current is not None:
|
|
|
|
|
|
docs.append(current)
|
|
|
|
|
|
return header, ["\n".join(d) for d in docs]
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def parse_document_block(idx: int, block: str) -> tuple[str, list[QuestionResult]]:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
lines = block.splitlines()
|
|
|
|
|
|
header = lines[0].strip()
|
|
|
|
|
|
m = re.search(r"`([^`]+)`", header)
|
|
|
|
|
|
doc_path = m.group(1) if m else ""
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
section = ""
|
|
|
|
|
|
questions: list[QuestionResult] = []
|
2026-03-11 22:30:02 +03:00
|
|
|
|
for line in lines[1:]:
|
|
|
|
|
|
if line.startswith("### "):
|
2026-03-13 08:20:18 +03:00
|
|
|
|
section = line[4:].strip()
|
2026-03-11 22:30:02 +03:00
|
|
|
|
elif line.startswith("- "):
|
|
|
|
|
|
q = line[2:].strip()
|
|
|
|
|
|
if q:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
questions.append(QuestionResult(section=section, question=q))
|
|
|
|
|
|
return doc_path, questions
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def parse_all_docs(md_path: Path) -> list[tuple[int, str, list[QuestionResult]]]:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
raw = md_path.read_text(encoding="utf-8")
|
2026-03-13 08:20:18 +03:00
|
|
|
|
_, blocks = split_documents(raw)
|
|
|
|
|
|
parsed: list[tuple[int, str, list[QuestionResult]]] = []
|
|
|
|
|
|
for i, block in enumerate(blocks, start=1):
|
|
|
|
|
|
path, questions = parse_document_block(i, block)
|
|
|
|
|
|
parsed.append((i, path, questions))
|
|
|
|
|
|
return parsed
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Caching / Yandex Disk
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def cache_file_name(remote_path: str) -> str:
|
|
|
|
|
|
# Deterministic local cache filename
|
|
|
|
|
|
digest = re.sub(r"[^a-z0-9]", "", str(abs(hash(remote_path))))[:12]
|
|
|
|
|
|
suffix = Path(remote_path).suffix or ".bin"
|
|
|
|
|
|
return f"{digest}{suffix}"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
|
|
|
|
|
def download_yadisk_to_cache(remote_path: str, token: str, cache_path: Path) -> str:
|
|
|
|
|
|
"""
|
|
|
|
|
|
Download file into cache path if missing.
|
|
|
|
|
|
Returns status: "cached_existing" | "downloaded" | "error:..."
|
|
|
|
|
|
"""
|
|
|
|
|
|
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
if cache_path.exists() and cache_path.stat().st_size > 0:
|
|
|
|
|
|
return "cached_existing"
|
|
|
|
|
|
if not token:
|
|
|
|
|
|
return "error:missing_yadisk_token"
|
|
|
|
|
|
|
|
|
|
|
|
headers = {"Authorization": f"OAuth {token}"}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
try:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
r = requests.get(
|
|
|
|
|
|
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
params={"path": remote_path},
|
|
|
|
|
|
timeout=YADISK_META_TIMEOUT,
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
r.raise_for_status()
|
2026-03-13 08:20:18 +03:00
|
|
|
|
href = r.json()["href"]
|
|
|
|
|
|
f = requests.get(href, timeout=YADISK_DOWNLOAD_TIMEOUT)
|
|
|
|
|
|
f.raise_for_status()
|
|
|
|
|
|
cache_path.write_bytes(f.content)
|
|
|
|
|
|
if cache_path.stat().st_size == 0:
|
|
|
|
|
|
return "error:empty_download"
|
|
|
|
|
|
return "downloaded"
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"error:{e}"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# File text extraction (for evaluator context)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def extract_text_from_file(path: Path) -> str:
|
|
|
|
|
|
ext = path.suffix.lower()
|
|
|
|
|
|
if ext in {".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"}:
|
|
|
|
|
|
return path.read_text(encoding="utf-8", errors="ignore")
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if ext in {".docx", ".doc"}:
|
|
|
|
|
|
try:
|
|
|
|
|
|
from docx import Document # type: ignore
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
doc = Document(str(path))
|
|
|
|
|
|
return "\n".join(p.text for p in doc.paragraphs)
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"[DOC parse error: {e}]"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if ext == ".pdf":
|
2026-03-11 22:30:02 +03:00
|
|
|
|
try:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
import PyPDF2 # type: ignore
|
|
|
|
|
|
|
|
|
|
|
|
out: list[str] = []
|
|
|
|
|
|
with path.open("rb") as f:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
reader = PyPDF2.PdfReader(f)
|
|
|
|
|
|
for page in reader.pages:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
out.append(page.extract_text() or "")
|
|
|
|
|
|
return "\n".join(out)
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"[PDF parse error: {e}]"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if ext in {".xlsx", ".xls"}:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
try:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
from openpyxl import load_workbook # type: ignore
|
|
|
|
|
|
|
|
|
|
|
|
wb = load_workbook(str(path), read_only=True)
|
|
|
|
|
|
out: list[str] = []
|
|
|
|
|
|
for ws in wb.worksheets:
|
|
|
|
|
|
for row in ws.iter_rows(values_only=True):
|
|
|
|
|
|
out.append("\t".join("" if c is None else str(c) for c in row))
|
|
|
|
|
|
if len(out) > 5000:
|
|
|
|
|
|
break
|
|
|
|
|
|
if len(out) > 5000:
|
|
|
|
|
|
break
|
|
|
|
|
|
return "\n".join(out)
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"[XLS parse error: {e}]"
|
|
|
|
|
|
|
|
|
|
|
|
# fallback
|
|
|
|
|
|
try:
|
|
|
|
|
|
return path.read_text(encoding="utf-8", errors="ignore")
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return f"[Binary file: {path.name}]"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
# RAG API calls (sequential)
|
|
|
|
|
|
# =============================================================================
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def call_rag(url: str, query: str, timeout: int) -> str:
|
|
|
|
|
|
payload = {"query": query}
|
|
|
|
|
|
try:
|
|
|
|
|
|
r = requests.post(url, json=payload, timeout=timeout)
|
|
|
|
|
|
r.raise_for_status()
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
text = data.get("response", "")
|
|
|
|
|
|
if text is None:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
return str(text).strip()
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"ERROR: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_langchain(query: str, timeout: int) -> str:
|
|
|
|
|
|
return call_rag(LANGCHAIN_URL, query, timeout)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
|
|
|
|
|
def call_llamaindex(query: str, timeout: int) -> str:
|
|
|
|
|
|
payload = {"query": query, "mode": "agent"}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
try:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
r = requests.post(LLAMAINDEX_URL, json=payload, timeout=timeout)
|
|
|
|
|
|
r.raise_for_status()
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
text = data.get("response", "")
|
|
|
|
|
|
if text is None:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
return str(text).strip()
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
return f"ERROR: {e}"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Evaluator
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
|
|
|
|
|
|
def _rule_score(answer: str) -> float:
|
|
|
|
|
|
if not answer or not answer.strip():
|
|
|
|
|
|
return 0.0
|
|
|
|
|
|
if answer.startswith("ERROR:"):
|
|
|
|
|
|
return -1.0
|
|
|
|
|
|
score = 0.3
|
|
|
|
|
|
if len(answer) > 120:
|
|
|
|
|
|
score += 0.2
|
|
|
|
|
|
if re.search(r"\d", answer):
|
|
|
|
|
|
score += 0.1
|
|
|
|
|
|
if re.search(r"[.!?]", answer):
|
|
|
|
|
|
score += 0.1
|
|
|
|
|
|
if re.search(r"(не найден|недостаточно|нет информации)", answer.lower()):
|
|
|
|
|
|
score += 0.05
|
|
|
|
|
|
return min(1.0, score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SECTION_CRITERIA: dict[str, str] = {
|
|
|
|
|
|
"Entity/Fact Recall (Response Relevance)": "Оцени точность извлечения сущностей/фактов и релевантность вопросу.",
|
|
|
|
|
|
"Numerical & Temporal Precision": "Оцени точность чисел, дат, периодов и временных связей.",
|
|
|
|
|
|
"Context Precision (Evidence-anchored)": "Оцени, насколько ответ опирается на релевантный контекст без лишнего.",
|
|
|
|
|
|
"Faithfulness / Non-hallucination": "Оцени отсутствие галлюцинаций и корректное поведение при отсутствии фактов.",
|
|
|
|
|
|
"Reasoning & Synthesis": "Оцени качество синтеза фактов и логичность итогового вывода.",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_pair_eval_prompt(
|
2026-03-11 22:30:02 +03:00
|
|
|
|
question: str,
|
2026-03-13 08:20:18 +03:00
|
|
|
|
section: str,
|
|
|
|
|
|
langchain_answer: str,
|
|
|
|
|
|
llamaindex_answer: str,
|
|
|
|
|
|
document_text: str,
|
2026-03-11 22:30:02 +03:00
|
|
|
|
) -> str:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
criteria = SECTION_CRITERIA.get(
|
|
|
|
|
|
section, "Оцени релевантность, точность и полезность."
|
|
|
|
|
|
)
|
|
|
|
|
|
context = document_text[:9000]
|
|
|
|
|
|
return f"""Ты судья качества RAG-ответов. Сравни два ответа на один вопрос.
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Вопрос:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
{question}
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Секция оценки:
|
|
|
|
|
|
{section}
|
|
|
|
|
|
Критерий:
|
|
|
|
|
|
{criteria}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Ответ A (LangChain):
|
|
|
|
|
|
{langchain_answer}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Ответ B (LlamaIndex):
|
|
|
|
|
|
{llamaindex_answer}
|
|
|
|
|
|
|
|
|
|
|
|
Опорный контекст документа:
|
|
|
|
|
|
{context}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Верни ТОЛЬКО JSON:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
{{
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"langchain_score": <float от -1.0 до 1.0>,
|
|
|
|
|
|
"llamaindex_score": <float от -1.0 до 1.0>,
|
|
|
|
|
|
"winner": "LangChain|LlamaIndex|Tie",
|
|
|
|
|
|
"rationale": "<кратко по сути>"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
}}
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
Правила:
|
|
|
|
|
|
- Технические ошибки/таймауты должны получать -1.0.
|
|
|
|
|
|
- Пустой ответ без ошибки = 0.0.
|
|
|
|
|
|
- Галлюцинации сильно штрафуются.
|
|
|
|
|
|
- Если разница незначительная, выбирай Tie.
|
2026-03-11 22:30:02 +03:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def evaluate_pair_with_llm(
|
2026-03-11 22:30:02 +03:00
|
|
|
|
question: str,
|
2026-03-13 08:20:18 +03:00
|
|
|
|
section: str,
|
|
|
|
|
|
langchain_answer: str,
|
|
|
|
|
|
llamaindex_answer: str,
|
|
|
|
|
|
document_text: str,
|
|
|
|
|
|
) -> tuple[float, float, str, str]:
|
|
|
|
|
|
# Deterministic short-circuit for technical failures
|
|
|
|
|
|
if langchain_answer.startswith("ERROR:") and llamaindex_answer.startswith("ERROR:"):
|
|
|
|
|
|
return -1.0, -1.0, "Tie", "Обе системы вернули техническую ошибку."
|
|
|
|
|
|
if langchain_answer.startswith("ERROR:"):
|
|
|
|
|
|
return (
|
|
|
|
|
|
-1.0,
|
|
|
|
|
|
_rule_score(llamaindex_answer),
|
|
|
|
|
|
"LlamaIndex",
|
|
|
|
|
|
"LangChain технически не ответил.",
|
|
|
|
|
|
)
|
|
|
|
|
|
if llamaindex_answer.startswith("ERROR:"):
|
|
|
|
|
|
return (
|
|
|
|
|
|
_rule_score(langchain_answer),
|
|
|
|
|
|
-1.0,
|
|
|
|
|
|
"LangChain",
|
|
|
|
|
|
"LlamaIndex технически не ответил.",
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if not OPENAI_CHAT_KEY:
|
|
|
|
|
|
# fallback heuristic
|
|
|
|
|
|
lc = _rule_score(langchain_answer)
|
|
|
|
|
|
li = _rule_score(llamaindex_answer)
|
|
|
|
|
|
if abs(lc - li) < 0.05:
|
|
|
|
|
|
return lc, li, "Tie", "Эвристическая оценка без LLM (ключ не задан)."
|
|
|
|
|
|
return (
|
|
|
|
|
|
(lc, li, "LangChain", "Эвристическая оценка без LLM.")
|
|
|
|
|
|
if lc > li
|
|
|
|
|
|
else (
|
|
|
|
|
|
lc,
|
|
|
|
|
|
li,
|
|
|
|
|
|
"LlamaIndex",
|
|
|
|
|
|
"Эвристическая оценка без LLM.",
|
|
|
|
|
|
)
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
prompt = build_pair_eval_prompt(
|
|
|
|
|
|
question=question,
|
|
|
|
|
|
section=section,
|
|
|
|
|
|
langchain_answer=langchain_answer,
|
|
|
|
|
|
llamaindex_answer=llamaindex_answer,
|
|
|
|
|
|
document_text=document_text,
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
headers = {
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"Authorization": f"Bearer {OPENAI_CHAT_KEY}",
|
|
|
|
|
|
"Content-Type": "application/json",
|
2026-03-11 22:30:02 +03:00
|
|
|
|
}
|
|
|
|
|
|
payload = {
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"model": OPENAI_CHAT_MODEL,
|
2026-03-11 22:30:02 +03:00
|
|
|
|
"messages": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"role": "system",
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"content": "Ты строгий судья качества RAG. Отвечай только JSON.",
|
2026-03-11 22:30:02 +03:00
|
|
|
|
},
|
2026-03-13 08:20:18 +03:00
|
|
|
|
{"role": "user", "content": prompt},
|
2026-03-11 22:30:02 +03:00
|
|
|
|
],
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"temperature": 0.0,
|
|
|
|
|
|
"max_tokens": 400,
|
2026-03-11 22:30:02 +03:00
|
|
|
|
}
|
|
|
|
|
|
try:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
r = requests.post(
|
|
|
|
|
|
f"{OPENAI_CHAT_URL.rstrip('/')}/chat/completions",
|
2026-03-11 22:30:02 +03:00
|
|
|
|
headers=headers,
|
|
|
|
|
|
json=payload,
|
2026-03-13 08:20:18 +03:00
|
|
|
|
timeout=EVAL_TIMEOUT,
|
|
|
|
|
|
)
|
|
|
|
|
|
r.raise_for_status()
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
|
|
|
|
|
|
m = re.search(r"\{.*\}", content, re.DOTALL)
|
|
|
|
|
|
raw = m.group(0) if m else content
|
|
|
|
|
|
parsed = json.loads(raw)
|
|
|
|
|
|
lc = float(parsed.get("langchain_score", 0.0))
|
|
|
|
|
|
li = float(parsed.get("llamaindex_score", 0.0))
|
|
|
|
|
|
winner = str(parsed.get("winner", "Tie"))
|
|
|
|
|
|
rationale = str(parsed.get("rationale", ""))
|
|
|
|
|
|
if winner not in {"LangChain", "LlamaIndex", "Tie"}:
|
|
|
|
|
|
winner = "Tie"
|
|
|
|
|
|
return lc, li, winner, rationale
|
|
|
|
|
|
except Exception as e: # noqa: BLE001
|
|
|
|
|
|
lc = _rule_score(langchain_answer)
|
|
|
|
|
|
li = _rule_score(llamaindex_answer)
|
|
|
|
|
|
if abs(lc - li) < 0.05:
|
|
|
|
|
|
return lc, li, "Tie", f"Fallback heuristic; LLM eval error: {e}"
|
|
|
|
|
|
return (
|
|
|
|
|
|
(lc, li, "LangChain", f"Fallback heuristic; LLM eval error: {e}")
|
|
|
|
|
|
if lc > li
|
|
|
|
|
|
else (
|
|
|
|
|
|
lc,
|
|
|
|
|
|
li,
|
|
|
|
|
|
"LlamaIndex",
|
|
|
|
|
|
f"Fallback heuristic; LLM eval error: {e}",
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# JSON storage
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def now_iso() -> str:
|
|
|
|
|
|
return dt.datetime.now(dt.timezone.utc).isoformat()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def default_json_payload(
|
|
|
|
|
|
all_docs: list[tuple[int, str, list[QuestionResult]]],
|
|
|
|
|
|
) -> dict[str, Any]:
|
|
|
|
|
|
return {
|
|
|
|
|
|
"meta": {
|
|
|
|
|
|
"created_at": now_iso(),
|
|
|
|
|
|
"updated_at": now_iso(),
|
|
|
|
|
|
"input_file": str(INPUT_MD),
|
|
|
|
|
|
"langchain_url": LANGCHAIN_URL,
|
|
|
|
|
|
"llamaindex_url": LLAMAINDEX_URL,
|
|
|
|
|
|
"evaluator_model": OPENAI_CHAT_MODEL,
|
|
|
|
|
|
"notes": [
|
|
|
|
|
|
"step = one file (all file questions)",
|
|
|
|
|
|
"sequential API calls only",
|
|
|
|
|
|
"cache dir: ./tmp/rag-evaluation",
|
|
|
|
|
|
],
|
|
|
|
|
|
},
|
|
|
|
|
|
"documents": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"index": idx,
|
|
|
|
|
|
"path": path,
|
|
|
|
|
|
"cache_file": "",
|
|
|
|
|
|
"cache_status": "not_processed",
|
|
|
|
|
|
"started_at": "",
|
|
|
|
|
|
"finished_at": "",
|
|
|
|
|
|
"questions": [asdict(q) for q in questions],
|
|
|
|
|
|
}
|
|
|
|
|
|
for idx, path, questions in all_docs
|
|
|
|
|
|
],
|
|
|
|
|
|
"batches": [],
|
|
|
|
|
|
}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def load_or_init_json(
|
|
|
|
|
|
all_docs: list[tuple[int, str, list[QuestionResult]]],
|
|
|
|
|
|
output_json: Path,
|
|
|
|
|
|
mode: str,
|
|
|
|
|
|
) -> dict[str, Any]:
|
|
|
|
|
|
if mode == "overwrite" or not output_json.exists():
|
|
|
|
|
|
return default_json_payload(all_docs)
|
|
|
|
|
|
try:
|
|
|
|
|
|
data = json.loads(output_json.read_text(encoding="utf-8"))
|
|
|
|
|
|
if "documents" not in data:
|
|
|
|
|
|
return default_json_payload(all_docs)
|
|
|
|
|
|
return data
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return default_json_payload(all_docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def upsert_document_result(store: dict[str, Any], result: DocumentEvaluation) -> None:
|
|
|
|
|
|
docs = store.setdefault("documents", [])
|
|
|
|
|
|
for i, doc in enumerate(docs):
|
|
|
|
|
|
if doc.get("path") == result.path:
|
|
|
|
|
|
docs[i] = {
|
|
|
|
|
|
"index": result.index,
|
|
|
|
|
|
"path": result.path,
|
|
|
|
|
|
"cache_file": result.cache_file,
|
|
|
|
|
|
"cache_status": result.cache_status,
|
|
|
|
|
|
"started_at": result.started_at,
|
|
|
|
|
|
"finished_at": result.finished_at,
|
|
|
|
|
|
"questions": [asdict(q) for q in result.questions],
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
docs.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"index": result.index,
|
|
|
|
|
|
"path": result.path,
|
|
|
|
|
|
"cache_file": result.cache_file,
|
|
|
|
|
|
"cache_status": result.cache_status,
|
|
|
|
|
|
"started_at": result.started_at,
|
|
|
|
|
|
"finished_at": result.finished_at,
|
|
|
|
|
|
"questions": [asdict(q) for q in result.questions],
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_batch_stats(store: dict[str, Any], batch_meta: dict[str, Any]) -> None:
|
|
|
|
|
|
store.setdefault("batches", []).append(batch_meta)
|
|
|
|
|
|
store.setdefault("meta", {})["updated_at"] = now_iso()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:28:21 +03:00
|
|
|
|
def atomic_write_json(path: Path, payload: dict[str, Any]) -> None:
|
|
|
|
|
|
"""Atomically write JSON to avoid partial/corrupted files on interruption."""
|
|
|
|
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
tmp_path = path.with_suffix(path.suffix + ".tmp")
|
|
|
|
|
|
tmp_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
|
|
|
|
tmp_path.replace(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def compute_batch_summary(results: list[DocumentEvaluation]) -> dict[str, Any]:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
|
|
|
|
|
|
scores_lc: list[float] = []
|
|
|
|
|
|
scores_li: list[float] = []
|
2026-03-13 08:20:18 +03:00
|
|
|
|
q_total = 0
|
|
|
|
|
|
for d in results:
|
|
|
|
|
|
for q in d.questions:
|
|
|
|
|
|
q_total += 1
|
|
|
|
|
|
wins[q.winner] = wins.get(q.winner, 0) + 1
|
|
|
|
|
|
scores_lc.append(q.langchain_score)
|
|
|
|
|
|
scores_li.append(q.llamaindex_score)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
avg_lc = sum(scores_lc) / max(1, len(scores_lc))
|
|
|
|
|
|
avg_li = sum(scores_li) / max(1, len(scores_li))
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if avg_lc > avg_li + 0.01:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
ranking = "LangChain"
|
2026-03-13 08:20:18 +03:00
|
|
|
|
elif avg_li > avg_lc + 0.01:
|
2026-03-11 22:30:02 +03:00
|
|
|
|
ranking = "LlamaIndex"
|
|
|
|
|
|
else:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
ranking = "Tie"
|
|
|
|
|
|
return {
|
|
|
|
|
|
"documents_processed": len(results),
|
|
|
|
|
|
"questions_processed": q_total,
|
|
|
|
|
|
"wins": wins,
|
|
|
|
|
|
"avg_langchain": round(avg_lc, 4),
|
|
|
|
|
|
"avg_llamaindex": round(avg_li, 4),
|
|
|
|
|
|
"ranking": ranking,
|
|
|
|
|
|
}
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Main flow
|
2026-03-11 22:30:02 +03:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def run_evaluation(doc_from: int, doc_to: int, mode: str) -> None:
|
|
|
|
|
|
all_docs = parse_all_docs(INPUT_MD)
|
|
|
|
|
|
total_docs = len(all_docs)
|
|
|
|
|
|
doc_from = max(1, doc_from)
|
|
|
|
|
|
doc_to = min(total_docs, doc_to)
|
|
|
|
|
|
if doc_from > doc_to:
|
|
|
|
|
|
raise ValueError(f"Invalid doc range: {doc_from}:{doc_to}")
|
|
|
|
|
|
|
|
|
|
|
|
store = load_or_init_json(all_docs, OUTPUT_JSON, mode)
|
|
|
|
|
|
|
|
|
|
|
|
TMP_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
selected = [d for d in all_docs if doc_from <= d[0] <= doc_to]
|
|
|
|
|
|
print(
|
|
|
|
|
|
f"Total docs: {total_docs}. Processing docs {doc_from}:{doc_to} ({len(selected)} steps)."
|
|
|
|
|
|
)
|
|
|
|
|
|
print(f"Cache dir: {TMP_DIR}")
|
|
|
|
|
|
print(f"Output JSON: {OUTPUT_JSON}")
|
|
|
|
|
|
|
|
|
|
|
|
batch_results: list[DocumentEvaluation] = []
|
|
|
|
|
|
batch_started = now_iso()
|
|
|
|
|
|
|
|
|
|
|
|
for step, (idx, doc_path, questions) in enumerate(selected, start=1):
|
|
|
|
|
|
print(f"\n[STEP {step}/{len(selected)}] File #{idx}: {doc_path}")
|
|
|
|
|
|
started = now_iso()
|
|
|
|
|
|
cache_name = cache_file_name(doc_path)
|
|
|
|
|
|
cache_path = TMP_DIR / cache_name
|
|
|
|
|
|
cache_status = download_yadisk_to_cache(doc_path, YADISK_TOKEN, cache_path)
|
|
|
|
|
|
print(f" -> cache: {cache_status} ({cache_path})")
|
|
|
|
|
|
|
|
|
|
|
|
doc_text = ""
|
|
|
|
|
|
if cache_status.startswith("error:"):
|
|
|
|
|
|
doc_text = f"[CACHE_ERROR] {cache_status}"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
else:
|
2026-03-13 08:20:18 +03:00
|
|
|
|
doc_text = extract_text_from_file(cache_path)
|
|
|
|
|
|
print(f" -> extracted text length: {len(doc_text)}")
|
|
|
|
|
|
|
|
|
|
|
|
evaluated_questions: list[QuestionResult] = []
|
|
|
|
|
|
for qn, q in enumerate(questions, start=1):
|
|
|
|
|
|
qr = QuestionResult(section=q.section, question=q.question)
|
|
|
|
|
|
print(f" [{qn}/{len(questions)}] {q.question[:90]}")
|
|
|
|
|
|
|
|
|
|
|
|
t0 = time.time()
|
|
|
|
|
|
qr.langchain_answer = call_langchain(q.question, timeout=RAG_TIMEOUT)
|
|
|
|
|
|
print(f" LangChain: {time.time() - t0:.1f}s")
|
|
|
|
|
|
|
|
|
|
|
|
t0 = time.time()
|
|
|
|
|
|
qr.llamaindex_answer = call_llamaindex(q.question, timeout=RAG_TIMEOUT)
|
|
|
|
|
|
print(f" LlamaIndex: {time.time() - t0:.1f}s")
|
|
|
|
|
|
|
|
|
|
|
|
lc, li, winner, rationale = evaluate_pair_with_llm(
|
|
|
|
|
|
question=q.question,
|
|
|
|
|
|
section=q.section,
|
|
|
|
|
|
langchain_answer=qr.langchain_answer,
|
|
|
|
|
|
llamaindex_answer=qr.llamaindex_answer,
|
|
|
|
|
|
document_text=doc_text,
|
|
|
|
|
|
)
|
|
|
|
|
|
qr.langchain_score = lc
|
|
|
|
|
|
qr.llamaindex_score = li
|
|
|
|
|
|
qr.winner = winner
|
|
|
|
|
|
qr.rationale = rationale
|
|
|
|
|
|
qr.evaluator_model = OPENAI_CHAT_MODEL
|
|
|
|
|
|
qr.evaluated_at = now_iso()
|
|
|
|
|
|
evaluated_questions.append(qr)
|
|
|
|
|
|
|
|
|
|
|
|
doc_result = DocumentEvaluation(
|
|
|
|
|
|
index=idx,
|
|
|
|
|
|
path=doc_path,
|
|
|
|
|
|
cache_file=str(cache_path),
|
|
|
|
|
|
cache_status=cache_status,
|
|
|
|
|
|
questions=evaluated_questions,
|
|
|
|
|
|
started_at=started,
|
|
|
|
|
|
finished_at=now_iso(),
|
|
|
|
|
|
)
|
|
|
|
|
|
upsert_document_result(store, doc_result)
|
|
|
|
|
|
batch_results.append(doc_result)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
# Save incremental progress after each file/step
|
2026-03-13 08:28:21 +03:00
|
|
|
|
atomic_write_json(OUTPUT_JSON, store)
|
2026-03-13 08:20:18 +03:00
|
|
|
|
print(" -> step saved")
|
|
|
|
|
|
|
|
|
|
|
|
summary = compute_batch_summary(batch_results)
|
|
|
|
|
|
batch_meta = {
|
|
|
|
|
|
"started_at": batch_started,
|
|
|
|
|
|
"finished_at": now_iso(),
|
|
|
|
|
|
"range": f"{doc_from}:{doc_to}",
|
|
|
|
|
|
"summary": summary,
|
|
|
|
|
|
"mode": mode,
|
|
|
|
|
|
}
|
|
|
|
|
|
update_batch_stats(store, batch_meta)
|
2026-03-13 08:28:21 +03:00
|
|
|
|
atomic_write_json(OUTPUT_JSON, store)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
print("\nBatch complete.")
|
|
|
|
|
|
print(json.dumps(summary, ensure_ascii=False, indent=2))
|
|
|
|
|
|
print(f"Saved to: {OUTPUT_JSON}")
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
def parse_range(value: str) -> tuple[int, int]:
|
|
|
|
|
|
m = re.fullmatch(r"(\d+):(\d+)", value.strip())
|
|
|
|
|
|
if not m:
|
|
|
|
|
|
raise argparse.ArgumentTypeError(
|
|
|
|
|
|
"Range must be in format from:to (example: 1:10)"
|
|
|
|
|
|
)
|
|
|
|
|
|
a, b = int(m.group(1)), int(m.group(2))
|
|
|
|
|
|
if a <= 0 or b <= 0:
|
|
|
|
|
|
raise argparse.ArgumentTypeError("Range values must be positive")
|
|
|
|
|
|
return a, b
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
2026-03-13 08:20:18 +03:00
|
|
|
|
description="RAG evaluation in file-batch mode (JSON output)"
|
2026-03-11 22:30:02 +03:00
|
|
|
|
)
|
|
|
|
|
|
parser.add_argument(
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"doc_range",
|
|
|
|
|
|
type=parse_range,
|
|
|
|
|
|
help="Document range in format from:to (step = one file). Example: 1:10",
|
2026-03-11 22:30:02 +03:00
|
|
|
|
)
|
|
|
|
|
|
parser.add_argument(
|
2026-03-13 08:20:18 +03:00
|
|
|
|
"--mode",
|
|
|
|
|
|
choices=["append", "overwrite"],
|
|
|
|
|
|
default="append",
|
|
|
|
|
|
help="append: upsert evaluated docs into existing JSON; overwrite: rebuild JSON from input docs",
|
2026-03-11 22:30:02 +03:00
|
|
|
|
)
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
doc_from, doc_to = args.doc_range
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
if "MiniMax" in OPENAI_CHAT_MODEL or "MiniMax" in OPENAI_CHAT_URL:
|
|
|
|
|
|
print(
|
|
|
|
|
|
"NOTE: evaluator model is MiniMax. It works, but for stricter judging quality, "
|
|
|
|
|
|
"gpt-4.1-mini/gpt-4.1 (if available on your endpoint) is usually stronger."
|
|
|
|
|
|
)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
|
2026-03-13 08:20:18 +03:00
|
|
|
|
run_evaluation(doc_from=doc_from, doc_to=doc_to, mode=args.mode)
|
2026-03-11 22:30:02 +03:00
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
|
raise SystemExit(main())
|