Files
rag-solution/rag_evaluation.py

733 lines
24 KiB
Python
Raw Normal View History

2026-03-11 22:30:02 +03:00
#!/usr/bin/env python3
"""
RAG evaluation script (file-batch mode).
Key behavior:
- Step = one document file (all its questions), not one question.
- Pre-download/caching in ./tmp/rag-evaluation (skip if already downloaded).
- Sequential API calls only (LangChain then LlamaIndex).
- Pairwise answer evaluation (both systems in one judge prompt).
- JSON output with append/overwrite support for batch runs and re-runs.
"""
from __future__ import annotations
import argparse
import datetime as dt
import json
import os
import re
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as e:  # pragma: no cover
    # Fail fast with a actionable hint: this script is expected to run under a
    # project virtualenv that already has `requests` installed.
    raise SystemExit(
        "Missing dependency: requests. Run with your project venv "
        "(for example services/rag/langchain/venv/bin/python rag_evaluation.py ...)"
    ) from e

from dotenv import load_dotenv

# Load .env before the configuration constants below read the environment.
load_dotenv()
# =============================================================================
# Configuration
# =============================================================================
# RAG service endpoints under test.
LANGCHAIN_URL = os.getenv("LANGCHAIN_URL", "http://localhost:8331/api/test-query")
LLAMAINDEX_URL = os.getenv("LLAMAINDEX_URL", "http://localhost:8334/api/test-query")

# OpenAI-compatible evaluator endpoint. You can point this at OpenAI-compatible providers.
OPENAI_CHAT_URL = os.getenv(
    "OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1"
)
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")

# Yandex Disk OAuth token used to download the documents under test.
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")

# Input/output locations, all relative to this script's directory.
BASE_DIR = Path(__file__).resolve().parent
INPUT_MD = BASE_DIR / "DOCUMENTS_TO_TEST.md"
OUTPUT_JSON = BASE_DIR / "EVALUATION_RESULT.json"
TMP_DIR = BASE_DIR / "tmp" / "rag-evaluation"

# Timeouts in seconds, each overridable via environment variables.
RAG_TIMEOUT = int(os.getenv("RAG_TIMEOUT", "120"))
EVAL_TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "90"))
YADISK_META_TIMEOUT = int(os.getenv("YADISK_META_TIMEOUT", "30"))
YADISK_DOWNLOAD_TIMEOUT = int(os.getenv("YADISK_DOWNLOAD_TIMEOUT", "180"))
2026-03-11 22:30:02 +03:00
# =============================================================================
# Data structures
# =============================================================================

@dataclass
class QuestionResult:
    """One question's answers from both RAG systems plus the judge verdict."""

    section: str  # "### " section title the question was listed under
    question: str
    langchain_answer: str = ""
    llamaindex_answer: str = ""
    langchain_score: float = 0.0  # judge score, -1.0 .. 1.0
    llamaindex_score: float = 0.0  # judge score, -1.0 .. 1.0
    winner: str = "Tie"  # "LangChain" | "LlamaIndex" | "Tie"
    rationale: str = ""
    evaluator_model: str = ""
    evaluated_at: str = ""  # ISO-8601 UTC timestamp (set when judged)
2026-03-11 22:30:02 +03:00
@dataclass
class DocumentEvaluation:
    """Evaluation results for one document file (one batch step)."""

    index: int  # 1-based position in DOCUMENTS_TO_TEST.md
    path: str  # remote (Yandex Disk) path parsed from the "## " heading
    cache_file: str = ""  # local cache path used for judge context
    cache_status: str = ""  # "cached_existing" | "downloaded" | "error:..."
    questions: list[QuestionResult] = field(default_factory=list)
    started_at: str = ""
    finished_at: str = ""
2026-03-11 22:30:02 +03:00
# =============================================================================
# Markdown parsing
2026-03-11 22:30:02 +03:00
# =============================================================================
2026-03-11 22:30:02 +03:00
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split markdown into the preamble lines and per-document blocks.

    Every line starting with ``## `` opens a new document block; lines seen
    before the first such heading form the preamble (returned line by line,
    while blocks are returned joined back with newlines).
    """
    preamble: list[str] = []
    blocks: list[list[str]] = []
    for raw_line in md_text.splitlines():
        if raw_line.startswith("## "):
            blocks.append([raw_line])
        elif blocks:
            blocks[-1].append(raw_line)
        else:
            preamble.append(raw_line)
    return preamble, ["\n".join(block) for block in blocks]
def parse_document_block(idx: int, block: str) -> tuple[str, list[QuestionResult]]:
    """Parse one ``## ...`` document block into a path and its questions.

    The document path is the first backtick-quoted span on the heading line
    (empty string if absent). Each ``- `` bullet becomes a QuestionResult
    tagged with the most recent ``### `` section title. *idx* is accepted
    for signature compatibility but not used here.
    """
    block_lines = block.splitlines()
    path_match = re.search(r"`([^`]+)`", block_lines[0].strip())
    doc_path = path_match.group(1) if path_match else ""

    current_section = ""
    parsed: list[QuestionResult] = []
    for body_line in block_lines[1:]:
        if body_line.startswith("### "):
            current_section = body_line[4:].strip()
        elif body_line.startswith("- "):
            question_text = body_line[2:].strip()
            if question_text:
                parsed.append(
                    QuestionResult(section=current_section, question=question_text)
                )
    return doc_path, parsed
2026-03-11 22:30:02 +03:00
def parse_all_docs(md_path: Path) -> list[tuple[int, str, list[QuestionResult]]]:
    """Read the input markdown and parse every document block.

    Returns one ``(1-based index, document path, questions)`` tuple per
    ``## `` block, in file order. The preamble before the first heading is
    discarded.
    """
    _, blocks = split_documents(md_path.read_text(encoding="utf-8"))
    return [
        (position, *parse_document_block(position, block))
        for position, block in enumerate(blocks, start=1)
    ]
2026-03-11 22:30:02 +03:00
# =============================================================================
# Caching / Yandex Disk
2026-03-11 22:30:02 +03:00
# =============================================================================
def cache_file_name(remote_path: str) -> str:
    """Return a deterministic local cache filename for *remote_path*.

    Bug fix: the previous implementation derived the name from built-in
    ``hash()``, whose output for strings is salted per interpreter process
    (PYTHONHASHSEED), so every run produced different filenames and the
    "skip if already downloaded" cache never hit across runs. A content
    digest (MD5 of the path, used for naming only, not security) is stable
    across processes and machines.
    """
    import hashlib  # local import keeps this fix self-contained

    digest = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:12]
    suffix = Path(remote_path).suffix or ".bin"
    return f"{digest}{suffix}"
2026-03-11 22:30:02 +03:00
def download_yadisk_to_cache(remote_path: str, token: str, cache_path: Path) -> str:
    """Fetch *remote_path* from Yandex Disk into *cache_path* unless cached.

    Returns a status string: ``"cached_existing"`` when a non-empty local
    copy already exists, ``"downloaded"`` on success, or ``"error:..."``
    for any failure (missing token, HTTP error, empty payload, ...).
    """
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    # A non-empty cached copy short-circuits the network entirely.
    if cache_path.exists() and cache_path.stat().st_size > 0:
        return "cached_existing"
    if not token:
        return "error:missing_yadisk_token"

    try:
        meta = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources/download",
            headers={"Authorization": f"OAuth {token}"},
            params={"path": remote_path},
            timeout=YADISK_META_TIMEOUT,
        )
        meta.raise_for_status()
        # The metadata call returns a one-time direct download href.
        payload = requests.get(meta.json()["href"], timeout=YADISK_DOWNLOAD_TIMEOUT)
        payload.raise_for_status()
        cache_path.write_bytes(payload.content)
        if cache_path.stat().st_size == 0:
            return "error:empty_download"
        return "downloaded"
    except Exception as e:  # noqa: BLE001 -- every failure maps to a status string
        return f"error:{e}"
2026-03-11 22:30:02 +03:00
# =============================================================================
# File text extraction (for evaluator context)
2026-03-11 22:30:02 +03:00
# =============================================================================
def extract_text_from_file(path: Path) -> str:
    """Best-effort plain-text extraction used as judge context.

    Plain-text formats are read directly; DOCX/PDF/XLSX go through their
    optional parsers (python-docx, PyPDF2, openpyxl). Parser failures are
    reported inline as ``[... parse error: ...]`` strings instead of
    raising, so one bad file never aborts an evaluation batch.
    """
    ext = path.suffix.lower()

    if ext in {".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"}:
        return path.read_text(encoding="utf-8", errors="ignore")

    if ext in {".docx", ".doc"}:
        try:
            from docx import Document  # type: ignore

            return "\n".join(p.text for p in Document(str(path)).paragraphs)
        except Exception as e:  # noqa: BLE001
            return f"[DOC parse error: {e}]"

    if ext == ".pdf":
        try:
            import PyPDF2  # type: ignore

            with path.open("rb") as fh:
                reader = PyPDF2.PdfReader(fh)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:  # noqa: BLE001
            return f"[PDF parse error: {e}]"

    if ext in {".xlsx", ".xls"}:
        try:
            from openpyxl import load_workbook  # type: ignore

            rows: list[str] = []
            workbook = load_workbook(str(path), read_only=True)
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    rows.append(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                    )
                    # Cap the extraction: huge sheets would blow up the prompt.
                    if len(rows) > 5000:
                        break
                if len(rows) > 5000:
                    break
            return "\n".join(rows)
        except Exception as e:  # noqa: BLE001
            return f"[XLS parse error: {e}]"

    # Unknown extension: try reading as text, else return a placeholder.
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return f"[Binary file: {path.name}]"
2026-03-11 22:30:02 +03:00
# =============================================================================
# RAG API calls (sequential)
# =============================================================================
2026-03-11 22:30:02 +03:00
def call_rag(url: str, query: str, timeout: int) -> str:
    """POST *query* to a RAG endpoint and return its trimmed text response.

    Any transport/HTTP/JSON failure is folded into an ``"ERROR: ..."``
    string so the caller (and the judge) can score it instead of crashing.
    """
    try:
        resp = requests.post(url, json={"query": query}, timeout=timeout)
        resp.raise_for_status()
        answer = resp.json().get("response", "")
        return "" if answer is None else str(answer).strip()
    except Exception as e:  # noqa: BLE001
        return f"ERROR: {e}"
def call_langchain(query: str, timeout: int) -> str:
    """Query the LangChain RAG service (thin wrapper over call_rag)."""
    return call_rag(LANGCHAIN_URL, query, timeout)
2026-03-11 22:30:02 +03:00
def call_llamaindex(query: str, timeout: int) -> str:
    """Query the LlamaIndex RAG service in agent mode.

    Mirrors call_rag() but additionally sends ``"mode": "agent"`` in the
    payload; any failure is folded into an ``"ERROR: ..."`` string.
    """
    try:
        resp = requests.post(
            LLAMAINDEX_URL,
            json={"query": query, "mode": "agent"},
            timeout=timeout,
        )
        resp.raise_for_status()
        answer = resp.json().get("response", "")
        return "" if answer is None else str(answer).strip()
    except Exception as e:  # noqa: BLE001
        return f"ERROR: {e}"
2026-03-11 22:30:02 +03:00
# =============================================================================
# Evaluator
2026-03-11 22:30:02 +03:00
# =============================================================================
def _rule_score(answer: str) -> float:
    """Cheap heuristic score in [-1.0, 1.0] used when no LLM judge is available.

    Technical failures ("ERROR:" prefix) score -1.0 and empty answers 0.0;
    otherwise a base of 0.3 is topped up for length, digits, sentence
    punctuation and an honest "information not found" admission (Russian
    phrases), capped at 1.0.
    """
    if not answer or not answer.strip():
        return 0.0
    if answer.startswith("ERROR:"):
        return -1.0

    score = 0.3
    bonuses = (
        (len(answer) > 120, 0.2),
        (bool(re.search(r"\d", answer)), 0.1),
        (bool(re.search(r"[.!?]", answer)), 0.1),
        (
            bool(
                re.search(r"(не найден|недостаточно|нет информации)", answer.lower())
            ),
            0.05,
        ),
    )
    for hit, bonus in bonuses:
        if hit:
            score += bonus
    return min(1.0, score)
# Judge criterion per evaluation section (Russian, inserted verbatim into the
# judge prompt). Keys correspond to the "### " section titles parsed from the
# input markdown; unknown sections fall back to the generic criterion in
# build_pair_eval_prompt().
SECTION_CRITERIA: dict[str, str] = {
    "Entity/Fact Recall (Response Relevance)": "Оцени точность извлечения сущностей/фактов и релевантность вопросу.",
    "Numerical & Temporal Precision": "Оцени точность чисел, дат, периодов и временных связей.",
    "Context Precision (Evidence-anchored)": "Оцени, насколько ответ опирается на релевантный контекст без лишнего.",
    "Faithfulness / Non-hallucination": "Оцени отсутствие галлюцинаций и корректное поведение при отсутствии фактов.",
    "Reasoning & Synthesis": "Оцени качество синтеза фактов и логичность итогового вывода.",
}
def build_pair_eval_prompt(
    question: str,
    section: str,
    langchain_answer: str,
    llamaindex_answer: str,
    document_text: str,
) -> str:
    """Build the pairwise judge prompt (Russian) for one question.

    The section-specific criterion is looked up in SECTION_CRITERIA (with a
    generic fallback), and the document context is truncated to 9000 chars
    to keep the prompt within the evaluator's context budget.
    """
    criteria = SECTION_CRITERIA.get(
        section, "Оцени релевантность, точность и полезность."
    )
    context = document_text[:9000]
    return f"""Ты судья качества RAG-ответов. Сравни два ответа на один вопрос.
Вопрос:
{question}
Секция оценки:
{section}
Критерий:
{criteria}
Ответ A (LangChain):
{langchain_answer}
Ответ B (LlamaIndex):
{llamaindex_answer}
Опорный контекст документа:
{context}
Верни ТОЛЬКО JSON:
{{
"langchain_score": <float от -1.0 до 1.0>,
"llamaindex_score": <float от -1.0 до 1.0>,
"winner": "LangChain|LlamaIndex|Tie",
"rationale": "<кратко по сути>"
}}
Правила:
- Технические ошибки/таймауты должны получать -1.0.
- Пустой ответ без ошибки = 0.0.
- Галлюцинации сильно штрафуются.
- Если разница незначительная, выбирай Tie.
"""
def _heuristic_pair(
    langchain_answer: str,
    llamaindex_answer: str,
    tie_note: str,
    win_note: str,
) -> tuple[float, float, str, str]:
    """Score both answers with _rule_score and pick a winner.

    Score differences below 0.05 count as a Tie. *tie_note* / *win_note*
    become the rationale for the respective outcome (the no-key path uses a
    distinct Tie message, hence two parameters).
    """
    lc = _rule_score(langchain_answer)
    li = _rule_score(llamaindex_answer)
    if abs(lc - li) < 0.05:
        return lc, li, "Tie", tie_note
    winner = "LangChain" if lc > li else "LlamaIndex"
    return lc, li, winner, win_note


def evaluate_pair_with_llm(
    question: str,
    section: str,
    langchain_answer: str,
    llamaindex_answer: str,
    document_text: str,
) -> tuple[float, float, str, str]:
    """Judge one LangChain/LlamaIndex answer pair.

    Returns ``(langchain_score, llamaindex_score, winner, rationale)``.
    Order of precedence:
      1. deterministic handling of technical failures ("ERROR:" answers);
      2. rule-based heuristic when no evaluator API key is configured;
      3. LLM judge via the OpenAI-compatible endpoint, falling back to the
         heuristic when the call or the JSON parsing fails.

    Refactor note: the duplicated heuristic-fallback logic (previously
    copy-pasted in both the no-key path and the exception path) now lives
    in _heuristic_pair.
    """
    # 1) Deterministic short-circuit for technical failures.
    lc_failed = langchain_answer.startswith("ERROR:")
    li_failed = llamaindex_answer.startswith("ERROR:")
    if lc_failed and li_failed:
        return -1.0, -1.0, "Tie", "Обе системы вернули техническую ошибку."
    if lc_failed:
        return (
            -1.0,
            _rule_score(llamaindex_answer),
            "LlamaIndex",
            "LangChain технически не ответил.",
        )
    if li_failed:
        return (
            _rule_score(langchain_answer),
            -1.0,
            "LangChain",
            "LlamaIndex технически не ответил.",
        )

    # 2) No evaluator key configured: heuristic only.
    if not OPENAI_CHAT_KEY:
        return _heuristic_pair(
            langchain_answer,
            llamaindex_answer,
            "Эвристическая оценка без LLM (ключ не задан).",
            "Эвристическая оценка без LLM.",
        )

    # 3) LLM judge through the OpenAI-compatible chat completions API.
    prompt = build_pair_eval_prompt(
        question=question,
        section=section,
        langchain_answer=langchain_answer,
        llamaindex_answer=llamaindex_answer,
        document_text=document_text,
    )
    headers = {
        "Authorization": f"Bearer {OPENAI_CHAT_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": OPENAI_CHAT_MODEL,
        "messages": [
            {
                "role": "system",
                "content": "Ты строгий судья качества RAG. Отвечай только JSON.",
            },
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.0,
        "max_tokens": 400,
    }
    try:
        r = requests.post(
            f"{OPENAI_CHAT_URL.rstrip('/')}/chat/completions",
            headers=headers,
            json=payload,
            timeout=EVAL_TIMEOUT,
        )
        r.raise_for_status()
        data = r.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        # The judge is asked for bare JSON, but models often wrap it in prose;
        # grab the outermost {...} span before parsing.
        m = re.search(r"\{.*\}", content, re.DOTALL)
        parsed = json.loads(m.group(0) if m else content)
        lc = float(parsed.get("langchain_score", 0.0))
        li = float(parsed.get("llamaindex_score", 0.0))
        winner = str(parsed.get("winner", "Tie"))
        rationale = str(parsed.get("rationale", ""))
        if winner not in {"LangChain", "LlamaIndex", "Tie"}:
            winner = "Tie"
        return lc, li, winner, rationale
    except Exception as e:  # noqa: BLE001 -- degrade to heuristic, keep the batch going
        note = f"Fallback heuristic; LLM eval error: {e}"
        return _heuristic_pair(langchain_answer, llamaindex_answer, note, note)
# =============================================================================
# JSON storage
2026-03-11 22:30:02 +03:00
# =============================================================================
def now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    return dt.datetime.now(tz=dt.timezone.utc).isoformat()
def default_json_payload(
    all_docs: list[tuple[int, str, list[QuestionResult]]],
) -> dict[str, Any]:
    """Create a brand-new result-store skeleton for every parsed document.

    Documents start in the ``not_processed`` state with their questions
    serialized but unanswered; ``batches`` accumulates one summary entry
    per evaluation run.
    """
    meta = {
        "created_at": now_iso(),
        "updated_at": now_iso(),
        "input_file": str(INPUT_MD),
        "langchain_url": LANGCHAIN_URL,
        "llamaindex_url": LLAMAINDEX_URL,
        "evaluator_model": OPENAI_CHAT_MODEL,
        "notes": [
            "step = one file (all file questions)",
            "sequential API calls only",
            "cache dir: ./tmp/rag-evaluation",
        ],
    }
    documents = [
        {
            "index": idx,
            "path": path,
            "cache_file": "",
            "cache_status": "not_processed",
            "started_at": "",
            "finished_at": "",
            "questions": [asdict(q) for q in questions],
        }
        for idx, path, questions in all_docs
    ]
    return {"meta": meta, "documents": documents, "batches": []}
2026-03-11 22:30:02 +03:00
def load_or_init_json(
    all_docs: list[tuple[int, str, list[QuestionResult]]],
    output_json: Path,
    mode: str,
) -> dict[str, Any]:
    """Load the existing result JSON, or build a fresh skeleton.

    A fresh skeleton is returned when mode == "overwrite", when the file is
    missing, and also when it is unreadable/corrupt or lacks a "documents"
    key -- a broken output file must never abort a run.
    """
    if mode != "overwrite" and output_json.exists():
        try:
            data = json.loads(output_json.read_text(encoding="utf-8"))
        except Exception:
            pass
        else:
            if "documents" in data:
                return data
    return default_json_payload(all_docs)
def _doc_result_as_dict(result: DocumentEvaluation) -> dict[str, Any]:
    """Serialize one DocumentEvaluation for the JSON store."""
    return {
        "index": result.index,
        "path": result.path,
        "cache_file": result.cache_file,
        "cache_status": result.cache_status,
        "started_at": result.started_at,
        "finished_at": result.finished_at,
        "questions": [asdict(q) for q in result.questions],
    }


def upsert_document_result(store: dict[str, Any], result: DocumentEvaluation) -> None:
    """Insert or replace *result* in ``store["documents"]``, keyed by path.

    The first entry with a matching path is replaced in place; otherwise
    the result is appended. Consolidation: the serialization dict was
    previously built twice verbatim (replace and append branches); it is
    now produced once by _doc_result_as_dict.
    """
    serialized = _doc_result_as_dict(result)
    docs = store.setdefault("documents", [])
    for position, existing in enumerate(docs):
        if existing.get("path") == result.path:
            docs[position] = serialized
            return
    docs.append(serialized)
def update_batch_stats(store: dict[str, Any], batch_meta: dict[str, Any]) -> None:
    """Append one batch summary to the store and bump meta.updated_at."""
    store.setdefault("batches", []).append(batch_meta)
    store.setdefault("meta", {})["updated_at"] = now_iso()
def atomic_write_json(path: Path, payload: dict[str, Any]) -> None:
    """Write *payload* as pretty UTF-8 JSON via a temp-file-and-rename.

    Writing to ``<name>.tmp`` first and then renaming over the target means
    a reader never observes a half-written file if the process dies
    mid-write.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    staging = path.with_suffix(path.suffix + ".tmp")
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    staging.write_text(serialized, encoding="utf-8")
    staging.replace(path)
def compute_batch_summary(results: list[DocumentEvaluation]) -> dict[str, Any]:
    """Aggregate per-question outcomes of one batch into a summary dict.

    Counts wins per system, averages both score columns (0.0 when there are
    no questions), and declares a ranking winner only when the averages
    differ by more than 0.01 -- otherwise the batch is a Tie.
    """
    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    lc_scores: list[float] = []
    li_scores: list[float] = []
    for doc in results:
        for question in doc.questions:
            wins[question.winner] = wins.get(question.winner, 0) + 1
            lc_scores.append(question.langchain_score)
            li_scores.append(question.llamaindex_score)

    total_questions = len(lc_scores)
    avg_lc = sum(lc_scores) / max(1, total_questions)
    avg_li = sum(li_scores) / max(1, total_questions)
    if avg_lc > avg_li + 0.01:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.01:
        ranking = "LlamaIndex"
    else:
        ranking = "Tie"
    return {
        "documents_processed": len(results),
        "questions_processed": total_questions,
        "wins": wins,
        "avg_langchain": round(avg_lc, 4),
        "avg_llamaindex": round(avg_li, 4),
        "ranking": ranking,
    }
2026-03-11 22:30:02 +03:00
# =============================================================================
# Main flow
2026-03-11 22:30:02 +03:00
# =============================================================================
def run_evaluation(doc_from: int, doc_to: int, mode: str) -> None:
    """Run one evaluation batch over documents doc_from..doc_to (1-based, inclusive).

    Step granularity is one document file: download/cache it, extract its
    text for the judge, then for every question query both RAG systems
    sequentially and judge the answer pair. Progress is persisted to
    OUTPUT_JSON atomically after every file, so an interrupted run loses at
    most one step.

    Raises ValueError when the (clamped) range is empty.
    """
    all_docs = parse_all_docs(INPUT_MD)
    total_docs = len(all_docs)
    # Clamp the requested range to what the input file actually contains.
    doc_from = max(1, doc_from)
    doc_to = min(total_docs, doc_to)
    if doc_from > doc_to:
        raise ValueError(f"Invalid doc range: {doc_from}:{doc_to}")
    store = load_or_init_json(all_docs, OUTPUT_JSON, mode)
    TMP_DIR.mkdir(parents=True, exist_ok=True)
    selected = [d for d in all_docs if doc_from <= d[0] <= doc_to]
    print(
        f"Total docs: {total_docs}. Processing docs {doc_from}:{doc_to} ({len(selected)} steps)."
    )
    print(f"Cache dir: {TMP_DIR}")
    print(f"Output JSON: {OUTPUT_JSON}")
    batch_results: list[DocumentEvaluation] = []
    batch_started = now_iso()
    for step, (idx, doc_path, questions) in enumerate(selected, start=1):
        print(f"\n[STEP {step}/{len(selected)}] File #{idx}: {doc_path}")
        started = now_iso()
        cache_name = cache_file_name(doc_path)
        cache_path = TMP_DIR / cache_name
        cache_status = download_yadisk_to_cache(doc_path, YADISK_TOKEN, cache_path)
        print(f" -> cache: {cache_status} ({cache_path})")
        doc_text = ""
        if cache_status.startswith("error:"):
            # No local file: give the judge an explicit marker instead of text.
            doc_text = f"[CACHE_ERROR] {cache_status}"
        else:
            doc_text = extract_text_from_file(cache_path)
        print(f" -> extracted text length: {len(doc_text)}")
        evaluated_questions: list[QuestionResult] = []
        for qn, q in enumerate(questions, start=1):
            qr = QuestionResult(section=q.section, question=q.question)
            print(f" [{qn}/{len(questions)}] {q.question[:90]}")
            # Sequential by design: LangChain first, then LlamaIndex.
            t0 = time.time()
            qr.langchain_answer = call_langchain(q.question, timeout=RAG_TIMEOUT)
            print(f" LangChain: {time.time() - t0:.1f}s")
            t0 = time.time()
            qr.llamaindex_answer = call_llamaindex(q.question, timeout=RAG_TIMEOUT)
            print(f" LlamaIndex: {time.time() - t0:.1f}s")
            lc, li, winner, rationale = evaluate_pair_with_llm(
                question=q.question,
                section=q.section,
                langchain_answer=qr.langchain_answer,
                llamaindex_answer=qr.llamaindex_answer,
                document_text=doc_text,
            )
            qr.langchain_score = lc
            qr.llamaindex_score = li
            qr.winner = winner
            qr.rationale = rationale
            qr.evaluator_model = OPENAI_CHAT_MODEL
            qr.evaluated_at = now_iso()
            evaluated_questions.append(qr)
        doc_result = DocumentEvaluation(
            index=idx,
            path=doc_path,
            cache_file=str(cache_path),
            cache_status=cache_status,
            questions=evaluated_questions,
            started_at=started,
            finished_at=now_iso(),
        )
        upsert_document_result(store, doc_result)
        batch_results.append(doc_result)
        # Save incremental progress after each file/step
        atomic_write_json(OUTPUT_JSON, store)
        print(" -> step saved")
    summary = compute_batch_summary(batch_results)
    batch_meta = {
        "started_at": batch_started,
        "finished_at": now_iso(),
        "range": f"{doc_from}:{doc_to}",
        "summary": summary,
        "mode": mode,
    }
    update_batch_stats(store, batch_meta)
    atomic_write_json(OUTPUT_JSON, store)
    print("\nBatch complete.")
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    print(f"Saved to: {OUTPUT_JSON}")
2026-03-11 22:30:02 +03:00
def parse_range(value: str) -> tuple[int, int]:
    """argparse type: parse a "from:to" string into a pair of positive ints.

    Raises argparse.ArgumentTypeError for malformed input or non-positive
    bounds, which argparse converts into a clean CLI error message.
    """
    match = re.fullmatch(r"(\d+):(\d+)", value.strip())
    if match is None:
        raise argparse.ArgumentTypeError(
            "Range must be in format from:to (example: 1:10)"
        )
    low, high = int(match.group(1)), int(match.group(2))
    if low <= 0 or high <= 0:
        raise argparse.ArgumentTypeError("Range values must be positive")
    return low, high
2026-03-11 22:30:02 +03:00
def main() -> int:
    """CLI entry point: parse the document range and mode, then run the batch.

    Returns 0 on success (propagated as the process exit code via
    SystemExit below).
    """
    parser = argparse.ArgumentParser(
        description="RAG evaluation in file-batch mode (JSON output)"
    )
    parser.add_argument(
        "doc_range",
        type=parse_range,
        help="Document range in format from:to (step = one file). Example: 1:10",
    )
    parser.add_argument(
        "--mode",
        choices=["append", "overwrite"],
        default="append",
        help="append: upsert evaluated docs into existing JSON; overwrite: rebuild JSON from input docs",
    )
    args = parser.parse_args()
    doc_from, doc_to = args.doc_range

    # Informational nudge only -- MiniMax still works as the judge.
    if "MiniMax" in OPENAI_CHAT_MODEL or "MiniMax" in OPENAI_CHAT_URL:
        print(
            "NOTE: evaluator model is MiniMax. It works, but for stricter judging quality, "
            "gpt-4.1-mini/gpt-4.1 (if available on your endpoint) is usually stronger."
        )

    run_evaluation(doc_from=doc_from, doc_to=doc_to, mode=args.mode)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())