evaluation for rag systems
This commit is contained in:
782
rag_evaluation.py
Normal file
782
rag_evaluation.py
Normal file
@@ -0,0 +1,782 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RAG Evaluation Script
|
||||
|
||||
Evaluates two RAG systems (LangChain and LlamaIndex) using OpenAI-compatible LLM
|
||||
for scoring, with Yandex Disk integration for document verification.
|
||||
|
||||
Usage:
|
||||
python rag_evaluation.py 1:10 # Evaluate questions 1 to 10
|
||||
python rag_evaluation.py 5:20 # Evaluate questions 5 to 20
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import requests
|
||||
|
||||
# =============================================================================
# Configuration
# =============================================================================

# OpenAI-compatible LLM settings: model id, API base URL (no trailing path),
# and bearer key. All overridable via environment variables / .env.
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
OPENAI_CHAT_URL = os.getenv("OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1")
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")

# RAG system URLs — the two systems under comparison, expected to be
# running locally on these ports.
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Yandex Disk OAuth token, used to download the reference documents.
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")

# File paths: questions are read from INPUT_MD, results written to OUTPUT_MD,
# both alongside this script.
INPUT_MD = Path(__file__).parent / "DOCUMENTS_TO_TEST.md"
OUTPUT_MD = Path(__file__).parent / "EVALUATION_RESULT.md"

# Timeouts for the three kinds of network calls.
RAG_TIMEOUT = 120 # seconds
LLM_TIMEOUT = 60 # seconds
YADISK_TIMEOUT = 60 # seconds
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class QuestionItem:
    """Represents a single question with its evaluation results."""
    # Name of the "### " section the question was parsed from.
    section: str
    # The question text (content of the "- " bullet).
    question: str
    # 1-based global index across all documents; 0 until assigned in a run.
    question_number: int = 0
    # Raw answer from each RAG system ("ERROR: ..." string on failure).
    langchain_answer: str = ""
    llamaindex_answer: str = ""
    # LLM-judge scores in [0.0, 1.0] for each system's answer.
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0
    # "LangChain", "LlamaIndex", or "Tie" (scores within 0.05 of each other).
    winner: str = "Tie"
    # Combined judge rationale, formatted "LC: ... | LI: ...".
    rationale: str = ""
    # Free-form error description; empty when evaluation succeeded.
    error: str = ""
||||
|
||||
|
||||
@dataclass
class DocumentItem:
    """Represents a document with its associated questions."""
    # The original "## ..." markdown header line for this document.
    header: str
    # Remote document path extracted from backticks in the header;
    # empty when the header contains no `...` span.
    path: str
    # Ordered (section name, questions) pairs parsed from "### " sections.
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)
||||
|
||||
|
||||
# =============================================================================
|
||||
# Document Parser
|
||||
# =============================================================================
|
||||
|
||||
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split markdown text into preamble lines and per-document blocks.

    Everything before the first "## " heading goes into the header list;
    each "## " heading starts a new document block. Blocks are returned
    as newline-joined strings.
    """
    header: list[str] = []
    blocks: list[list[str]] = []

    for raw_line in md_text.splitlines():
        if raw_line.startswith("## "):
            # A new document starts on this heading line.
            blocks.append([raw_line])
        elif blocks:
            # Continuation of the most recently opened document.
            blocks[-1].append(raw_line)
        else:
            # Still before the first document heading.
            header.append(raw_line)

    return header, ["\n".join(block) for block in blocks]
|
||||
|
||||
|
||||
def parse_document_block(block: str) -> DocumentItem:
    """Parse one "## ..." document block into a DocumentItem.

    The first line is the document header; a path inside backticks is
    extracted from it. "### " lines open sections and "- " lines add
    questions to the currently open section.
    """
    block_lines = block.splitlines()
    header = block_lines[0].strip()

    # The document path is the first backtick-quoted span in the header.
    path_match = re.search(r"`([^`]+)`", header)
    doc_path = path_match.group(1) if path_match else ""

    sections: list[tuple[str, list[QuestionItem]]] = []
    section_name = ""
    questions: list[QuestionItem] = []

    for line in block_lines[1:]:
        if line.startswith("### "):
            # Close the previous section (if one was open), start a new one.
            if section_name:
                sections.append((section_name, questions))
            section_name = line[4:].strip()
            questions = []
        elif line.startswith("- "):
            text = line[2:].strip()
            if text:
                questions.append(QuestionItem(section=section_name, question=text))

    # Close the final open section.
    if section_name:
        sections.append((section_name, questions))

    return DocumentItem(header=header, path=doc_path, sections=sections)
|
||||
|
||||
|
||||
def parse_all_documents(md_path: Path) -> list[DocumentItem]:
    """Read the markdown file at *md_path* and parse every document block."""
    _, blocks = split_documents(md_path.read_text(encoding="utf-8"))
    return [parse_document_block(block) for block in blocks]
|
||||
|
||||
|
||||
def flatten_questions(docs: list[DocumentItem]) -> list[tuple[DocumentItem, QuestionItem]]:
    """Flatten every question of every document into (document, question) pairs.

    Pair order follows document order, then section order, then question
    order within each section.
    """
    return [
        (doc, question)
        for doc in docs
        for _, section_questions in doc.sections
        for question in section_questions
    ]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RAG API Clients
|
||||
# =============================================================================
|
||||
|
||||
def call_langchain(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send *query* to the LangChain RAG endpoint and return its answer text.

    Any failure (network error, HTTP status, bad JSON) is reported as an
    "ERROR: ..." string rather than raised.
    """
    try:
        reply = requests.post(LANGCHAIN_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:
        return f"ERROR: {exc}"
|
||||
|
||||
|
||||
def call_llamaindex(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Send *query* to the LlamaIndex RAG endpoint and return its answer text.

    Any failure (network error, HTTP status, bad JSON) is reported as an
    "ERROR: ..." string rather than raised.
    """
    try:
        reply = requests.post(LLAMAINDEX_URL, json={"query": query}, timeout=timeout)
        reply.raise_for_status()
        return str(reply.json().get("response", "")).strip()
    except Exception as exc:
        return f"ERROR: {exc}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Yandex Disk Integration
|
||||
# =============================================================================
|
||||
|
||||
def download_yadisk_file(remote_path: str, token: str, local_path: str, timeout: int = YADISK_TIMEOUT) -> None:
    """Download *remote_path* from Yandex Disk into *local_path*.

    Two-step Disk API flow: first request a short-lived download href,
    then fetch the file content from it. Raises requests exceptions
    (e.g. HTTPError) on failure.
    """
    auth_headers = {"Authorization": f"OAuth {token}"}

    # Step 1: resolve the download href for the remote path.
    meta = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=auth_headers,
        params={"path": remote_path},
        timeout=timeout,
    )
    meta.raise_for_status()

    # Step 2: fetch the file itself — larger payload, so a doubled timeout.
    payload = requests.get(meta.json()["href"], timeout=timeout * 2)
    payload.raise_for_status()

    with open(local_path, "wb") as out:
        out.write(payload.content)
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a downloaded file based on its extension.

    Plain-text formats are read directly. docx/pdf/xlsx are parsed with
    optional third-party libraries (python-docx, PyPDF2, openpyxl),
    falling back to a bracketed placeholder string when the library is
    missing or parsing fails. Unknown extensions are tried as text.
    """
    ext = Path(file_path).suffix.lower()

    # Text-based formats: read as-is, tolerating bad bytes.
    if ext in {".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"}:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if ext in (".docx", ".doc"):
        try:
            from docx import Document
            paragraphs = Document(file_path).paragraphs
            return "\n".join(p.text for p in paragraphs)
        except ImportError:
            return f"[DOCX file: {file_path}] - python-docx not installed"
        except Exception as e:
            return f"[DOCX read error: {e}]"

    if ext == ".pdf":
        try:
            import PyPDF2
            with open(file_path, "rb") as fh:
                # extract_text() may return None for image-only pages.
                extracted = [page.extract_text() or "" for page in PyPDF2.PdfReader(fh).pages]
            return "\n".join(extracted)
        except ImportError:
            return f"[PDF file: {file_path}] - PyPDF2 not installed"
        except Exception as e:
            return f"[PDF read error: {e}]"

    if ext in (".xlsx", ".xls"):
        try:
            from openpyxl import load_workbook
            workbook = load_workbook(file_path, read_only=True)
            rows: list[str] = []
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    rows.append("\t".join(str(cell) if cell is not None else "" for cell in row))
            return "\n".join(rows)
        except ImportError:
            return f"[XLSX file: {file_path}] - openpyxl not installed"
        except Exception as e:
            return f"[XLSX read error: {e}]"

    # Unknown extension: attempt a plain-text read, else a placeholder.
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    except Exception:
        return f"[Binary file: {file_path}]"
|
||||
|
||||
|
||||
def fetch_document_content(remote_path: str, token: str) -> str:
    """Fetch the text content of a Yandex Disk file.

    Downloads the file to a temporary local path, extracts its text, and
    always removes the temporary file afterwards. Failures are reported
    as bracketed placeholder strings rather than raised.
    """
    if not token:
        return "[Yandex Disk token not provided]"

    # The Disk API expects paths without the "disk:/" scheme prefix.
    clean_path = remote_path.removeprefix("disk:/")

    # Reserve a temp file name with the right suffix; closed immediately
    # so download_yadisk_file can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(clean_path).suffix) as tmp:
        local_path = tmp.name

    try:
        download_yadisk_file(clean_path, token, local_path)
        return extract_text_from_file(local_path)
    except Exception as e:
        return f"[Yandex Disk download error: {e}]"
    finally:
        # Best-effort cleanup of the temp file in every outcome.
        if os.path.exists(local_path):
            os.unlink(local_path)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OpenAI-compatible LLM Evaluator
|
||||
# =============================================================================
|
||||
|
||||
def create_evaluation_prompt(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str
) -> str:
    """Create a prompt for the LLM to evaluate a RAG response.

    Args:
        question: The user question being evaluated.
        rag_response: The RAG system's answer to the question.
        document_content: Ground-truth document text (truncated to 8000
            characters inside the prompt).
        section_type: Section name selecting section-specific criteria;
            unknown names fall back to generic criteria.

    Returns:
        The full judging prompt (in Russian) demanding a JSON-only answer.
    """

    # Section-specific evaluation criteria, keyed by the "### " section
    # names used in the input markdown; values are inserted verbatim.
    section_criteria = {
        "Entity/Fact Recall (Response Relevance)": """
Критерии оценки:
- Насколько точно ответ извлекает факты и сущности из документа
- Соответствует ли ответ на вопрос о ключевых участниках и их ролях
- Полнота извлечения фактов из контекста
""",
        "Numerical & Temporal Precision": """
Критерии оценки:
- Точность извлечения дат, лет, числовых значений
- Соответствие чисел в ответе числам в документе
- Правильность временных привязок событий
""",
        "Context Precision (Evidence-anchored)": """
Критерии оценки:
- Насколько хорошо ответ идентифицирует релевантные фрагменты
- Умение отличать релевантные фрагменты от нерелевантных
- Обоснованность выбора контекста
""",
        "Faithfulness / Non-hallucination": """
Критерии оценки:
- Отсутствие выдуманной информации
- Ответ основан только на предоставленном контексте
- Корректное указание на отсутствие информации, если её нет
""",
        "Reasoning & Synthesis": """
Критерии оценки:
- Качество синтеза информации из нескольких фрагментов
- Логичность выводов
- Указание на ограничения, риски или условия
"""
    }

    # Generic fallback criteria for section names not listed above.
    criteria = section_criteria.get(section_type, """
Критерии оценки:
- Релевантность ответа вопросу
- Точность фактов
- Отсутствие галлюцинаций
- Полнота ответа
""")

    # NOTE(review): the " # Ограничиваем длину для контекста" text below is
    # INSIDE the f-string literal, so it is emitted into the prompt itself —
    # it is not a Python comment. Document content is capped at 8000 chars.
    prompt = f"""Ты — эксперт по оценке качества RAG-систем (Retrieval-Augmented Generation).

Твоя задача: оценить качество ответа RAG-системы на вопрос пользователя, сравнив его с содержимым исходного документа.

## Вопрос пользователя:
{question}

## Ответ RAG-системы:
{rag_response}

## Содержимое исходного документа:
{document_content[:8000]} # Ограничиваем длину для контекста

{criteria}

## Формат ответа:
Верни ответ ТОЛЬКО в формате JSON:
{{
"score": <число от 0.0 до 1.0>,
"rationale": "<краткое обоснование оценки на русском языке>",
"strengths": ["<сильные стороны>"],
"weaknesses": ["<слабые стороны>"],
"hallucination_detected": <true/false>,
"missing_info": ["<отсутствующая важная информация>"]
}}

Оценка:
- 1.0: Идеальный ответ, полностью точный и полный
- 0.8-0.9: Очень хороший ответ с незначительными неточностями
- 0.6-0.7: Хороший ответ, но есть некоторые проблемы
- 0.4-0.5: Удовлетворительный ответ с существенными проблемами
- 0.2-0.3: Плохой ответ, много ошибок или неполный
- 0.0-0.1: Ответ неверный или содержит галлюцинации
"""
    return prompt
|
||||
|
||||
|
||||
def evaluate_with_llm(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    model: str = OPENAI_CHAT_MODEL,
    api_url: str = OPENAI_CHAT_URL,
    api_key: str = OPENAI_CHAT_KEY
) -> dict[str, Any]:
    """Evaluate a RAG response using the OpenAI-compatible LLM.

    Builds the judging prompt, calls the chat-completions endpoint, and
    parses the JSON verdict out of the model's reply.

    Args:
        question: The user question that was asked.
        rag_response: The RAG system's answer to score.
        document_content: Reference document text used as ground truth.
        section_type: Section name selecting the evaluation criteria.
        model: Chat model identifier.
        api_url: Base URL of the OpenAI-compatible API (no trailing path).
        api_key: Bearer token; when empty, evaluation is skipped.

    Returns:
        A dict with at least "score" and "rationale". On failure the dict
        also carries an "error" key: score 0.0 for hard failures (no key,
        request error), 0.5 when the LLM answered but the verdict could
        not be extracted or parsed.
    """

    if not api_key:
        # No credentials: report a zero score instead of raising.
        return {
            "score": 0.0,
            "rationale": "API key not provided",
            "error": "Missing API key"
        }

    prompt = create_evaluation_prompt(question, rag_response, document_content, section_type)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Low temperature for reproducible judging; small completion budget
    # since only a short JSON verdict is expected back.
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Ты — эксперт по оценке качества RAG-систем. Отвечай ТОЛЬКО в формате JSON."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(
            api_url + "/chat/completions",
            headers=headers,
            json=payload,
            timeout=LLM_TIMEOUT
        )
        response.raise_for_status()

        result = response.json()

        # Safely extract the assistant message content; an empty "choices"
        # list raises IndexError, which is treated as missing content.
        try:
            content = result.get("choices", [{}])[0].get("message", {}).get("content")
        except (IndexError, KeyError):
            content = None

        if not content:
            return {
                "score": 0.5,
                "rationale": "LLM returned empty or malformed response",
                "error": "Empty content in LLM response"
            }

        # Parse the JSON verdict out of the reply text.
        try:
            # Prefer the first brace-delimited span, since the model may
            # wrap the JSON in prose. NOTE(review): this pattern does not
            # handle nested braces; it relies on the verdict being flat.
            json_match = re.search(r'\{[^{}]*\}', content, re.DOTALL)
            if json_match:
                evaluation = json.loads(json_match.group())
            else:
                evaluation = json.loads(content)

            return {
                "score": float(evaluation.get("score", 0.0)),
                "rationale": evaluation.get("rationale", "") or "",
                "strengths": evaluation.get("strengths", []),
                "weaknesses": evaluation.get("weaknesses", []),
                "hallucination_detected": evaluation.get("hallucination_detected", False),
                "missing_info": evaluation.get("missing_info", [])
            }
        except json.JSONDecodeError as e:
            # Verdict text was not valid JSON: neutral score with context.
            return {
                "score": 0.5,
                "rationale": f"Failed to parse LLM response: {content[:200]}",
                "error": str(e)
            }

    except requests.RequestException as e:
        # Network/HTTP failure: hard zero score.
        return {
            "score": 0.0,
            "rationale": f"LLM API error: {e}",
            "error": str(e)
        }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Results Output
|
||||
# =============================================================================
|
||||
|
||||
def truncate_text(text: str, max_len: int = 1500) -> str:
    """Strip *text* and cap it at *max_len* characters, marking truncation.

    None is treated as an empty string.
    """
    cleaned = (text or "").strip()
    return cleaned if len(cleaned) <= max_len else cleaned[:max_len] + "... [truncated]"
|
||||
|
||||
|
||||
def format_question_result(q: QuestionItem, doc_path: str) -> str:
    """Format a single question result for the output markdown.

    Args:
        q: The evaluated question (answers, scores, winner, rationale).
        doc_path: Document path; accepted but currently unused in the
            rendered output.

    Returns:
        A markdown fragment ending with a "---" separator line.
    """
    # Answers are wrapped in fenced code blocks and truncated for display.
    lines = [
        f"#### Вопрос #{q.question_number}",
        f"**Вопрос:** {q.question}",
        "",
        f"**Секция:** {q.section}",
        "",
        "**Ответ LangChain:**",
        f"```",
        truncate_text(q.langchain_answer),
        "```",
        "",
        "**Ответ LlamaIndex:**",
        f"```",
        truncate_text(q.llamaindex_answer),
        "```",
        "",
        "**Результаты оценки:**",
        f"- LangChain Score: {q.langchain_score:.2f}",
        f"- LlamaIndex Score: {q.llamaindex_score:.2f}",
        f"- Победитель: **{q.winner}**",
        "",
        f"**Обоснование:** {q.rationale}",
        "",
        "---",
        ""
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_document_results(doc: DocumentItem, with_results: bool = True) -> str:
    """Render a document (and optionally its per-question results) as markdown.

    When *with_results* is False, a "not processed yet" placeholder is
    emitted instead of the per-section question results.
    """
    parts = [
        doc.header,
        "",
        f"**Путь к файлу:** `{doc.path}`",
        ""
    ]

    if not with_results:
        parts.append("_Результаты ещё не обработаны._")
        parts.append("")
        return "\n".join(parts)

    for section_name, section_questions in doc.sections:
        parts.append(f"### {section_name}")
        parts.append("")
        for question in section_questions:
            parts.append(format_question_result(question, doc.path))

    return "\n".join(parts)
|
||||
|
||||
|
||||
def format_summary(all_questions: list[QuestionItem], batch_info: dict) -> str:
    """Build the markdown summary: win counts, mean scores, overall ranking.

    Raises KeyError if a question's winner is not one of the three
    expected labels ("LangChain", "LlamaIndex", "Tie").
    """
    scores_lc = [item.langchain_score for item in all_questions]
    scores_li = [item.llamaindex_score for item in all_questions]

    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    for item in all_questions:
        wins[item.winner] += 1

    # max(1, ...) guards against division by zero on an empty batch.
    avg_lc = sum(scores_lc) / max(1, len(scores_lc))
    avg_li = sum(scores_li) / max(1, len(scores_li))

    # A system must lead by more than 0.05 on average to win overall.
    if avg_lc > avg_li + 0.05:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.05:
        ranking = "LlamaIndex"
    else:
        ranking = "Ничья"

    summary = [
        "## Итоговая сводка",
        "",
        f"- Всего вопросов оценено: {len(all_questions)}",
        f"- Диапазон вопросов: {batch_info.get('from', 1)} - {batch_info.get('to', len(all_questions))}",
        "",
        "### Победители по вопросам:",
        f"- LangChain: {wins['LangChain']}",
        f"- LlamaIndex: {wins['LlamaIndex']}",
        f"- Ничья: {wins['Tie']}",
        "",
        "### Средние оценки:",
        f"- LangChain: {avg_lc:.3f}",
        f"- LlamaIndex: {avg_li:.3f}",
        "",
        f"### Итоговый рейтинг: **{ranking}**",
        "",
        "_Методика оценки: LLM-оценка на основе сравнения с содержимым документов из Yandex Disk._",
        "",
        "---",
        ""
    ]
    return "\n".join(summary)
|
||||
|
||||
|
||||
def write_results(
    header_lines: list[str],
    docs: list[DocumentItem],
    all_questions: list[QuestionItem],
    batch_info: dict,
    output_path: Path
) -> None:
    """Write evaluation results to markdown file.

    Layout: the original input-file header, a title, the generation
    timestamp, the aggregate summary, then per-document detailed results.

    Args:
        header_lines: Preamble lines copied from the input markdown.
        docs: Parsed documents whose questions carry evaluation results.
        all_questions: The evaluated questions (used for the summary).
        batch_info: {"from": int, "to": int} range actually evaluated.
        output_path: Destination markdown file (overwritten).
    """
    from datetime import datetime  # local import: only needed here

    output_parts: list[str] = []

    # Header copied verbatim from the input file.
    output_parts.extend(header_lines)
    output_parts.append("")
    output_parts.append("# Результаты оценки RAG-систем")
    output_parts.append("")
    # BUG FIX: this previously printed the *previous* output file's mtime
    # (a raw epoch float) or 'N/A' — never the actual generation moment.
    # Use the current local time instead.
    output_parts.append(f"Дата генерации: {datetime.now().isoformat(timespec='seconds')}")
    output_parts.append("")

    # Aggregate summary across the evaluated batch.
    output_parts.append(format_summary(all_questions, batch_info))

    # Detailed results per document.
    for doc in docs:
        output_parts.append(format_document_results(doc, with_results=True))

    output_path.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Evaluation Loop
|
||||
# =============================================================================
|
||||
|
||||
def run_evaluation(
    from_q: int,
    to_q: int,
    timeout_rag: int = RAG_TIMEOUT,
    timeout_llm: int = LLM_TIMEOUT
) -> None:
    """Run the evaluation for the 1-based question range [from_q, to_q].

    For each question in range: query both RAG systems, download the
    source document from Yandex Disk, score both answers with the LLM
    judge, pick a winner, and finally write all results to OUTPUT_MD.

    Args:
        from_q: First question number (clamped to >= 1).
        to_q: Last question number (clamped to the total question count).
        timeout_rag: Per-request timeout for the RAG endpoints.
        timeout_llm: Kept for interface compatibility; the LLM judge
            currently uses the module-level LLM_TIMEOUT instead.
    """
    import time  # idiom fix: replaces the previous __import__("time") hack

    print(f"Загрузка документов из {INPUT_MD}...")
    docs = parse_all_documents(INPUT_MD)
    all_flat = flatten_questions(docs)

    total_questions = len(all_flat)
    print(f"Всего вопросов найдено: {total_questions}")

    # Clamp the requested range to the available questions.
    from_q = max(1, from_q)
    to_q = min(total_questions, to_q)

    if from_q > to_q:
        print(f"Ошибка: диапазон {from_q}:{to_q} некорректен")
        return

    print(f"Оценка вопросов с {from_q} по {to_q}...")

    # Keep the original input-file header for the output report.
    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, _ = split_documents(raw)

    # Process each question, numbering them 1-based across all documents.
    for q_index, (doc, q) in enumerate(all_flat, start=1):
        if q_index < from_q or q_index > to_q:
            continue

        q.question_number = q_index
        print(f"\n[{q_index}/{total_questions}] {q.question[:80]}...")

        # Call both RAG systems, timing each call for the progress output.
        print(f" -> LangChain...", end=" ", flush=True)
        t0 = time.time()
        q.langchain_answer = call_langchain(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        print(f" -> LlamaIndex...", end=" ", flush=True)
        t0 = time.time()
        q.llamaindex_answer = call_llamaindex(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        # Download the reference document content from Yandex Disk.
        print(f" -> Загрузка документа из Yandex Disk...", end=" ", flush=True)
        if doc.path:
            doc_content = fetch_document_content(doc.path, YADISK_TOKEN)
            print(f"OK ({len(doc_content)} символов)")
        else:
            doc_content = "[Путь к документу не найден]"
            print("SKIP (нет пути)")

        # Score the LangChain answer with the LLM judge.
        print(f" -> Оценка LangChain...", end=" ", flush=True)
        lc_eval = evaluate_with_llm(
            q.question, q.langchain_answer, doc_content, q.section
        )
        q.langchain_score = lc_eval.get("score", 0.0)
        lc_rationale = lc_eval.get("rationale", "")
        print(f"Score: {q.langchain_score:.2f}")

        # Score the LlamaIndex answer with the LLM judge.
        print(f" -> Оценка LlamaIndex...", end=" ", flush=True)
        li_eval = evaluate_with_llm(
            q.question, q.llamaindex_answer, doc_content, q.section
        )
        q.llamaindex_score = li_eval.get("score", 0.0)
        li_rationale = li_eval.get("rationale", "")
        print(f"Score: {q.llamaindex_score:.2f}")

        # Winner: a score margin below 0.05 counts as a tie.
        score_diff = abs(q.langchain_score - q.llamaindex_score)
        if score_diff < 0.05:
            q.winner = "Tie"
        elif q.langchain_score > q.llamaindex_score:
            q.winner = "LangChain"
        else:
            q.winner = "LlamaIndex"

        # Combine both judges' rationales into one field.
        q.rationale = f"LC: {lc_rationale} | LI: {li_rationale}"

    # Write results for the evaluated batch.
    print(f"\nЗапись результатов в {OUTPUT_MD}...")
    batch_info = {"from": from_q, "to": to_q}

    # Questions whose number was assigned during this run's range.
    # (Unprocessed questions keep question_number == 0, outside the range.)
    evaluated_questions = [
        q for _, q in all_flat
        if q.question_number in range(from_q, to_q + 1)
    ]

    write_results(header_lines, docs, evaluated_questions, batch_info, OUTPUT_MD)
    print(f"Готово! Результаты сохранены в {OUTPUT_MD}")
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse the 'from:to' range argument and run the evaluation.

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Оценка RAG-систем с использованием LLM и Yandex Disk"
    )
    parser.add_argument(
        "range",
        type=str,
        help="Диапазон вопросов для оценки в формате 'from:to' (например, 1:10)"
    )
    parser.add_argument(
        "--timeout-rag",
        type=int,
        default=RAG_TIMEOUT,
        help=f"Таймаут для RAG API (по умолчанию {RAG_TIMEOUT}s)"
    )
    parser.add_argument(
        "--timeout-llm",
        type=int,
        default=LLM_TIMEOUT,
        help=f"Таймаут для LLM API (по умолчанию {LLM_TIMEOUT}s)"
    )

    args = parser.parse_args()

    # Parse the range argument. BUG FIX: fullmatch (rather than match)
    # rejects trailing garbage such as "1:10abc" instead of silently
    # ignoring it.
    range_match = re.fullmatch(r"(\d+):(\d+)", args.range)
    if not range_match:
        print("Ошибка: диапазон должен быть в формате 'from:to' (например, 1:10)")
        return 1

    from_q = int(range_match.group(1))
    to_q = int(range_match.group(2))

    if from_q > to_q:
        print("Ошибка: 'from' должно быть меньше или равно 'to'")
        return 1

    # Warn about missing credentials; evaluation degrades gracefully
    # (zero scores / placeholder document text) rather than failing.
    if not OPENAI_CHAT_KEY:
        print("Предупреждение: OPENAI_CHAT_KEY не установлен. Оценка LLM будет пропущена.")
    if not YADISK_TOKEN:
        print("Предупреждение: YADISK_TOKEN не установлен. Загрузка документов будет пропущена.")

    # Run evaluation over the validated range.
    run_evaluation(from_q, to_q, args.timeout_rag, args.timeout_llm)

    return 0
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user