#!/usr/bin/env python3
"""
RAG Evaluation Script

Evaluates two RAG systems (LangChain and LlamaIndex) using an
OpenAI-compatible LLM for scoring, with Yandex Disk integration for
document verification.

Usage:
    python rag_evaluation.py 1:10   # Evaluate questions 1 to 10
    python rag_evaluation.py 5:20   # Evaluate questions 5 to 20
"""

from __future__ import annotations

import argparse
import json
import os
import re
import tempfile
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any

from dotenv import load_dotenv

# Load .env before reading any configuration from the environment.
load_dotenv()

import requests

# =============================================================================
# Configuration
# =============================================================================

# OpenAI-compatible LLM settings
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "MiniMaxAI/MiniMax-M2")
OPENAI_CHAT_URL = os.getenv("OPENAI_CHAT_URL", "https://foundation-models.api.cloud.ru/v1")
OPENAI_CHAT_KEY = os.getenv("OPENAI_CHAT_KEY", "")

# RAG system endpoints under test
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Yandex Disk OAuth token (used to fetch source documents for grounding)
YADISK_TOKEN = os.getenv("YADISK_TOKEN", "")

# File paths
INPUT_MD = Path(__file__).parent / "DOCUMENTS_TO_TEST.md"
OUTPUT_MD = Path(__file__).parent / "EVALUATION_RESULT.md"

# Timeouts (seconds)
RAG_TIMEOUT = 120
LLM_TIMEOUT = 60
YADISK_TIMEOUT = 60

# =============================================================================
# Data Classes
# =============================================================================


@dataclass
class QuestionItem:
    """A single question together with its per-system answers and scores."""

    section: str
    question: str
    question_number: int = 0
    langchain_answer: str = ""
    llamaindex_answer: str = ""
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0
    winner: str = "Tie"
    rationale: str = ""
    error: str = ""


@dataclass
class DocumentItem:
    """A source document (Yandex Disk path) with its sectioned questions."""

    header: str
    path: str
    # Each entry is (section name, questions belonging to that section).
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)


# =============================================================================
# Document Parser
# =============================================================================


def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split the markdown file into header lines and "## "-delimited doc blocks.

    Returns (header_lines, document_blocks) where header_lines is everything
    before the first "## " heading and each block is joined back into a string.
    """
    lines = md_text.splitlines()
    header: list[str] = []
    docs: list[list[str]] = []
    current: list[str] | None = None
    for line in lines:
        if line.startswith("## "):
            # A new document starts; flush the previous one, if any.
            if current is not None:
                docs.append(current)
            current = [line]
        elif current is None:
            header.append(line)
        else:
            current.append(line)
    if current is not None:
        docs.append(current)
    return header, ["\n".join(d) for d in docs]


def parse_document_block(block: str) -> DocumentItem:
    """Parse one "## " document block into a DocumentItem.

    The file path is taken from the first backtick-quoted span in the header
    line; sections are "### " headings, questions are "- " list items.
    """
    lines = block.splitlines()
    header = lines[0].strip()

    # Extract file path from backticks, e.g. `disk:/path/to/file.docx`.
    m = re.search(r"`([^`]+)`", header)
    doc_path = m.group(1) if m else ""

    sections: list[tuple[str, list[QuestionItem]]] = []
    current_section = ""
    current_questions: list[QuestionItem] = []

    for line in lines[1:]:
        if line.startswith("### "):
            if current_section:
                sections.append((current_section, current_questions))
            current_section = line[4:].strip()
            current_questions = []
        elif line.startswith("- "):
            q = line[2:].strip()
            if q:
                current_questions.append(
                    QuestionItem(section=current_section, question=q)
                )
    if current_section:
        sections.append((current_section, current_questions))

    return DocumentItem(header=header, path=doc_path, sections=sections)


def parse_all_documents(md_path: Path) -> list[DocumentItem]:
    """Parse all documents from the markdown file at *md_path*."""
    raw = md_path.read_text(encoding="utf-8")
    _, doc_blocks = split_documents(raw)
    return [parse_document_block(b) for b in doc_blocks]


def flatten_questions(
    docs: list[DocumentItem],
) -> list[tuple[DocumentItem, QuestionItem]]:
    """Flatten all questions from all documents into one ordered list."""
    return [
        (doc, q)
        for doc in docs
        for _, questions in doc.sections
        for q in questions
    ]


# =============================================================================
# RAG API Clients
# =============================================================================


def _call_rag(url: str, query: str, timeout: int) -> str:
    """POST *query* to a RAG test endpoint and return its "response" field.

    Any failure (network, HTTP status, bad JSON) is folded into an
    "ERROR: ..." string so the evaluation loop never crashes on one call.
    """
    try:
        r = requests.post(url, json={"query": query}, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        return str(data.get("response", "")).strip()
    except Exception as e:
        return f"ERROR: {e}"


def call_langchain(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Call the LangChain RAG system API."""
    return _call_rag(LANGCHAIN_URL, query, timeout)


def call_llamaindex(query: str, timeout: int = RAG_TIMEOUT) -> str:
    """Call the LlamaIndex RAG system API."""
    return _call_rag(LLAMAINDEX_URL, query, timeout)


# =============================================================================
# Yandex Disk Integration
# =============================================================================


def download_yadisk_file(
    remote_path: str, token: str, local_path: str, timeout: int = YADISK_TIMEOUT
) -> None:
    """Download a file from Yandex Disk to *local_path*.

    Raises requests.RequestException / KeyError on API failure; callers are
    expected to handle errors (see fetch_document_content).
    """
    headers = {"Authorization": f"OAuth {token}"}

    # Step 1: ask the Disk API for a one-time download URL.
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=timeout,
    )
    response.raise_for_status()
    href = response.json()["href"]

    # Step 2: fetch the actual file contents (larger payload -> longer timeout).
    file_response = requests.get(href, timeout=timeout * 2)
    file_response.raise_for_status()

    with open(local_path, "wb") as f:
        f.write(file_response.content)


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a downloaded file based on its extension.

    Binary formats (docx/pdf/xlsx) are handled via optional third-party
    libraries; when a library is missing or parsing fails, a bracketed
    placeholder string is returned instead of raising.
    """
    ext = Path(file_path).suffix.lower()

    # Plain-text formats: read directly.
    if ext in [".txt", ".md", ".csv", ".json", ".xml", ".html", ".htm"]:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    elif ext in [".docx", ".doc"]:
        try:
            from docx import Document

            doc = Document(file_path)
            return "\n".join([p.text for p in doc.paragraphs])
        except ImportError:
            return f"[DOCX file: {file_path}] - python-docx not installed"
        except Exception as e:
            return f"[DOCX read error: {e}]"
    elif ext == ".pdf":
        try:
            import PyPDF2

            text_parts = []
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    # extract_text() may return None for image-only pages.
                    text_parts.append(page.extract_text() or "")
            return "\n".join(text_parts)
        except ImportError:
            return f"[PDF file: {file_path}] - PyPDF2 not installed"
        except Exception as e:
            return f"[PDF read error: {e}]"
    elif ext in [".xlsx", ".xls"]:
        try:
            from openpyxl import load_workbook

            wb = load_workbook(file_path, read_only=True)
            texts = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    texts.append(
                        "\t".join(str(c) if c is not None else "" for c in row)
                    )
            return "\n".join(texts)
        except ImportError:
            return f"[XLSX file: {file_path}] - openpyxl not installed"
        except Exception as e:
            return f"[XLSX read error: {e}]"
    else:
        # Unknown extension: best-effort read as text, else a placeholder.
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        except Exception:
            return f"[Binary file: {file_path}]"


def fetch_document_content(remote_path: str, token: str) -> str:
    """Fetch and extract text content from a Yandex Disk file.

    Never raises: download or parse failures are returned as bracketed
    placeholder strings so the evaluation loop can continue.
    """
    if not token:
        return "[Yandex Disk token not provided]"

    # Clean up the path - remove "disk:/" prefix if present.
    clean_path = remote_path
    if clean_path.startswith("disk:/"):
        clean_path = clean_path[6:]

    # Preserve the suffix so extract_text_from_file can pick a parser.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=Path(clean_path).suffix
    ) as tmp:
        local_path = tmp.name

    try:
        download_yadisk_file(clean_path, token, local_path)
        return extract_text_from_file(local_path)
    except Exception as e:
        return f"[Yandex Disk download error: {e}]"
    finally:
        if os.path.exists(local_path):
            os.unlink(local_path)


# =============================================================================
# OpenAI-compatible LLM Evaluator
# =============================================================================


def create_evaluation_prompt(
    question: str, rag_response: str, document_content: str, section_type: str
) -> str:
    """Build the (Russian-language) evaluation prompt for the judge LLM.

    Criteria are selected per question section; unknown sections fall back
    to a generic rubric. The document excerpt is truncated to fit context.
    """
    # Section-specific evaluation criteria (keys match DOCUMENTS_TO_TEST.md
    # section headings; values are injected verbatim into the prompt).
    section_criteria = {
        "Entity/Fact Recall (Response Relevance)": """
Критерии оценки:
- Насколько точно ответ извлекает факты и сущности из документа
- Соответствует ли ответ на вопрос о ключевых участниках и их ролях
- Полнота извлечения фактов из контекста
""",
        "Numerical & Temporal Precision": """
Критерии оценки:
- Точность извлечения дат, лет, числовых значений
- Соответствие чисел в ответе числам в документе
- Правильность временных привязок событий
""",
        "Context Precision (Evidence-anchored)": """
Критерии оценки:
- Насколько хорошо ответ идентифицирует релевантные фрагменты
- Умение отличать релевантные фрагменты от нерелевантных
- Обоснованность выбора контекста
""",
        "Faithfulness / Non-hallucination": """
Критерии оценки:
- Отсутствие выдуманной информации
- Ответ основан только на предоставленном контексте
- Корректное указание на отсутствие информации, если её нет
""",
        "Reasoning & Synthesis": """
Критерии оценки:
- Качество синтеза информации из нескольких фрагментов
- Логичность выводов
- Указание на ограничения, риски или условия
""",
    }

    criteria = section_criteria.get(
        section_type,
        """
Критерии оценки:
- Релевантность ответа вопросу
- Точность фактов
- Отсутствие галлюцинаций
- Полнота ответа
""",
    )

    # NOTE: document content is truncated to 8000 chars to fit the context
    # window. (Previously an inline "#" comment leaked into the prompt text.)
    prompt = f"""Ты — эксперт по оценке качества RAG-систем (Retrieval-Augmented Generation).

Твоя задача: оценить качество ответа RAG-системы на вопрос пользователя, сравнив его с содержимым исходного документа.

## Вопрос пользователя:
{question}

## Ответ RAG-системы:
{rag_response}

## Содержимое исходного документа:
{document_content[:8000]}

{criteria}

## Формат ответа:
Верни ответ ТОЛЬКО в формате JSON:
{{
    "score": <число от 0.0 до 1.0>,
    "rationale": "<краткое обоснование оценки на русском языке>",
    "strengths": ["<сильные стороны>"],
    "weaknesses": ["<слабые стороны>"],
    "hallucination_detected": <true или false>,
    "missing_info": ["<отсутствующая важная информация>"]
}}

Оценка:
- 1.0: Идеальный ответ, полностью точный и полный
- 0.8-0.9: Очень хороший ответ с незначительными неточностями
- 0.6-0.7: Хороший ответ, но есть некоторые проблемы
- 0.4-0.5: Удовлетворительный ответ с существенными проблемами
- 0.2-0.3: Плохой ответ, много ошибок или неполный
- 0.0-0.1: Ответ неверный или содержит галлюцинации
"""
    return prompt


def evaluate_with_llm(
    question: str,
    rag_response: str,
    document_content: str,
    section_type: str,
    model: str = OPENAI_CHAT_MODEL,
    api_url: str = OPENAI_CHAT_URL,
    api_key: str = OPENAI_CHAT_KEY,
    timeout: int = LLM_TIMEOUT,
) -> dict[str, Any]:
    """Evaluate a RAG response using the OpenAI-compatible judge LLM.

    Returns a dict with at least "score" and "rationale"; failures are
    reported via an "error" key rather than raised, so the caller's loop
    keeps running. *timeout* is the per-request limit in seconds.
    """
    if not api_key:
        return {
            "score": 0.0,
            "rationale": "API key not provided",
            "error": "Missing API key",
        }

    prompt = create_evaluation_prompt(
        question, rag_response, document_content, section_type
    )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Ты — эксперт по оценке качества RAG-систем. Отвечай ТОЛЬКО в формате JSON.",
            },
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 500,
    }

    try:
        response = requests.post(
            api_url + "/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()

        # Safely extract content from the first choice.
        try:
            content = (
                result.get("choices", [{}])[0].get("message", {}).get("content")
            )
        except (IndexError, KeyError):
            content = None

        if not content:
            return {
                "score": 0.5,
                "rationale": "LLM returned empty or malformed response",
                "error": "Empty content in LLM response",
            }

        # Parse the JSON object out of the response. Slicing from the first
        # "{" to the last "}" tolerates surrounding prose AND nested braces,
        # which the previous regex (r"\{[^{}]*\}") could not handle.
        try:
            start = content.find("{")
            end = content.rfind("}")
            if start != -1 and end > start:
                evaluation = json.loads(content[start : end + 1])
            else:
                evaluation = json.loads(content)
            return {
                "score": float(evaluation.get("score", 0.0)),
                "rationale": evaluation.get("rationale", "") or "",
                "strengths": evaluation.get("strengths", []),
                "weaknesses": evaluation.get("weaknesses", []),
                "hallucination_detected": evaluation.get(
                    "hallucination_detected", False
                ),
                "missing_info": evaluation.get("missing_info", []),
            }
        except json.JSONDecodeError as e:
            return {
                "score": 0.5,
                "rationale": f"Failed to parse LLM response: {content[:200]}",
                "error": str(e),
            }
    except requests.RequestException as e:
        return {
            "score": 0.0,
            "rationale": f"LLM API error: {e}",
            "error": str(e),
        }


# =============================================================================
# Results Output
# =============================================================================


def truncate_text(text: str, max_len: int = 1500) -> str:
    """Truncate *text* to *max_len* chars for display, marking the cut."""
    text = (text or "").strip()
    if len(text) <= max_len:
        return text
    return text[:max_len] + "... [truncated]"


def format_question_result(q: QuestionItem, doc_path: str) -> str:
    """Format a single question result as a markdown fragment."""
    lines = [
        f"#### Вопрос #{q.question_number}",
        f"**Вопрос:** {q.question}",
        "",
        f"**Секция:** {q.section}",
        "",
        "**Ответ LangChain:**",
        "```",
        truncate_text(q.langchain_answer),
        "```",
        "",
        "**Ответ LlamaIndex:**",
        "```",
        truncate_text(q.llamaindex_answer),
        "```",
        "",
        "**Результаты оценки:**",
        f"- LangChain Score: {q.langchain_score:.2f}",
        f"- LlamaIndex Score: {q.llamaindex_score:.2f}",
        f"- Победитель: **{q.winner}**",
        "",
        f"**Обоснование:** {q.rationale}",
        "",
        "---",
        "",
    ]
    return "\n".join(lines)


def format_document_results(doc: DocumentItem, with_results: bool = True) -> str:
    """Format one document's header, path and (optionally) its results."""
    lines = [
        doc.header,
        "",
        f"**Путь к файлу:** `{doc.path}`",
        "",
    ]
    if with_results:
        for section_name, questions in doc.sections:
            lines.append(f"### {section_name}")
            lines.append("")
            for q in questions:
                lines.append(format_question_result(q, doc.path))
    else:
        lines.append("_Результаты ещё не обработаны._")
        lines.append("")
    return "\n".join(lines)


def format_summary(all_questions: list[QuestionItem], batch_info: dict) -> str:
    """Format overall win counts, average scores and the final ranking."""
    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    scores_lc: list[float] = []
    scores_li: list[float] = []
    for q in all_questions:
        wins[q.winner] += 1
        scores_lc.append(q.langchain_score)
        scores_li.append(q.llamaindex_score)

    # max(1, ...) guards against division by zero on an empty batch.
    avg_lc = sum(scores_lc) / max(1, len(scores_lc))
    avg_li = sum(scores_li) / max(1, len(scores_li))

    # A 0.05 margin is required to declare a winner; otherwise it's a tie.
    if avg_lc > avg_li + 0.05:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.05:
        ranking = "LlamaIndex"
    else:
        ranking = "Ничья"

    lines = [
        "## Итоговая сводка",
        "",
        f"- Всего вопросов оценено: {len(all_questions)}",
        f"- Диапазон вопросов: {batch_info.get('from', 1)} - {batch_info.get('to', len(all_questions))}",
        "",
        "### Победители по вопросам:",
        f"- LangChain: {wins['LangChain']}",
        f"- LlamaIndex: {wins['LlamaIndex']}",
        f"- Ничья: {wins['Tie']}",
        "",
        "### Средние оценки:",
        f"- LangChain: {avg_lc:.3f}",
        f"- LlamaIndex: {avg_li:.3f}",
        "",
        f"### Итоговый рейтинг: **{ranking}**",
        "",
        "_Методика оценки: LLM-оценка на основе сравнения с содержимым документов из Yandex Disk._",
        "",
        "---",
        "",
    ]
    return "\n".join(lines)


def write_results(
    header_lines: list[str],
    docs: list[DocumentItem],
    all_questions: list[QuestionItem],
    batch_info: dict,
    output_path: Path,
) -> None:
    """Write evaluation results to the markdown file at *output_path*."""
    output_parts: list[str] = []

    # Header (carried over from the input file).
    output_parts.extend(header_lines)
    output_parts.append("")
    output_parts.append("# Результаты оценки RAG-систем")
    output_parts.append("")
    # Use the actual generation time; the previous code reported the mtime
    # of the *old* output file (or 'N/A'), which was meaningless.
    output_parts.append(
        f"Дата генерации: {datetime.now().isoformat(timespec='seconds')}"
    )
    output_parts.append("")

    # Summary
    output_parts.append(format_summary(all_questions, batch_info))

    # Detailed results per document
    for doc in docs:
        output_parts.append(format_document_results(doc, with_results=True))

    output_path.write_text(
        "\n".join(output_parts).rstrip() + "\n", encoding="utf-8"
    )


# =============================================================================
# Main Evaluation Loop
# =============================================================================


def run_evaluation(
    from_q: int,
    to_q: int,
    timeout_rag: int = RAG_TIMEOUT,
    timeout_llm: int = LLM_TIMEOUT,
) -> None:
    """Run the evaluation for the 1-based question range [from_q, to_q]."""
    print(f"Загрузка документов из {INPUT_MD}...")
    docs = parse_all_documents(INPUT_MD)
    all_flat = flatten_questions(docs)
    total_questions = len(all_flat)
    print(f"Всего вопросов найдено: {total_questions}")

    # Clamp the requested range to what actually exists.
    from_q = max(1, from_q)
    to_q = min(total_questions, to_q)
    if from_q > to_q:
        print(f"Ошибка: диапазон {from_q}:{to_q} некорректен")
        return

    print(f"Оценка вопросов с {from_q} по {to_q}...")

    # Keep the original file header for the output report.
    raw = INPUT_MD.read_text(encoding="utf-8")
    header_lines, _ = split_documents(raw)

    # Process each question in range.
    for q_index, (doc, q) in enumerate(all_flat, start=1):
        if q_index < from_q or q_index > to_q:
            continue

        q.question_number = q_index
        print(f"\n[{q_index}/{total_questions}] {q.question[:80]}...")

        # Call both RAG systems.
        print(" -> LangChain...", end=" ", flush=True)
        t0 = time.time()
        q.langchain_answer = call_langchain(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        print(" -> LlamaIndex...", end=" ", flush=True)
        t0 = time.time()
        q.llamaindex_answer = call_llamaindex(q.question, timeout=timeout_rag)
        print(f"OK ({time.time() - t0:.1f}s)")

        # Download document content from Yandex Disk for grounding.
        print(" -> Загрузка документа из Yandex Disk...", end=" ", flush=True)
        if doc.path:
            doc_content = fetch_document_content(doc.path, YADISK_TOKEN)
            print(f"OK ({len(doc_content)} символов)")
        else:
            doc_content = "[Путь к документу не найден]"
            print("SKIP (нет пути)")

        # Evaluate LangChain response.
        print(" -> Оценка LangChain...", end=" ", flush=True)
        lc_eval = evaluate_with_llm(
            q.question, q.langchain_answer, doc_content, q.section,
            timeout=timeout_llm,
        )
        q.langchain_score = lc_eval.get("score", 0.0)
        lc_rationale = lc_eval.get("rationale", "")
        print(f"Score: {q.langchain_score:.2f}")

        # Evaluate LlamaIndex response.
        print(" -> Оценка LlamaIndex...", end=" ", flush=True)
        li_eval = evaluate_with_llm(
            q.question, q.llamaindex_answer, doc_content, q.section,
            timeout=timeout_llm,
        )
        q.llamaindex_score = li_eval.get("score", 0.0)
        li_rationale = li_eval.get("rationale", "")
        print(f"Score: {q.llamaindex_score:.2f}")

        # Determine winner: scores within 0.05 of each other count as a tie.
        score_diff = abs(q.langchain_score - q.llamaindex_score)
        if score_diff < 0.05:
            q.winner = "Tie"
        elif q.langchain_score > q.llamaindex_score:
            q.winner = "LangChain"
        else:
            q.winner = "LlamaIndex"

        # Combine rationales from both evaluations.
        q.rationale = f"LC: {lc_rationale} | LI: {li_rationale}"

    # Write results.
    print(f"\nЗапись результатов в {OUTPUT_MD}...")
    batch_info = {"from": from_q, "to": to_q}

    # Collect only the questions that were actually evaluated in this batch.
    evaluated_questions = [
        q for _, q in all_flat if from_q <= q.question_number <= to_q
    ]

    write_results(header_lines, docs, evaluated_questions, batch_info, OUTPUT_MD)
    print(f"Готово! Результаты сохранены в {OUTPUT_MD}")


def main() -> int:
    """Parse CLI arguments, validate configuration, and run the evaluation."""
    parser = argparse.ArgumentParser(
        description="Оценка RAG-систем с использованием LLM и Yandex Disk"
    )
    parser.add_argument(
        "range",
        type=str,
        help="Диапазон вопросов для оценки в формате 'from:to' (например, 1:10)",
    )
    parser.add_argument(
        "--timeout-rag",
        type=int,
        default=RAG_TIMEOUT,
        help=f"Таймаут для RAG API (по умолчанию {RAG_TIMEOUT}s)",
    )
    parser.add_argument(
        "--timeout-llm",
        type=int,
        default=LLM_TIMEOUT,
        help=f"Таймаут для LLM API (по умолчанию {LLM_TIMEOUT}s)",
    )
    args = parser.parse_args()

    # Parse the range argument; fullmatch rejects trailing garbage
    # (re.match would silently accept e.g. "1:10abc").
    range_match = re.fullmatch(r"(\d+):(\d+)", args.range)
    if not range_match:
        print("Ошибка: диапазон должен быть в формате 'from:to' (например, 1:10)")
        return 1

    from_q = int(range_match.group(1))
    to_q = int(range_match.group(2))
    if from_q > to_q:
        print("Ошибка: 'from' должно быть меньше или равно 'to'")
        return 1

    # Validate configuration (warn only; the run degrades gracefully).
    if not OPENAI_CHAT_KEY:
        print("Предупреждение: OPENAI_CHAT_KEY не установлен. Оценка LLM будет пропущена.")
    if not YADISK_TOKEN:
        print("Предупреждение: YADISK_TOKEN не установлен. Загрузка документов будет пропущена.")

    # Run evaluation.
    run_evaluation(from_q, to_q, args.timeout_rag, args.timeout_llm)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())