Evaluation for RAG systems update: script for evaluating the systems against a set of questions
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,3 +1,8 @@
|
|||||||
data-unpacked-archives
|
data-unpacked-archives
|
||||||
data-broken-archives
|
data-broken-archives
|
||||||
.env
|
.env
|
||||||
|
tmp/
|
||||||
|
__pycache__
|
||||||
|
venv
|
||||||
|
services/rag/.DS_Store
|
||||||
|
EVALUATION_RESULT.json
|
||||||
|
|||||||
28000
EVALUATION_RESULT.md
28000
EVALUATION_RESULT.md
File diff suppressed because it is too large
Load Diff
1120
rag_evaluation.py
1120
rag_evaluation.py
File diff suppressed because it is too large
Load Diff
@@ -1,353 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Endpoints of the two RAG services being compared side by side.
LANGCHAIN_URL = "http://localhost:8331/api/test-query"
LLAMAINDEX_URL = "http://localhost:8334/api/test-query"

# Input questions document and output report.
# NOTE(review): machine-specific absolute paths — works only on this workstation.
INPUT_MD = Path("/Users/idchlife/www/work/rag-solution/DOCUMENTS_TO_TEST.md")
OUTPUT_MD = Path("/Users/idchlife/www/work/rag-solution/RAG_EVALUATION.md")


# Russian stopwords / question words excluded from token-overlap scoring.
STOPWORDS_RU = {
    "что", "кто", "как", "какой", "какая", "какие", "ли", "в", "на", "по",
    "и", "или", "для", "из", "с", "о", "об", "а", "не", "к", "до", "от",
    "это", "есть", "если", "какому", "каком", "году", "материалах", "базы",
    "найди",
}
||||||
@dataclass
class QuestionItem:
    """One evaluation question plus the per-service answers and verdict."""

    section: str  # name of the "### ..." section the question belongs to
    question: str
    langchain_answer: str = ""
    llamaindex_answer: str = ""
    langchain_score: float = 0.0
    llamaindex_score: float = 0.0
    winner: str = "Tie"  # "LangChain" | "LlamaIndex" | "Tie"
    rationale: str = ""  # human-readable score breakdown
||||||
@dataclass
class DocumentItem:
    """A "## ..." document block: header line, extracted path, and sections."""

    header: str
    path: str  # backticked path from the header, "" when absent
    # (section name, questions) pairs in document order.
    sections: list[tuple[str, list[QuestionItem]]] = field(default_factory=list)
||||||
def split_documents(md_text: str) -> tuple[list[str], list[str]]:
    """Split markdown into (preamble lines, per-document blocks).

    Every "## " heading starts a new document block that runs until the
    next heading; lines before the first heading form the preamble.
    """
    preamble: list[str] = []
    blocks: list[list[str]] = []
    for line in md_text.splitlines():
        if line.startswith("## "):
            blocks.append([line])
        elif blocks:
            blocks[-1].append(line)
        else:
            preamble.append(line)
    return preamble, ["\n".join(block) for block in blocks]
||||||
def parse_document_block(block: str) -> DocumentItem:
    """Parse one "## " document block into a DocumentItem.

    The first line is the document header; a backticked path inside it,
    if present, becomes the document path.  "### " lines open sections
    and non-empty "- " lines under them become questions.
    """
    all_lines = block.splitlines()
    header_line = all_lines[0].strip()
    path_match = re.search(r"`([^`]+)`", header_line)

    sections: list[tuple[str, list[QuestionItem]]] = []
    section_name = ""
    section_questions: list[QuestionItem] = []
    for raw in all_lines[1:]:
        if raw.startswith("### "):
            # Flush the previous section before opening a new one.
            if section_name:
                sections.append((section_name, section_questions))
            section_name = raw[4:].strip()
            section_questions = []
        elif raw.startswith("- "):
            text = raw[2:].strip()
            if text:
                section_questions.append(
                    QuestionItem(section=section_name, question=text)
                )
    if section_name:
        sections.append((section_name, section_questions))

    return DocumentItem(
        header=header_line,
        path=path_match.group(1) if path_match else "",
        sections=sections,
    )
||||||
def tokenize(text: str) -> list[str]:
    """Lowercase *text* and return word tokens longer than two characters,
    excluding Russian stopwords."""
    return [
        token
        for token in re.findall(r"[A-Za-zА-Яа-я0-9_]+", text.lower())
        if len(token) > 2 and token not in STOPWORDS_RU
    ]
||||||
def score_answer(question: str, answer: str) -> tuple[float, dict[str, float]]:
    """Score *answer* against *question* with a heuristic rubric.

    Returns ``(total, parts)`` where *total* is clamped to [0, 1] and
    *parts* exposes the components: token overlap with the question,
    length, specificity (numbers / name-like runs), and structure.
    Answers containing error/refusal markers receive a 0.6 penalty.
    """
    answer = (answer or "").strip()
    if not answer:
        return 0.0, {"len": 0.0, "overlap": 0.0, "specificity": 0.0, "structure": 0.0}

    q_tokens = set(tokenize(question))
    a_token_set = set(tokenize(answer))
    overlap = (len(q_tokens & a_token_set) / max(1, len(q_tokens))) if q_tokens else 0.0

    # Reward informative length up to ~500 chars; damp very long answers.
    length_score = min(1.0, len(answer) / 500.0)
    if len(answer) > 2800:
        length_score *= 0.85

    # Specificity: counts of numbers and capitalized Cyrillic name-like runs.
    numbers = len(re.findall(r"\b\d+(?:[.,]\d+)?\b", answer))
    cyr_names = len(re.findall(r"[А-ЯЁ][а-яё]{2,}(?:\s+[А-ЯЁ][а-яё]{2,}){0,2}", answer))
    specificity = min(1.0, (numbers * 0.08) + (cyr_names * 0.05))

    # Structure: bullet/numbered lists plus sentence count.
    bullet_like = 1.0 if re.search(r"(^|\n)\s*(?:\d+\.|-)\s+", answer) else 0.0
    sentence_count = len(re.findall(r"[.!?]", answer))
    structure = min(1.0, bullet_like * 0.5 + min(0.5, sentence_count / 6.0))

    # BUGFIX: the original pattern ended with \b, which can never match the
    # Russian stems "ошибк"/"исключени" — the character that follows the stem
    # (e.g. the "а" in "ошибка") is a word character, so no boundary exists
    # there.  Stems are now anchored only at the start.
    refusal_penalty = 0.0
    if re.search(
        r"\b(ошибк|error|не удалось|failed|исключени|exception)", answer.lower()
    ):
        refusal_penalty = 0.6

    total = (
        (0.38 * overlap)
        + (0.26 * length_score)
        + (0.20 * specificity)
        + (0.16 * structure)
        - refusal_penalty
    )
    total = max(0.0, min(1.0, total))
    return total, {
        "len": length_score,
        "overlap": overlap,
        "specificity": specificity,
        "structure": structure,
    }
||||||
def compare_answers(
    question: str, lc_answer: str, li_answer: str
) -> tuple[str, float, float, str]:
    """Score both answers and pick a winner.

    Returns ``(winner, langchain_score, llamaindex_score, rationale)``;
    score differences smaller than 0.04 are declared a "Tie".
    """
    lc_score, lc_parts = score_answer(question, lc_answer)
    li_score, li_parts = score_answer(question, li_answer)

    delta = lc_score - li_score
    if abs(delta) < 0.04:
        winner = "Tie"
    else:
        winner = "LangChain" if delta > 0 else "LlamaIndex"

    rationale = (
        f"LC(overlap={lc_parts['overlap']:.2f}, len={lc_parts['len']:.2f}, spec={lc_parts['specificity']:.2f}, "
        f"struct={lc_parts['structure']:.2f}) vs "
        f"LI(overlap={li_parts['overlap']:.2f}, len={li_parts['len']:.2f}, spec={li_parts['specificity']:.2f}, "
        f"struct={li_parts['structure']:.2f})"
    )
    return winner, lc_score, li_score, rationale
||||||
def call_langchain(query: str, timeout: int) -> str:
    """POST *query* to the LangChain test endpoint and return its response text.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    response = requests.post(LANGCHAIN_URL, json={"query": query}, timeout=timeout)
    response.raise_for_status()
    return str(response.json().get("response", "")).strip()
||||||
def call_llamaindex(query: str, timeout: int) -> str:
    """POST *query* to the LlamaIndex test endpoint (agent mode) and return
    its response text.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    response = requests.post(
        LLAMAINDEX_URL, json={"query": query, "mode": "agent"}, timeout=timeout
    )
    response.raise_for_status()
    return str(response.json().get("response", "")).strip()
||||||
def truncate(text: str, max_len: int = 1400) -> str:
    """Strip *text* and cut it to *max_len* characters, marking any cut."""
    text = (text or "").strip()
    return text if len(text) <= max_len else text[:max_len] + "... [truncated]"
||||||
def format_batch_summary(
    batch_docs: list[DocumentItem],
    batch_idx: int,
    docs_in_batch: int,
) -> str:
    """Render a markdown summary (win counts, averages, ranking) for one batch."""
    wins = {"LangChain": 0, "LlamaIndex": 0, "Tie": 0}
    scores_lc: list[float] = []
    scores_li: list[float] = []
    for doc in batch_docs:
        for _, section_questions in doc.sections:
            for item in section_questions:
                wins[item.winner] += 1
                scores_lc.append(item.langchain_score)
                scores_li.append(item.llamaindex_score)

    question_count = len(scores_lc)
    avg_lc = sum(scores_lc) / max(1, len(scores_lc))
    avg_li = sum(scores_li) / max(1, len(scores_li))

    # Averages within 0.01 of each other count as a tie.
    if avg_lc > avg_li + 0.01:
        ranking = "LangChain"
    elif avg_li > avg_lc + 0.01:
        ranking = "LlamaIndex"
    else:
        ranking = "Tie"

    summary = [
        f"## Batch {batch_idx} Summary",
        "",
        f"- Documents processed in this batch: {docs_in_batch}",
        f"- Questions processed in this batch: {question_count}",
        f"- LangChain wins: {wins['LangChain']}",
        f"- LlamaIndex wins: {wins['LlamaIndex']}",
        f"- Ties: {wins['Tie']}",
        f"- Average score LangChain: {avg_lc:.3f}",
        f"- Average score LlamaIndex: {avg_li:.3f}",
        f"- Final ranking for this batch: {ranking}",
        "",
        "_Scoring note: relative heuristic rubric (query overlap, informativeness, specificity, structure), "
        "used only for side-by-side ranking in this batch._",
        "",
    ]
    return "\n".join(summary)
||||||
def render_document_with_results(doc: DocumentItem, with_results: bool) -> str:
    """Render one document back to markdown.

    With *with_results* set, each question is followed by both answers
    (truncated), the verdict, and the rationale; otherwise a placeholder
    note marks the document as not yet processed.
    """
    out = [doc.header, ""]
    for section_name, section_questions in doc.sections:
        out.append(f"### {section_name}")
        for item in section_questions:
            out.append(f"- {item.question}")
            if with_results:
                out.append("")
                out.append(" - `LangChain Answer`:")
                out.append(f" {truncate(item.langchain_answer)}")
                out.append(" - `LlamaIndex Answer`:")
                out.append(f" {truncate(item.llamaindex_answer)}")
                out.append(
                    f" - `Result`: winner={item.winner}, "
                    f"score_langchain={item.langchain_score:.3f}, score_llamaindex={item.llamaindex_score:.3f}"
                )
                out.append(f" - `Rationale`: {item.rationale}")
        out.append("")
    if not with_results:
        out.append("_Batch 1 status: not processed yet._")
        out.append("")
    return "\n".join(out)
||||||
def main() -> int:
    """Run one evaluation batch end to end.

    Reads the questions markdown, queries both RAG services for every
    question in the selected batch, scores and compares the answers, and
    rewrites the full report — results are rendered only for documents
    belonging to the current batch.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-docs", type=int, default=10)
    parser.add_argument("--batch-index", type=int, default=1)
    parser.add_argument("--timeout", type=int, default=120)
    args = parser.parse_args()

    header_lines, doc_blocks = split_documents(INPUT_MD.read_text(encoding="utf-8"))
    docs = [parse_document_block(block) for block in doc_blocks]

    start = (args.batch_index - 1) * args.batch_docs
    end = start + args.batch_docs
    if start >= len(docs):
        raise RuntimeError("Batch start is beyond available documents")

    batch_docs = docs[start:end]
    total_questions = sum(len(qs) for d in batch_docs for _, qs in d.sections)
    q_index = 0

    for doc in batch_docs:
        for _, questions in doc.sections:
            for q in questions:
                q_index += 1
                print(f"[{q_index:03d}/{total_questions}] {q.question}")
                # Each service is queried independently; a failure on one
                # side is recorded as the answer text so that scoring can
                # penalize it instead of aborting the whole batch.
                try:
                    t0 = time.time()
                    q.langchain_answer = call_langchain(q.question, timeout=args.timeout)
                    print(
                        f" -> LangChain OK in {time.time() - t0:.1f}s "
                        f"(chars={len(q.langchain_answer)})"
                    )
                except Exception as e:
                    q.langchain_answer = f"ERROR: {e}"
                    print(f" -> LangChain ERROR: {e}")
                try:
                    t0 = time.time()
                    q.llamaindex_answer = call_llamaindex(q.question, timeout=args.timeout)
                    print(
                        f" -> LlamaIndex OK in {time.time() - t0:.1f}s "
                        f"(chars={len(q.llamaindex_answer)})"
                    )
                except Exception as e:
                    q.llamaindex_answer = f"ERROR: {e}"
                    print(f" -> LlamaIndex ERROR: {e}")

                (
                    q.winner,
                    q.langchain_score,
                    q.llamaindex_score,
                    q.rationale,
                ) = compare_answers(q.question, q.langchain_answer, q.llamaindex_answer)

    # Rebuild the whole report: preamble, batch summary, then every
    # document (results shown only for the documents in this batch).
    output_parts: list[str] = [*header_lines, ""]
    output_parts.append(
        format_batch_summary(batch_docs, args.batch_index, len(batch_docs))
    )
    for i, doc in enumerate(docs):
        output_parts.append(
            render_document_with_results(doc, with_results=start <= i < end)
        )

    OUTPUT_MD.write_text("\n".join(output_parts).rstrip() + "\n", encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
||||||
BIN
services/rag/.DS_Store
vendored
BIN
services/rag/.DS_Store
vendored
Binary file not shown.
@@ -20,6 +20,9 @@ from vector_storage import initialize_vector_store
|
|||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
CHAT_REQUEST_TIMEOUT_SECONDS = float(os.getenv("CHAT_REQUEST_TIMEOUT_SECONDS", "45"))
|
||||||
|
CHAT_MAX_RETRIES = int(os.getenv("CHAT_MAX_RETRIES", "0"))
|
||||||
|
|
||||||
|
|
||||||
def get_llm_model_info(
|
def get_llm_model_info(
|
||||||
llm_model: Optional[str] = None,
|
llm_model: Optional[str] = None,
|
||||||
@@ -149,10 +152,12 @@ def create_chat_agent(
|
|||||||
openai_api_base=base_url_or_api_base,
|
openai_api_base=base_url_or_api_base,
|
||||||
openai_api_key=api_key,
|
openai_api_key=api_key,
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
|
request_timeout=CHAT_REQUEST_TIMEOUT_SECONDS,
|
||||||
|
max_retries=CHAT_MAX_RETRIES,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Using OpenAI-compatible model: {model_name} via {base_url_or_api_base}"
|
f"Using OpenAI-compatible model: {model_name} via {base_url_or_api_base}, timeout={CHAT_REQUEST_TIMEOUT_SECONDS}s, retries={CHAT_MAX_RETRIES}"
|
||||||
)
|
)
|
||||||
else: # Default to ollama
|
else: # Default to ollama
|
||||||
# Initialize the Ollama chat model
|
# Initialize the Ollama chat model
|
||||||
@@ -160,9 +165,13 @@ def create_chat_agent(
|
|||||||
model=model_name,
|
model=model_name,
|
||||||
base_url=base_url_or_api_base, # Default Ollama URL
|
base_url=base_url_or_api_base, # Default Ollama URL
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
|
sync_client_kwargs={"timeout": CHAT_REQUEST_TIMEOUT_SECONDS},
|
||||||
|
async_client_kwargs={"timeout": CHAT_REQUEST_TIMEOUT_SECONDS},
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"Using Ollama model: {model_name}")
|
logger.info(
|
||||||
|
f"Using Ollama model: {model_name}, timeout={CHAT_REQUEST_TIMEOUT_SECONDS}s"
|
||||||
|
)
|
||||||
|
|
||||||
# Create the document retrieval tool
|
# Create the document retrieval tool
|
||||||
retrieval_tool = DocumentRetrievalTool()
|
retrieval_tool = DocumentRetrievalTool()
|
||||||
|
|||||||
@@ -25,6 +25,10 @@ OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
|
|||||||
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
|
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
|
||||||
OPENAI_EMBEDDING_BASE_URL = os.getenv("OPENAI_EMBEDDING_BASE_URL")
|
OPENAI_EMBEDDING_BASE_URL = os.getenv("OPENAI_EMBEDDING_BASE_URL")
|
||||||
OPENAI_EMBEDDING_API_KEY = os.getenv("OPENAI_EMBEDDING_API_KEY")
|
OPENAI_EMBEDDING_API_KEY = os.getenv("OPENAI_EMBEDDING_API_KEY")
|
||||||
|
EMBEDDING_REQUEST_TIMEOUT_SECONDS = float(
|
||||||
|
os.getenv("EMBEDDING_REQUEST_TIMEOUT_SECONDS", "30")
|
||||||
|
)
|
||||||
|
EMBEDDING_MAX_RETRIES = int(os.getenv("EMBEDDING_MAX_RETRIES", "0"))
|
||||||
|
|
||||||
|
|
||||||
def initialize_vector_store(
|
def initialize_vector_store(
|
||||||
@@ -53,6 +57,8 @@ def initialize_vector_store(
|
|||||||
model=OPENAI_EMBEDDING_MODEL,
|
model=OPENAI_EMBEDDING_MODEL,
|
||||||
openai_api_base=OPENAI_EMBEDDING_BASE_URL,
|
openai_api_base=OPENAI_EMBEDDING_BASE_URL,
|
||||||
openai_api_key=OPENAI_EMBEDDING_API_KEY,
|
openai_api_key=OPENAI_EMBEDDING_API_KEY,
|
||||||
|
request_timeout=EMBEDDING_REQUEST_TIMEOUT_SECONDS,
|
||||||
|
max_retries=EMBEDDING_MAX_RETRIES,
|
||||||
)
|
)
|
||||||
elif EMBEDDING_STRATEGY == "none":
|
elif EMBEDDING_STRATEGY == "none":
|
||||||
embeddings = None
|
embeddings = None
|
||||||
@@ -63,6 +69,8 @@ def initialize_vector_store(
|
|||||||
embeddings = OllamaEmbeddings(
|
embeddings = OllamaEmbeddings(
|
||||||
model=OLLAMA_EMBEDDING_MODEL,
|
model=OLLAMA_EMBEDDING_MODEL,
|
||||||
base_url="http://localhost:11434", # Default Ollama URL
|
base_url="http://localhost:11434", # Default Ollama URL
|
||||||
|
sync_client_kwargs={"timeout": EMBEDDING_REQUEST_TIMEOUT_SECONDS},
|
||||||
|
async_client_kwargs={"timeout": EMBEDDING_REQUEST_TIMEOUT_SECONDS},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if collection exists and create if needed
|
# Check if collection exists and create if needed
|
||||||
|
|||||||
Reference in New Issue
Block a user