#!/usr/bin/env python3
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import hashlib
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import random
|
|||
|
|
import re
|
|||
|
|
import tempfile
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any
|
|||
|
|
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
from qdrant_client import QdrantClient
|
|||
|
|
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
|
|||
|
|
import requests
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
PyPDFLoader = None
|
|||
|
|
TextLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredWordDocumentLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredPowerPointLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredExcelLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredExcelLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredODTLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredODTLoader = None
|
|||
|
|
|
|||
|
|
# Repository root: the directory containing this script.
ROOT = Path(__file__).resolve().parent
# Service directories holding the per-pipeline configuration (.env files).
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
# Input: JSON list of Yandex.Disk remote paths ("disk:/...") to consider.
YADISK_JSON = ROOT / "yadisk_files.json"
# Output: generated markdown file with per-document evaluation questions.
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
|||
|
|
|
|||
|
|
def safe_stem_from_remote(remote_path: str) -> str:
    """Return a filesystem-safe stem derived from the remote path's file name.

    Every character that is not alphanumeric, ``-`` or ``_`` is replaced
    with an underscore; an empty stem falls back to ``"file"``.
    """
    raw_stem = Path(Path(remote_path).name).stem
    if not raw_stem:
        raw_stem = "file"
    sanitized: list[str] = []
    for ch in raw_stem:
        if ch.isalnum() or ch == "-" or ch == "_":
            sanitized.append(ch)
        else:
            sanitized.append("_")
    return "".join(sanitized)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def llama_prefect_filename(remote_path: str) -> str:
    """Reconstruct the local file name the LlamaIndex/Prefect ingest flow
    assigns to a downloaded Yandex.Disk file.

    The name has the shape ``<safe_stem>_<md5-prefix><suffix>``, where the
    digest is the first 10 hex characters of the MD5 of the full remote path.
    """
    base_name = Path(remote_path).name or "downloaded_file"
    extension = Path(base_name).suffix
    short_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10]
    return f"{safe_stem_from_remote(remote_path)}_{short_hash}{extension}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_loader(local_path: str):
    """Return a document loader suited to *local_path*'s extension.

    Returns ``None`` when the extension is unsupported or the corresponding
    optional loader dependency failed to import at module load time.
    """
    ext = Path(local_path).suffix.lower()
    # Shared options for all Unstructured-based loaders: high-resolution
    # parsing with Russian language hints. Previously this dict literal was
    # duplicated inline (via **{...}) at every call site.
    unstructured_kwargs = {"strategy": "hi_res", "languages": ["rus"]}
    if ext == ".pdf" and PyPDFLoader is not None:
        return PyPDFLoader(local_path)
    if ext in {".doc", ".docx"} and UnstructuredWordDocumentLoader is not None:
        return UnstructuredWordDocumentLoader(local_path, **unstructured_kwargs)
    if ext == ".pptx" and UnstructuredPowerPointLoader is not None:
        return UnstructuredPowerPointLoader(local_path, **unstructured_kwargs)
    if ext in {".xls", ".xlsx"} and UnstructuredExcelLoader is not None:
        return UnstructuredExcelLoader(local_path, **unstructured_kwargs)
    if ext == ".odt" and UnstructuredODTLoader is not None:
        return UnstructuredODTLoader(local_path, **unstructured_kwargs)
    if ext in {".txt", ".md"} and TextLoader is not None:
        return TextLoader(local_path, encoding="utf-8")
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def supported_loader_extensions() -> set[str]:
    """Return the set of file extensions the currently-importable loaders handle."""
    loader_to_exts = (
        (PyPDFLoader, {".pdf"}),
        (UnstructuredWordDocumentLoader, {".doc", ".docx"}),
        (UnstructuredPowerPointLoader, {".pptx"}),
        (UnstructuredExcelLoader, {".xls", ".xlsx"}),
        (UnstructuredODTLoader, {".odt"}),
        (TextLoader, {".txt", ".md"}),
    )
    supported: set[str] = set()
    for loader, extensions in loader_to_exts:
        if loader is not None:
            supported |= extensions
    return supported
|
|||
|
|
|
|||
|
|
|
|||
|
|
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Scroll the ``documents_langchain`` collection and collect every
    distinct source file path found in point payload metadata.

    A path is taken from ``metadata.file_path``, falling back to
    ``metadata.source``; only non-empty strings are kept.
    """
    found: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_langchain",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            meta = (point.payload or {}).get("metadata") or {}
            candidate = meta.get("file_path") or meta.get("source")
            if isinstance(candidate, str) and candidate:
                found.add(candidate)
        if cursor is None:
            break
    return found
|
|||
|
|
|
|||
|
|
|
|||
|
|
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Scroll the ``documents_llamaindex`` collection and return every
    distinct non-empty ``filename`` payload value."""
    seen: set[str] = set()
    page_offset = None
    while True:
        records, page_offset = client.scroll(
            collection_name="documents_llamaindex",
            offset=page_offset,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not records:
            break
        for record in records:
            fname = (record.payload or {}).get("filename")
            if isinstance(fname, str) and fname:
                seen.add(fname)
        if page_offset is None:
            break
    return seen
|
|||
|
|
|
|||
|
|
|
|||
|
|
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of *matches* that is non-blank after stripping,
    stripped; return *fallback* when none qualifies."""
    return next(
        (candidate.strip() for candidate in matches if candidate.strip()),
        fallback,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Build evaluation questions for one document, grouped by RAG theme.

    Scans a whitespace-normalized preview of *text* for years, dates,
    numbers, quoted phrases and organization-like mentions, then templates
    Russian-language questions around the first hit of each kind.

    Returns a mapping of section title -> list of question strings.
    """
    # Collapse all whitespace runs and cap the scanned region.
    text = " ".join((text or "").split())
    text_preview = text[:15000]
    # Distinct 4-digit years (1900-2199), sorted ascending.
    years = sorted(
        {
            int(m)
            for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)
            if 1900 <= int(m) <= 2199
        }
    )
    # Numeric dates in either day-first or year-first order, with ./- separators.
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    # Any standalone number of 2+ digits.
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    # Phrases inside straight or guillemet quotes (4-120 chars, single line).
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    # Russian legal-entity / institution keywords plus a short trailing context.
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )

    # Each question falls back to a generic phrasing when no match was found.
    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )
    # Entity preference: first quoted phrase, then org-like match, then file name.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    # Topic hint derived from the file stem; normalized and capped at 120 chars.
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]
    # NOTE: the original also computed an unused `entity_q` string here; removed.

    # The "в документе"/"документе" -> "материалах" rewrites generalize the
    # single-document phrasing for querying the whole knowledge base.
    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            year_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            date_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            num_q.replace("в документе", "в материалах").replace("документе", "материалах"),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_document_text(docs: list[Any]) -> str:
    """Concatenate text from loaded documents, capped at 25 000 characters.

    Each doc may expose its text as ``page_content`` (LangChain) or
    ``text`` (LlamaIndex); blank or non-string content is skipped.
    """
    pieces: list[str] = []
    for document in docs:
        body = getattr(document, "page_content", None)
        if body is None:
            body = getattr(document, "text", None)
        if isinstance(body, str):
            stripped = body.strip()
            if stripped:
                pieces.append(stripped)
        # Stop accumulating once the (space-joined) text exceeds the cap.
        if len(" ".join(pieces)) > 25000:
            break
    return "\n".join(pieces)[:25000]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download *remote_path* from Yandex.Disk into *local_path*.

    Resolves a temporary download href via the Disk REST API, then streams
    the file body to disk in chunks so large files never have to fit in
    memory (the previous implementation buffered the whole body via
    ``response.content``).

    Raises:
        requests.HTTPError: if either HTTP call returns an error status.
    """
    headers = {"Authorization": f"OAuth {token}"}
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=30,
    )
    response.raise_for_status()
    href = response.json()["href"]
    with requests.get(href, timeout=180, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            # 1 MiB chunks; skip keep-alive empty chunks.
            for chunk in file_response.iter_content(chunk_size=1 << 20):
                if chunk:
                    f.write(chunk)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download a Yandex.Disk file to a temp path and extract its text.

    Returns ``""`` when no loader supports the file's extension. The temp
    file is always removed, even when downloading or parsing fails.
    """
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=Path(remote_path).suffix
    )
    tmp_path = tmp_file.name
    tmp_file.close()
    try:
        download_yadisk_file(remote_path, token, tmp_path)
        loader = get_loader(tmp_path)
        if loader is not None:
            return extract_document_text(loader.load())
        return ""
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> int:
    """Select 100 Yandex.Disk documents present in BOTH Qdrant collections
    and write per-document RAG evaluation questions to DOCUMENTS_TO_TEST.md.

    Returns:
        0 on success.

    Raises:
        RuntimeError: on missing env vars, a malformed input JSON, or when
            fewer than 100 usable documents are found.
    """
    # Credentials and connection settings come from the langchain service .env.
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")

    # Load the candidate remote paths; non-string entries are silently dropped.
    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [str(x) for x in raw_paths if isinstance(x, str)]

    # Keep only "disk:/..." paths whose extension an available loader handles.
    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]

    # Snapshot what each pipeline has already indexed.
    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)

    # A candidate must appear in the langchain collection (by raw path) AND
    # in the llamaindex collection (by its reconstructed ingest file name).
    candidates = []
    for path in filtered_by_ext:
        if path not in langchain_paths:
            continue
        if llama_prefect_filename(path) not in llama_filenames:
            continue
        candidates.append(path)

    # Fixed seed => reproducible document selection across runs.
    random.seed(42)
    random.shuffle(candidates)
    if len(candidates) < 100:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need 100"
        )

    # Download each candidate and build its question set; skip failures and
    # empty extractions until 100 rows are collected.
    rows: list[dict[str, Any]] = []
    attempts = 0
    for remote_path in candidates:
        if len(rows) >= 100:
            break
        attempts += 1
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            # Best-effort: a single bad download must not abort the run.
            print(f" -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print(" -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/100] prepared questions for {remote_path}")

    if len(rows) < 100:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )

    # Render the markdown report: a fixed header followed by one section
    # per document with its themed question lists.
    lines: list[str] = []
    lines.append("# DOCUMENTS_TO_TEST")
    lines.append("")
    lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:")
    lines.append("- `documents_langchain` (Qdrant)")
    lines.append("- `documents_llamaindex` (Qdrant)")
    lines.append("")
    lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):")
    lines.append("- Response relevance / entity-fact recall")
    lines.append("- Numerical and temporal precision")
    lines.append("- Context precision")
    lines.append("- Faithfulness / non-hallucination")
    lines.append("- Reasoning / synthesis")
    lines.append("")
    lines.append(
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._"
    )
    lines.append("")

    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            for q in qs:
                lines.append(f"- {q}")
            lines.append("")

    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|