#!/usr/bin/env python3 from __future__ import annotations import hashlib import json import os import random import re import tempfile from collections import defaultdict from pathlib import Path from typing import Any from dotenv import load_dotenv from qdrant_client import QdrantClient from qdrant_client.http.models import Filter, FieldCondition, MatchValue import requests try: from langchain_community.document_loaders import PyPDFLoader, TextLoader except Exception: # pragma: no cover PyPDFLoader = None TextLoader = None try: from langchain_community.document_loaders import UnstructuredWordDocumentLoader except Exception: # pragma: no cover UnstructuredWordDocumentLoader = None try: from langchain_community.document_loaders import UnstructuredPowerPointLoader except Exception: # pragma: no cover UnstructuredPowerPointLoader = None try: from langchain_community.document_loaders import UnstructuredExcelLoader except Exception: # pragma: no cover UnstructuredExcelLoader = None try: from langchain_community.document_loaders import UnstructuredODTLoader except Exception: # pragma: no cover UnstructuredODTLoader = None ROOT = Path(__file__).resolve().parent LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain" LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex" YADISK_JSON = ROOT / "yadisk_files.json" OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md" def safe_stem_from_remote(remote_path: str) -> str: stem = Path(Path(remote_path).name).stem or "file" return "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in stem) def llama_prefect_filename(remote_path: str) -> str: remote_name = Path(remote_path).name or "downloaded_file" suffix = Path(remote_name).suffix digest = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10] return f"{safe_stem_from_remote(remote_path)}_{digest}{suffix}" def get_loader(local_path: str): ext = Path(local_path).suffix.lower() if ext == ".pdf" and PyPDFLoader is not None: return PyPDFLoader(local_path) if ext in {".doc", ".docx"} and UnstructuredWordDocumentLoader is not None: return UnstructuredWordDocumentLoader( local_path, **{"strategy": "hi_res", "languages": ["rus"]} ) if ext == ".pptx" and UnstructuredPowerPointLoader is not None: return UnstructuredPowerPointLoader( local_path, **{"strategy": "hi_res", "languages": ["rus"]} ) if ext in {".xls", ".xlsx"} and UnstructuredExcelLoader is not None: return UnstructuredExcelLoader( local_path, **{"strategy": "hi_res", "languages": ["rus"]} ) if ext == ".odt" and UnstructuredODTLoader is not None: return UnstructuredODTLoader( local_path, **{"strategy": "hi_res", "languages": ["rus"]} ) if ext in {".txt", ".md"} and TextLoader is not None: return TextLoader(local_path, encoding="utf-8") return None def supported_loader_extensions() -> set[str]: exts = set() if PyPDFLoader is not None: exts.add(".pdf") if UnstructuredWordDocumentLoader is not None: exts.update({".doc", ".docx"}) if UnstructuredPowerPointLoader is not None: exts.add(".pptx") if UnstructuredExcelLoader is not None: exts.update({".xls", ".xlsx"}) if UnstructuredODTLoader is not None: exts.add(".odt") if TextLoader is not None: exts.update({".txt", ".md"}) return exts def collect_langchain_paths(client: QdrantClient) -> set[str]: paths: set[str] = set() offset = None while True: points, offset = client.scroll( collection_name="documents_langchain", offset=offset, limit=1000, with_payload=True, with_vectors=False, ) if not points: break for p in points: payload = p.payload or {} md = payload.get("metadata") or {} fp = md.get("file_path") or md.get("source") if isinstance(fp, str) and fp: paths.add(fp) if offset is None: break return paths def collect_llama_filenames(client: QdrantClient) -> set[str]: names: set[str] = set() offset = None while True: points, offset = client.scroll( collection_name="documents_llamaindex", offset=offset, limit=1000, with_payload=True, with_vectors=False, ) if not points: break for p in points: payload = p.payload or {} name = payload.get("filename") if isinstance(name, str) and name: names.add(name) if offset is None: break return names def first_unique(matches: list[str], fallback: str) -> str: for m in matches: m = m.strip() if m: return m return fallback def build_questions(remote_path: str, text: str) -> dict[str, list[str]]: text = " ".join((text or "").split()) text_preview = text[:15000] years = sorted( { int(m) for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview) if 1900 <= int(m) <= 2199 } ) dates = re.findall( r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b", text_preview, ) numbers = re.findall(r"\b\d{2,}\b", text_preview) quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview) org_like = re.findall( r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}", text_preview, flags=re.IGNORECASE, ) year_q = ( f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?" if years else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?" ) date_q = ( f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?" if dates else "Какие календарные даты или периоды (если есть) упомянуты в документе?" ) num_q = ( f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?" if numbers else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?" ) entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name)) topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ") topic_hint = " ".join(topic_hint.split())[:120] entity_q = f"Что в документе говорится про «{entity}»?" return { "Entity/Fact Recall (Response Relevance)": [ f"Что известно про «{entity}» в материалах базы?", f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?", ], "Numerical & Temporal Precision": [ year_q.replace("в документе", "в материалах").replace("документе", "материалах"), date_q.replace("в документе", "в материалах").replace("документе", "материалах"), num_q.replace("в документе", "в материалах").replace("документе", "материалах"), ], "Context Precision (Evidence-anchored)": [ f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.", f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?", ], "Faithfulness / Non-hallucination": [ f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?", f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?", ], "Reasoning & Synthesis": [ f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.", f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?", ], } def extract_document_text(docs: list[Any]) -> str: chunks: list[str] = [] for doc in docs: content = getattr(doc, "page_content", None) if content is None: content = getattr(doc, "text", None) if isinstance(content, str) and content.strip(): chunks.append(content.strip()) if len(" ".join(chunks)) > 25000: break return "\n".join(chunks)[:25000] def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None: headers = {"Authorization": f"OAuth {token}"} response = requests.get( "https://cloud-api.yandex.net/v1/disk/resources/download", headers=headers, params={"path": remote_path}, timeout=30, ) response.raise_for_status() href = response.json()["href"] file_response = requests.get(href, timeout=180) file_response.raise_for_status() with open(local_path, "wb") as f: f.write(file_response.content) def fetch_text_from_yadisk(remote_path: str, token: str) -> str: suffix = Path(remote_path).suffix with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: local_path = tmp.name try: download_yadisk_file(remote_path, token, local_path) loader = get_loader(local_path) if loader is None: return "" docs = loader.load() return extract_document_text(docs) finally: if os.path.exists(local_path): os.unlink(local_path) def main() -> int: load_dotenv(LANGCHAIN_DIR / ".env") qdrant_host = os.getenv("QDRANT_HOST") qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333")) yadisk_token = os.getenv("YADISK_TOKEN", "").strip() if not qdrant_host: raise RuntimeError("QDRANT_HOST is missing in langchain .env") if not yadisk_token: raise RuntimeError("YADISK_TOKEN is missing in langchain .env") with YADISK_JSON.open("r", encoding="utf-8") as f: raw_paths = json.load(f) if not isinstance(raw_paths, list): raise RuntimeError("yadisk_files.json must be a JSON list of paths") all_paths = [str(x) for x in raw_paths if isinstance(x, str)] allowed_ext = supported_loader_extensions() filtered_by_ext = [ p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/") ] client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60) langchain_paths = collect_langchain_paths(client) llama_filenames = collect_llama_filenames(client) candidates = [] for path in filtered_by_ext: if path not in langchain_paths: continue if llama_prefect_filename(path) not in llama_filenames: continue candidates.append(path) random.seed(42) random.shuffle(candidates) if len(candidates) < 100: raise RuntimeError( f"Only {len(candidates)} candidate documents found in both collections; need 100" ) rows: list[dict[str, Any]] = [] attempts = 0 for remote_path in candidates: if len(rows) >= 100: break attempts += 1 idx = len(rows) + 1 print(f"[TRY {attempts:03d}] loading {remote_path}") try: text = fetch_text_from_yadisk(remote_path, yadisk_token) except Exception as e: print(f" -> skip (download/read error): {e}") continue if not text.strip(): print(" -> skip (empty extracted text)") continue rows.append( { "index": idx, "path": remote_path, "questions": build_questions(remote_path, text), } ) print(f"[OK {idx:03d}/100] prepared questions for {remote_path}") if len(rows) < 100: raise RuntimeError( f"Only {len(rows)} documents were successfully downloaded/read and turned into questions" ) lines: list[str] = [] lines.append("# DOCUMENTS_TO_TEST") lines.append("") lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:") lines.append("- `documents_langchain` (Qdrant)") lines.append("- `documents_llamaindex` (Qdrant)") lines.append("") lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):") lines.append("- Response relevance / entity-fact recall") lines.append("- Numerical and temporal precision") lines.append("- Context precision") lines.append("- Faithfulness / non-hallucination") lines.append("- Reasoning / synthesis") lines.append("") lines.append( "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._" ) lines.append("") for row in rows: lines.append(f"## {row['index']:03d}. `{row['path']}`") lines.append("") for section, qs in row["questions"].items(): lines.append(f"### {section}") for q in qs: lines.append(f"- {q}") lines.append("") OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8") print(f"Written: {OUTPUT_MD}") return 0 if __name__ == "__main__": raise SystemExit(main())