#!/usr/bin/env python3
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import hashlib
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import random
|
|||
|
|
import re
|
|||
|
|
import tempfile
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any
|
|||
|
|
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
from qdrant_client import QdrantClient
|
|||
|
|
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
|
|||
|
|
import requests
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
PyPDFLoader = None
|
|||
|
|
TextLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredWordDocumentLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredPowerPointLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredExcelLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredExcelLoader = None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from langchain_community.document_loaders import UnstructuredODTLoader
|
|||
|
|
except Exception: # pragma: no cover
|
|||
|
|
UnstructuredODTLoader = None
|
|||
|
|
|
|||
|
|
# Repository root: the directory containing this script.
ROOT = Path(__file__).resolve().parent
# Service directories holding the per-pipeline configuration (.env files).
LANGCHAIN_DIR = ROOT / "services" / "rag" / "langchain"
LLAMAINDEX_DIR = ROOT / "services" / "rag" / "llamaindex"
# Input: JSON list of Yandex.Disk remote paths ("disk:/...") to consider.
YADISK_JSON = ROOT / "yadisk_files.json"
# Output: generated markdown file with per-document evaluation questions.
OUTPUT_MD = ROOT / "DOCUMENTS_TO_TEST.md"
|||
|
|
|
|||
|
|
def safe_stem_from_remote(remote_path: str) -> str:
    """Return a filesystem-safe stem derived from the remote path's file name.

    Every character that is not alphanumeric, ``-`` or ``_`` is replaced
    with an underscore; an empty stem falls back to ``"file"``.
    """
    raw_stem = Path(Path(remote_path).name).stem
    if not raw_stem:
        raw_stem = "file"
    sanitized: list[str] = []
    for ch in raw_stem:
        if ch.isalnum() or ch == "-" or ch == "_":
            sanitized.append(ch)
        else:
            sanitized.append("_")
    return "".join(sanitized)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def llama_prefect_filename(remote_path: str) -> str:
    """Reconstruct the local file name the LlamaIndex/Prefect ingest flow
    assigns to a downloaded Yandex.Disk file.

    The name has the shape ``<safe_stem>_<md5-prefix><suffix>``, where the
    digest is the first 10 hex characters of the MD5 of the full remote path.
    """
    base_name = Path(remote_path).name or "downloaded_file"
    extension = Path(base_name).suffix
    short_hash = hashlib.md5(remote_path.encode("utf-8")).hexdigest()[:10]
    return f"{safe_stem_from_remote(remote_path)}_{short_hash}{extension}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_loader(local_path: str):
    """Return a document loader suited to *local_path*'s extension.

    Returns ``None`` when the extension is unsupported or the corresponding
    optional loader dependency failed to import at module load time.
    """
    ext = Path(local_path).suffix.lower()
    # Shared options for all Unstructured-based loaders: high-resolution
    # parsing with Russian language hints. Previously this dict literal was
    # duplicated inline (via **{...}) at every call site.
    unstructured_kwargs = {"strategy": "hi_res", "languages": ["rus"]}
    if ext == ".pdf" and PyPDFLoader is not None:
        return PyPDFLoader(local_path)
    if ext in {".doc", ".docx"} and UnstructuredWordDocumentLoader is not None:
        return UnstructuredWordDocumentLoader(local_path, **unstructured_kwargs)
    if ext == ".pptx" and UnstructuredPowerPointLoader is not None:
        return UnstructuredPowerPointLoader(local_path, **unstructured_kwargs)
    if ext in {".xls", ".xlsx"} and UnstructuredExcelLoader is not None:
        return UnstructuredExcelLoader(local_path, **unstructured_kwargs)
    if ext == ".odt" and UnstructuredODTLoader is not None:
        return UnstructuredODTLoader(local_path, **unstructured_kwargs)
    if ext in {".txt", ".md"} and TextLoader is not None:
        return TextLoader(local_path, encoding="utf-8")
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def supported_loader_extensions() -> set[str]:
    """Return the set of file extensions the currently-importable loaders handle."""
    loader_to_exts = (
        (PyPDFLoader, {".pdf"}),
        (UnstructuredWordDocumentLoader, {".doc", ".docx"}),
        (UnstructuredPowerPointLoader, {".pptx"}),
        (UnstructuredExcelLoader, {".xls", ".xlsx"}),
        (UnstructuredODTLoader, {".odt"}),
        (TextLoader, {".txt", ".md"}),
    )
    supported: set[str] = set()
    for loader, extensions in loader_to_exts:
        if loader is not None:
            supported |= extensions
    return supported
|
|||
|
|
|
|||
|
|
|
|||
|
|
def collect_langchain_paths(client: QdrantClient) -> set[str]:
    """Scroll the ``documents_langchain`` collection and collect every
    distinct source file path found in point payload metadata.

    A path is taken from ``metadata.file_path``, falling back to
    ``metadata.source``; only non-empty strings are kept.
    """
    found: set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name="documents_langchain",
            offset=cursor,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not batch:
            break
        for point in batch:
            meta = (point.payload or {}).get("metadata") or {}
            candidate = meta.get("file_path") or meta.get("source")
            if isinstance(candidate, str) and candidate:
                found.add(candidate)
        if cursor is None:
            break
    return found
|
|||
|
|
|
|||
|
|
|
|||
|
|
def collect_llama_filenames(client: QdrantClient) -> set[str]:
    """Scroll the ``documents_llamaindex`` collection and return every
    distinct non-empty ``filename`` payload value."""
    seen: set[str] = set()
    page_offset = None
    while True:
        records, page_offset = client.scroll(
            collection_name="documents_llamaindex",
            offset=page_offset,
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )
        if not records:
            break
        for record in records:
            fname = (record.payload or {}).get("filename")
            if isinstance(fname, str) and fname:
                seen.add(fname)
        if page_offset is None:
            break
    return seen
|
|||
|
|
|
|||
|
|
|
|||
|
|
def first_unique(matches: list[str], fallback: str) -> str:
    """Return the first entry of *matches* that is non-blank after stripping,
    stripped; return *fallback* when none qualifies."""
    return next(
        (candidate.strip() for candidate in matches if candidate.strip()),
        fallback,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def build_questions(remote_path: str, text: str) -> dict[str, list[str]]:
    """Build evaluation questions for one document, grouped by RAG theme.

    Scans a whitespace-normalized preview of *text* for years, dates,
    numbers, quoted phrases and organization-like mentions, then templates
    Russian-language questions around the first hit of each kind.

    Returns a mapping of section title -> list of question strings.
    """
    # Collapse all whitespace runs and cap the scanned region.
    text = " ".join((text or "").split())
    text_preview = text[:15000]
    # Distinct 4-digit years (1900-2199), sorted ascending.
    years = sorted(
        {
            int(m)
            for m in re.findall(r"\b(19\d{2}|20\d{2}|21\d{2})\b", text_preview)
            if 1900 <= int(m) <= 2199
        }
    )
    # Numeric dates in either day-first or year-first order, with ./- separators.
    dates = re.findall(
        r"\b(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2})\b",
        text_preview,
    )
    # Any standalone number of 2+ digits.
    numbers = re.findall(r"\b\d{2,}\b", text_preview)
    # Phrases inside straight or guillemet quotes (4-120 chars, single line).
    quoted = re.findall(r"[\"«]([^\"»\n]{4,120})[\"»]", text_preview)
    # Russian legal-entity / institution keywords plus a short trailing context.
    org_like = re.findall(
        r"\b(?:ООО|АО|ПАО|ФГУП|Минтранс|Министерств[ао]|Правительств[ао]|Форум)\b[^\n,.]{0,80}",
        text_preview,
        flags=re.IGNORECASE,
    )

    # Each question falls back to a generic phrasing when no match was found.
    year_q = (
        f"В каком году в документе описывается ключевое событие ({years[0]}) и как это подтверждается контекстом?"
        if years
        else "Есть ли в документе указание на год события? Если да, какой именно год упомянут?"
    )
    date_q = (
        f"Какая дата ({dates[0]}) встречается в документе и к какому событию/разделу она относится?"
        if dates
        else "Какие календарные даты или периоды (если есть) упомянуты в документе?"
    )
    num_q = (
        f"Какое числовое значение ({numbers[0]}) встречается в документе и в каком контексте оно используется?"
        if numbers
        else "Есть ли в документе количественные показатели (суммы, проценты, номера, объемы) и что они обозначают?"
    )
    # Entity preference: first quoted phrase, then org-like match, then file name.
    entity = first_unique(quoted, first_unique(org_like, Path(remote_path).name))
    # Topic hint derived from the file stem; normalized and capped at 120 chars.
    topic_hint = Path(remote_path).stem.replace("_", " ").replace("-", " ")
    topic_hint = " ".join(topic_hint.split())[:120]
    # NOTE: the original also computed an unused `entity_q` string here; removed.

    # The "в документе"/"документе" -> "материалах" rewrites generalize the
    # single-document phrasing for querying the whole knowledge base.
    return {
        "Entity/Fact Recall (Response Relevance)": [
            f"Что известно про «{entity}» в материалах базы?",
            f"В контексте темы «{topic_hint}» кто выступает ключевым участником и какова его роль?",
        ],
        "Numerical & Temporal Precision": [
            year_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            date_q.replace("в документе", "в материалах").replace("документе", "материалах"),
            num_q.replace("в документе", "в материалах").replace("документе", "материалах"),
        ],
        "Context Precision (Evidence-anchored)": [
            f"Найди в базе фрагмент, который лучше всего подтверждает тезис по теме «{topic_hint}», и объясни его релевантность.",
            f"Есть ли в базе схожие по теме «{topic_hint}», но нерелевантные фрагменты, которые можно ошибочно выбрать?",
        ],
        "Faithfulness / Non-hallucination": [
            f"Какая информация по теме «{topic_hint}» отсутствует в найденном контексте и не должна быть додумана?",
            f"Если прямого ответа по теме «{topic_hint}» в материалах нет, как корректно ответить без галлюцинаций?",
        ],
        "Reasoning & Synthesis": [
            f"Сформулируй краткий вывод по теме «{topic_hint}» в 2-3 пунктах, опираясь на несколько найденных фрагментов.",
            f"Какие ограничения, риски или условия по теме «{topic_hint}» упоминаются в материалах, и как они влияют на вывод?",
        ],
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_document_text(docs: list[Any]) -> str:
    """Concatenate text from loaded documents, capped at 25 000 characters.

    Each doc may expose its text as ``page_content`` (LangChain) or
    ``text`` (LlamaIndex); blank or non-string content is skipped.
    """
    pieces: list[str] = []
    for document in docs:
        body = getattr(document, "page_content", None)
        if body is None:
            body = getattr(document, "text", None)
        if isinstance(body, str):
            stripped = body.strip()
            if stripped:
                pieces.append(stripped)
        # Stop accumulating once the (space-joined) text exceeds the cap.
        if len(" ".join(pieces)) > 25000:
            break
    return "\n".join(pieces)[:25000]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def download_yadisk_file(remote_path: str, token: str, local_path: str) -> None:
    """Download *remote_path* from Yandex.Disk into *local_path*.

    Resolves a temporary download href via the Disk REST API, then streams
    the file body to disk in chunks so large files never have to fit in
    memory (the previous implementation buffered the whole body via
    ``response.content``).

    Raises:
        requests.HTTPError: if either HTTP call returns an error status.
    """
    headers = {"Authorization": f"OAuth {token}"}
    response = requests.get(
        "https://cloud-api.yandex.net/v1/disk/resources/download",
        headers=headers,
        params={"path": remote_path},
        timeout=30,
    )
    response.raise_for_status()
    href = response.json()["href"]
    with requests.get(href, timeout=180, stream=True) as file_response:
        file_response.raise_for_status()
        with open(local_path, "wb") as f:
            # 1 MiB chunks; skip keep-alive empty chunks.
            for chunk in file_response.iter_content(chunk_size=1 << 20):
                if chunk:
                    f.write(chunk)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def fetch_text_from_yadisk(remote_path: str, token: str) -> str:
    """Download a Yandex.Disk file to a temp path and extract its text.

    Returns ``""`` when no loader supports the file's extension. The temp
    file is always removed, even when downloading or parsing fails.
    """
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=Path(remote_path).suffix
    )
    tmp_path = tmp_file.name
    tmp_file.close()
    try:
        download_yadisk_file(remote_path, token, tmp_path)
        loader = get_loader(tmp_path)
        if loader is not None:
            return extract_document_text(loader.load())
        return ""
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> int:
    """Select 100 Yandex.Disk documents present in BOTH Qdrant collections
    and write per-document RAG evaluation questions to DOCUMENTS_TO_TEST.md.

    Returns:
        0 on success.

    Raises:
        RuntimeError: on missing env vars, a malformed input JSON, or when
            fewer than 100 usable documents are found.
    """
    # Credentials and connection settings come from the langchain service .env.
    load_dotenv(LANGCHAIN_DIR / ".env")
    qdrant_host = os.getenv("QDRANT_HOST")
    qdrant_rest_port = int(os.getenv("QDRANT_REST_PORT", "6333"))
    yadisk_token = os.getenv("YADISK_TOKEN", "").strip()
    if not qdrant_host:
        raise RuntimeError("QDRANT_HOST is missing in langchain .env")
    if not yadisk_token:
        raise RuntimeError("YADISK_TOKEN is missing in langchain .env")

    # Load the candidate remote paths; non-string entries are silently dropped.
    with YADISK_JSON.open("r", encoding="utf-8") as f:
        raw_paths = json.load(f)
    if not isinstance(raw_paths, list):
        raise RuntimeError("yadisk_files.json must be a JSON list of paths")
    all_paths = [str(x) for x in raw_paths if isinstance(x, str)]

    # Keep only "disk:/..." paths whose extension an available loader handles.
    allowed_ext = supported_loader_extensions()
    filtered_by_ext = [
        p for p in all_paths if Path(p).suffix.lower() in allowed_ext and p.startswith("disk:/")
    ]

    # Snapshot what each pipeline has already indexed.
    client = QdrantClient(host=qdrant_host, port=qdrant_rest_port, timeout=60)
    langchain_paths = collect_langchain_paths(client)
    llama_filenames = collect_llama_filenames(client)

    # A candidate must appear in the langchain collection (by raw path) AND
    # in the llamaindex collection (by its reconstructed ingest file name).
    candidates = []
    for path in filtered_by_ext:
        if path not in langchain_paths:
            continue
        if llama_prefect_filename(path) not in llama_filenames:
            continue
        candidates.append(path)

    # Fixed seed => reproducible document selection across runs.
    random.seed(42)
    random.shuffle(candidates)
    if len(candidates) < 100:
        raise RuntimeError(
            f"Only {len(candidates)} candidate documents found in both collections; need 100"
        )

    # Download each candidate and build its question set; skip failures and
    # empty extractions until 100 rows are collected.
    rows: list[dict[str, Any]] = []
    attempts = 0
    for remote_path in candidates:
        if len(rows) >= 100:
            break
        attempts += 1
        idx = len(rows) + 1
        print(f"[TRY {attempts:03d}] loading {remote_path}")
        try:
            text = fetch_text_from_yadisk(remote_path, yadisk_token)
        except Exception as e:
            # Best-effort: a single bad download must not abort the run.
            print(f" -> skip (download/read error): {e}")
            continue
        if not text.strip():
            print(" -> skip (empty extracted text)")
            continue
        rows.append(
            {
                "index": idx,
                "path": remote_path,
                "questions": build_questions(remote_path, text),
            }
        )
        print(f"[OK {idx:03d}/100] prepared questions for {remote_path}")

    if len(rows) < 100:
        raise RuntimeError(
            f"Only {len(rows)} documents were successfully downloaded/read and turned into questions"
        )

    # Render the markdown report: a fixed header followed by one section
    # per document with its themed question lists.
    lines: list[str] = []
    lines.append("# DOCUMENTS_TO_TEST")
    lines.append("")
    lines.append("This dataset contains 100 YaDisk documents that were verified as present in both:")
    lines.append("- `documents_langchain` (Qdrant)")
    lines.append("- `documents_llamaindex` (Qdrant)")
    lines.append("")
    lines.append("Question sections are aligned with common RAG evaluation themes (retrieval + generation):")
    lines.append("- Response relevance / entity-fact recall")
    lines.append("- Numerical and temporal precision")
    lines.append("- Context precision")
    lines.append("- Faithfulness / non-hallucination")
    lines.append("- Reasoning / synthesis")
    lines.append("")
    lines.append(
        "_References used for evaluation themes: RAGAS metrics and NVIDIA RAG pipeline evaluation docs._"
    )
    lines.append("")

    for row in rows:
        lines.append(f"## {row['index']:03d}. `{row['path']}`")
        lines.append("")
        for section, qs in row["questions"].items():
            lines.append(f"### {section}")
            for q in qs:
                lines.append(f"- {q}")
            lines.append("")

    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"Written: {OUTPUT_MD}")
    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|