langchain uploading new way of predefined paths from yandex disk

This commit is contained in:
2026-02-26 00:01:47 +03:00
parent 2c7ab06b3f
commit 3e29ea70ed
7 changed files with 365 additions and 17 deletions

View File

@@ -15,3 +15,4 @@ ENRICHMENT_PROCESSING_MODE=async/sync
ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5 ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5
ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4 ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4
ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4 ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4
PREFECT_YADISK_ENRICH_CONCURRENCY=8

View File

@@ -85,7 +85,6 @@ During enrichment, we should use adaptive collection from the helpers, for loadi
- [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them. - [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
- [x] Adaptive files has filename in them, so it should be used when extracting metadata - [x] Adaptive files has filename in them, so it should be used when extracting metadata
# Phase 13 (async processing of files) # Phase 13 (async processing of files)
During this Phase we create asynchronous process of enrichment, utilizing async/await During this Phase we create asynchronous process of enrichment, utilizing async/await
@@ -111,3 +110,18 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
- [x] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to yandex disk with yadisk library, analyze everything inside folder `Общая` recursively and store file paths in the ./../../../yadisk_files.json, in array of strings. - [x] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to yandex disk with yadisk library, analyze everything inside folder `Общая` recursively and store file paths in the ./../../../yadisk_files.json, in array of strings.
- [x] In our pefect file add function for flow to serve, as per prefect documentation on serving flows - [x] In our pefect file add function for flow to serve, as per prefect documentation on serving flows
- [x] Tests will be done manually by hand, by executing this script and checking prefect dashboard. No automatical tests needed for this phase. - [x] Tests will be done manually by hand, by executing this script and checking prefect dashboard. No automatical tests needed for this phase.
# Phase 15 (prefect enrichment process for langchain, with predefined values, also removal of non-document formats)
- [x] Remove for now formats, extensions for images of any kind, archives of any kind, and add possible text documents, documents formats, like .txt, .xlsx, etc. in enrichment processes/functions.
- [x] Create prefect client file in `prefect/02_yadisk_predefined_enrich.py`. This file will first load the file from ./../../../yadisk_files.json into an array of paths. After that, the array of paths will be filtered, and only extensions supported in enrichment will be left. After that, the code will iterate through each path in this filtered array, use the yadisk library to download the file, process it for enrichment, and then remove it after processing. There should be statistics for this at runtime, with a progress bar that shows how many files are processed out of how many are left. Also, near the progress bar there should be a counter of errors. Yes, if there is an error, it should be swallowed, even if it is inside a thread or async function.
- [x] For yandex disk integration use library yadisk. In .env file there should be variable YADISK_TOKEN for accessing the needed connection
- [x] Code for loading should be reflected upon, and then reworked so it is done in an async way, with as many simultaneous tasks as possible. The yadisk async integration should be used (async features can be checked here: https://pypi.org/project/yadisk/)
- [x] No tests for code should be done at this phase, all tests will be done manually, because loading of documents can take a long time for automated test.
# Phase 16 (making demo ui scalable)
- [ ] Make the demo-ui window a self-contained and reusable part of HTML + JS. This part will be used for creating the multi-windowed demo UI.
- [ ] Make tabbed UI with top level tabs. First tab exists and is selected. Each tab should have copy of demo ui, meaning the chat window with ability to specify the api url
- [ ] At the end of the tabs there should be button with plus sign, which will add new tab. Tabs to be called by numbers.
- [ ] There should be 3 predefined tabs opened. The first one should have the predefined api url "https://rag.langchain.overwatch.su/api/test-query", the second "https://rag.llamaindex.overwatch.su/api/test-query", the third "https://rag.haystack.overwatch.su/api/test-query"

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from dotenv import load_dotenv from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_text_splitters import RecursiveCharacterTextSplitter
from loguru import logger from loguru import logger
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
) )
SUPPORTED_EXTENSIONS = { SUPPORTED_EXTENSIONS = {
".pdf", ".csv",
".docx",
".doc", ".doc",
".pptx", ".docx",
".xlsx", ".epub",
".xls", ".htm",
# ".jpg", ".html",
# ".jpeg", ".json",
# ".png", ".jsonl",
# ".gif", ".md",
# ".bmp",
# ".tiff",
# ".webp",
".odt", ".odt",
".txt", # this one is obvious but was unexpected to see in data lol ".pdf",
".ppt",
".pptx",
".rtf",
".rst",
".tsv",
".txt",
".xls",
".xlsx",
".xml",
} }
Base = declarative_base() Base = declarative_base()
@@ -261,6 +266,8 @@ class DocumentEnricher:
return UnstructuredODTLoader( return UnstructuredODTLoader(
file_path, **{"strategy": "hi_res", "languages": ["rus"]} file_path, **{"strategy": "hi_res", "languages": ["rus"]}
) )
if ext in [".txt", ".md"]:
return TextLoader(file_path, encoding="utf-8")
return None return None
def _load_one_adaptive_file( def _load_one_adaptive_file(

View File

@@ -0,0 +1,216 @@
"""Prefect flow to enrich Yandex Disk files from a predefined JSON file list."""
from __future__ import annotations
import asyncio
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import List
from dotenv import load_dotenv
from prefect import flow, task
load_dotenv()
# Make the project root importable so `enrichment` / `vector_storage` resolve.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
PREFECT_API_URL = os.getenv("PREFECT_API_URL")
# Token for the Yandex Disk API (required by the enrichment task below).
YADISK_TOKEN = os.getenv("YADISK_TOKEN")
# Upper bound on simultaneous download/enrich operations; defaults to 8.
ENRICH_CONCURRENCY = int(os.getenv("PREFECT_YADISK_ENRICH_CONCURRENCY", "8"))
# JSON array of remote file paths, presumably produced by the earlier
# folder-iteration flow — verify the producing flow writes to the same path.
OUTPUT_FILE_LIST = (PROJECT_ROOT / "../../../yadisk_files.json").resolve()
if PREFECT_API_URL:
    os.environ["PREFECT_API_URL"] = PREFECT_API_URL
class _ProgressTracker:
def __init__(self, total: int):
self.total = total
self.processed = 0
self.errors = 0
self._lock = asyncio.Lock()
async def mark_done(self, error: bool = False):
async with self._lock:
self.processed += 1
if error:
self.errors += 1
self._render()
def _render(self):
total = max(self.total, 1)
width = 30
filled = int(width * self.processed / total)
bar = "#" * filled + "-" * (width - filled)
left = max(self.total - self.processed, 0)
print(
f"\r[{bar}] {self.processed}/{self.total} processed | left: {left} | errors: {self.errors}",
end="",
flush=True,
)
if self.processed >= self.total:
print()
async def _download_yadisk_file(async_disk, remote_path: str, local_path: str) -> None:
    """Download one file from Yandex Disk to *local_path* via the async client."""
    await async_disk.download(remote_path, local_path)
def _process_local_file_for_enrichment(enricher, vector_store, local_path: str, remote_path: str) -> bool:
    """Enrich a single downloaded file into the vector store.

    The file is skipped when its content hash was already processed, when no
    loader supports its extension, or when splitting yields no chunks.
    Returns True when chunks were uploaded, False when the file was skipped.
    """
    remote = Path(remote_path)
    content_hash = enricher._get_file_hash(local_path)

    # Deduplicate by content hash: identical files are enriched only once.
    if enricher._is_document_hash_processed(content_hash):
        return False

    loader = enricher._get_loader_for_extension(local_path)
    if loader is None:
        # No loader configured for this extension.
        return False

    documents = loader.load()
    shared_metadata = {
        "source": remote_path,
        "filename": remote.name,
        "file_path": remote_path,
        "file_size": os.path.getsize(local_path),
        "file_extension": remote.suffix.lower(),
    }
    for document in documents:
        document.metadata.update(shared_metadata)
        # Mirror a loader-provided "page" under the key used downstream.
        if "page" in document.metadata:
            document.metadata["page_number"] = document.metadata["page"]

    chunks = enricher.text_splitter.split_documents(documents)
    if not chunks:
        return False

    # helpers is imported lazily, matching the original module's placement.
    from helpers import extract_russian_event_names, extract_years_from_text

    for chunk in chunks:
        chunk.metadata["years"] = extract_years_from_text(chunk.page_content)
        chunk.metadata["events"] = extract_russian_event_names(chunk.page_content)

    vector_store.add_documents(chunks)
    enricher._mark_document_processed(remote_path, content_hash)
    return True
async def _process_remote_file(async_disk, remote_path: str, semaphore: asyncio.Semaphore, tracker: _ProgressTracker, enricher, vector_store):
    """Download one remote file, enrich it off the event loop, then clean up.

    Concurrency is bounded by *semaphore*; every outcome (success or error)
    is reported to *tracker* so the progress bar always advances.
    """
    async with semaphore:
        temp_path = None
        had_error = False
        try:
            # Keep the original extension so loader selection by suffix works.
            suffix = Path(remote_path).suffix
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                temp_path = tmp_file.name
            await _download_yadisk_file(async_disk, remote_path, temp_path)
            # Enrichment is synchronous, so run it in a worker thread to keep
            # the event loop free for other downloads.
            await asyncio.to_thread(
                _process_local_file_for_enrichment,
                enricher,
                vector_store,
                temp_path,
                remote_path,
            )
        except Exception:
            # Phase requirement: swallow per-file errors and continue processing.
            had_error = True
        finally:
            if temp_path and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except OSError:
                    # A failed cleanup is surfaced through the error counter.
                    had_error = True
            await tracker.mark_done(error=had_error)
@task(name="prefilter_yadisk_file_paths")
def prefilter_yadisk_file_paths() -> List[str]:
    """Read the predefined path list and drop unsupported extensions."""
    # Imported lazily, matching the original module's placement.
    from enrichment import SUPPORTED_EXTENSIONS

    if not OUTPUT_FILE_LIST.exists():
        raise FileNotFoundError(f"File list not found: {OUTPUT_FILE_LIST}")

    with open(OUTPUT_FILE_LIST, "r", encoding="utf-8") as file_handle:
        all_paths = json.load(file_handle)

    def _supported(candidate) -> bool:
        # Suffix match is case-insensitive against the enrichment set.
        return Path(str(candidate)).suffix.lower() in SUPPORTED_EXTENSIONS

    return [candidate for candidate in all_paths if _supported(candidate)]
@task(name="enrich_filtered_yadisk_files_async")
async def enrich_filtered_yadisk_files_async(filtered_paths: List[str]) -> dict:
    """Concurrently download, enrich and clean up the given Yandex Disk paths.

    Returns a summary dict with "total", "processed" and "errors" counters.
    """
    if not YADISK_TOKEN:
        raise ValueError("YADISK_TOKEN is required for Yandex Disk enrichment")
    if not filtered_paths:
        print("No supported files found for enrichment.")
        return {"total": 0, "processed": 0, "errors": 0}

    try:
        import yadisk
    except ImportError as error:
        raise RuntimeError("yadisk package is required for this flow") from error
    # Guard against an installed yadisk version without the async client alias.
    if not hasattr(yadisk, "AsyncYaDisk"):
        raise RuntimeError("Installed yadisk package does not expose AsyncYaDisk")

    from enrichment import DocumentEnricher
    from vector_storage import initialize_vector_store

    store = initialize_vector_store()
    document_enricher = DocumentEnricher(store)
    progress = _ProgressTracker(total=len(filtered_paths))
    gate = asyncio.Semaphore(max(1, ENRICH_CONCURRENCY))

    async with yadisk.AsyncYaDisk(token=YADISK_TOKEN) as client:
        # gather() wraps each coroutine in a task itself, so explicit
        # create_task calls are not needed.
        await asyncio.gather(
            *(
                _process_remote_file(
                    async_disk=client,
                    remote_path=path,
                    semaphore=gate,
                    tracker=progress,
                    enricher=document_enricher,
                    vector_store=store,
                )
                for path in filtered_paths
            )
        )
    return {
        "total": progress.total,
        "processed": progress.processed,
        "errors": progress.errors,
    }
@flow(name="yadisk_predefined_enrich")
async def yadisk_predefined_enrich() -> dict:
    """Prefect flow: filter the predefined path list, then enrich the files."""
    filtered_paths = prefilter_yadisk_file_paths()
    return await enrich_filtered_yadisk_files_async(filtered_paths)
def serve_yadisk_predefined_enrich() -> None:
    """Serve the flow as a long-running Prefect deployment."""
    yadisk_predefined_enrich.serve(name="yadisk-predefined-enrich")
if __name__ == "__main__":
    # PREFECT_SERVE=1 serves the flow via Prefect; otherwise run it once locally.
    serve_mode = os.getenv("PREFECT_SERVE", "0") == "1"
    if serve_mode:
        serve_yadisk_predefined_enrich()
    else:
        asyncio.run(yadisk_predefined_enrich())

View File

@@ -56,3 +56,4 @@ unstructured-pytesseract>=0.3.12
# System and utilities # System and utilities
ollama>=0.3.0 ollama>=0.3.0
prefect>=2.19.0 prefect>=2.19.0
yadisk>=3.4.0

View File

@@ -67,5 +67,5 @@ Chosen data folder: relative ./../../../data - from the current folder
# Phase 11 (http endpoint to retrieve data from the vector storage by query) # Phase 11 (http endpoint to retrieve data from the vector storage by query)
- [ ] Create file `server.py`, with web framework fastapi, for example - [x] Create file `server.py`, with web framework fastapi, for example
- [ ] Add POST endpoint "/api/test-query" which will use agent, and retrieve response for query, sent in JSON format, field "query" - [x] Add POST endpoint "/api/test-query" which will use agent, and retrieve response for query, sent in JSON format, field "query"

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
HTTP API server for querying the vector storage via the existing retrieval pipeline.
"""
from pathlib import Path
import sys
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from loguru import logger
from retrieval import initialize_retriever
load_dotenv()
def setup_logging() -> None:
    """Configure loguru with a rotating file sink and a colorized stdout sink."""
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)
    # Remove loguru's default handler before installing our own sinks.
    logger.remove()
    logger.add(
        # Reuse logs_dir instead of re-hardcoding the "logs/dev.log" path.
        logs_dir / "dev.log",
        rotation="10 MB",
        retention="10 days",
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}",
    )
    logger.add(
        sys.stdout,
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
        colorize=True,
    )
# Module import side effects: configure logging, then create the FastAPI app.
setup_logging()
app = FastAPI(title="LlamaIndex RAG API", version="1.0.0")
class TestQueryRequest(BaseModel):
    """Request body for POST /api/test-query."""
    query: str = Field(..., min_length=1, description="User query text")
    top_k: int = Field(5, ge=1, le=20, description="Number of retrieved chunks")
class SourceItem(BaseModel):
    """One retrieved chunk returned alongside the generated answer."""
    content: str
    score: float | None = None
    metadata: dict = Field(default_factory=dict)
class TestQueryResponse(BaseModel):
    """Response body for POST /api/test-query."""
    query: str
    response: str
    sources: list[SourceItem]
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe for load balancers and monitoring."""
    return dict(status="ok")
@app.post("/api/test-query", response_model=TestQueryResponse)
def test_query(payload: TestQueryRequest) -> TestQueryResponse:
    """
    Query the vector store using the existing retrieval/query engine.
    """
    query = payload.query.strip()
    if not query:
        # min_length=1 does not reject whitespace-only input, so re-check here.
        raise HTTPException(status_code=400, detail="Field 'query' must not be empty")
    logger.info(f"Received /api/test-query request (top_k={payload.top_k})")
    try:
        # NOTE(review): the retriever is re-initialized on every request —
        # confirm this is cheap, or cache it per top_k.
        query_engine = initialize_retriever(similarity_top_k=payload.top_k)
        result = query_engine.query(query)
        sources: list[SourceItem] = []
        # Collect retrieved chunks when the result exposes source_nodes
        # (presumably a LlamaIndex-style response object).
        if hasattr(result, "source_nodes"):
            for node in result.source_nodes:
                sources.append(
                    SourceItem(
                        content=str(getattr(node, "text", "")),
                        score=getattr(node, "score", None),
                        metadata=getattr(node, "metadata", {}) or {},
                    )
                )
        response_text = str(result)
        logger.info(
            f"/api/test-query completed successfully (sources={len(sources)})"
        )
        return TestQueryResponse(query=query, response=response_text, sources=sources)
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 400 above) propagate unchanged.
        raise
    except Exception as e:
        logger.error(f"/api/test-query failed: {e}")
        raise HTTPException(status_code=500, detail="Failed to process query")
if __name__ == "__main__":
    import uvicorn
    # Run the API with uvicorn when this module is executed directly.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=False)