langchain: upload via the new predefined-paths flow from Yandex Disk
This commit is contained in:
@@ -15,3 +15,4 @@ ENRICHMENT_PROCESSING_MODE=async/sync
|
||||
ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5
|
||||
ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4
|
||||
ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4
|
||||
PREFECT_YADISK_ENRICH_CONCURRENCY=8
|
||||
|
||||
@@ -85,7 +85,6 @@ During enrichment, we should use adaptive collection from the helpers, for loadi
|
||||
- [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
|
||||
- [x] Adaptive files has filename in them, so it should be used when extracting metadata
|
||||
|
||||
|
||||
# Phase 13 (async processing of files)
|
||||
|
||||
During this Phase we create asynchronous process of enrichment, utilizing async/await
|
||||
@@ -111,3 +110,18 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
|
||||
- [x] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to yandex disk with yadisk library, analyze everything inside folder `Общая` recursively and store file paths in the ./../../../yadisk_files.json, in array of strings.
|
||||
- [x] In our prefect file add function for flow to serve, as per prefect documentation on serving flows
|
||||
- [x] Tests will be done manually by hand, by executing this script and checking the prefect dashboard. No automated tests needed for this phase.
|
||||
|
||||
# Phase 15 (prefect enrichment process for langchain, with predefined values, also removal of non-document formats)
|
||||
|
||||
- [x] Remove for now formats, extensions for images of any kind, archives of any kind, and add possible text documents, documents formats, like .txt, .xlsx, etc. in enrichment processes/functions.
|
||||
- [x] Create prefect client file in `prefect/02_yadisk_predefined_enrich.py`. This file will first load file from ./../../../yadisk_files.json into array of paths. After that, array of paths will be filtered, and only extensions supported in enrichment will be left. After that, code will iterate through each path in this filtered array, use yadisk library to download file, process it for enrichment, and then remove it after processing. There should be statistics for this, at runtime, with progressbar that shows how many files processed out of how many left. Also, near the progressbar there should be counter of errors. Yes, if there is an error, it should be swallowed, even if it is inside thread or async function.
|
||||
- [x] For yandex disk integration use library yadisk. In .env file there should be variable YADISK_TOKEN for accessing the needed connection
|
||||
- [x] Code for loading should be reflected upon, and then reworked so it is done in an async way, with as many simultaneous tasks as possible. yadisk async integration should be used (async features can be checked here: https://pypi.org/project/yadisk/)
|
||||
- [x] No tests for code should be done at this phase, all tests will be done manually, because loading of documents can take a long time for automated test.
|
||||
|
||||
# Phase 16 (making demo ui scalable)
|
||||
|
||||
- [ ] Make demo-ui window containable and reusable part of html + js. This part will be used for creating multi-windowed demo ui.
|
||||
- [ ] Make tabbed UI with top level tabs. First tab exists and is selected. Each tab should have copy of demo ui, meaning the chat window with ability to specify the api url
|
||||
- [ ] At the end of the tabs there should be button with plus sign, which will add new tab. Tabs to be called by numbers.
|
||||
- [ ] There should be 3 predefined tabs opened. First one should have predefined api url "https://rag.langchain.overwatch.su/api/test-query", second "https://rag.llamaindex.overwatch.su/api/test-query", third "https://rag.haystack.overwatch.su/api/test-query"
|
||||
|
||||
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from loguru import logger
|
||||
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
|
||||
)
|
||||
|
||||
# File extensions the enrichment pipeline will accept. Image and archive
# formats were deliberately removed in Phase 15; only text/document formats
# remain. Kept as a set for O(1) membership tests; listed sorted, each
# extension exactly once (the previous literal repeated ".pdf", ".docx",
# ".pptx", ".txt", ".xls" and ".xlsx").
SUPPORTED_EXTENSIONS = {
    ".csv",
    ".doc",
    ".docx",
    ".epub",
    ".htm",
    ".html",
    ".json",
    ".jsonl",
    ".md",
    ".odt",
    ".pdf",
    ".ppt",
    ".pptx",
    ".rst",
    ".rtf",
    ".tsv",
    ".txt",  # obvious, but was unexpectedly present in the data
    ".xls",
    ".xlsx",
    ".xml",
}
|
||||
|
||||
Base = declarative_base()
|
||||
@@ -261,6 +266,8 @@ class DocumentEnricher:
|
||||
return UnstructuredODTLoader(
|
||||
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||
)
|
||||
if ext in [".txt", ".md"]:
|
||||
return TextLoader(file_path, encoding="utf-8")
|
||||
return None
|
||||
|
||||
def _load_one_adaptive_file(
|
||||
|
||||
216
services/rag/langchain/prefect/02_yadisk_predefined_enrich.py
Normal file
216
services/rag/langchain/prefect/02_yadisk_predefined_enrich.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Prefect flow to enrich Yandex Disk files from a predefined JSON file list."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from prefect import flow, task
|
||||
|
||||
load_dotenv()

# Directory two levels above this file — the langchain service root — added to
# sys.path so sibling modules (enrichment, vector_storage, helpers) import.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

PREFECT_API_URL = os.getenv("PREFECT_API_URL")
# Token for Yandex Disk access; validated (raises) inside the enrich task.
YADISK_TOKEN = os.getenv("YADISK_TOKEN")
# Max simultaneous download/process tasks; defaults to 8.
ENRICH_CONCURRENCY = int(os.getenv("PREFECT_YADISK_ENRICH_CONCURRENCY", "8"))
# Predefined path list produced by the Phase 13 folder-iteration task
# (resolves to yadisk_files.json three levels above the service root).
OUTPUT_FILE_LIST = (PROJECT_ROOT / "../../../yadisk_files.json").resolve()

if PREFECT_API_URL:
    # Re-export so the prefect client sees it.
    # NOTE(review): likely redundant — load_dotenv already populates os.environ; confirm.
    os.environ["PREFECT_API_URL"] = PREFECT_API_URL
|
||||
|
||||
|
||||
class _ProgressTracker:
|
||||
def __init__(self, total: int):
|
||||
self.total = total
|
||||
self.processed = 0
|
||||
self.errors = 0
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def mark_done(self, error: bool = False):
|
||||
async with self._lock:
|
||||
self.processed += 1
|
||||
if error:
|
||||
self.errors += 1
|
||||
self._render()
|
||||
|
||||
def _render(self):
|
||||
total = max(self.total, 1)
|
||||
width = 30
|
||||
filled = int(width * self.processed / total)
|
||||
bar = "#" * filled + "-" * (width - filled)
|
||||
left = max(self.total - self.processed, 0)
|
||||
print(
|
||||
f"\r[{bar}] {self.processed}/{self.total} processed | left: {left} | errors: {self.errors}",
|
||||
end="",
|
||||
flush=True,
|
||||
)
|
||||
if self.processed >= self.total:
|
||||
print()
|
||||
|
||||
|
||||
async def _download_yadisk_file(async_disk, remote_path: str, local_path: str) -> None:
    """Download one file from Yandex Disk to a local path via the async client."""
    await async_disk.download(remote_path, local_path)
|
||||
|
||||
|
||||
def _process_local_file_for_enrichment(enricher, vector_store, local_path: str, remote_path: str) -> bool:
    """Enrich one downloaded file: load, tag, split and push chunks to the store.

    Returns True when chunks were uploaded, False when the file was skipped
    (hash already processed, no loader for the extension, or no chunks).
    """
    remote = Path(remote_path)

    # Skip files whose content hash was already enriched in a previous run.
    digest = enricher._get_file_hash(local_path)
    if enricher._is_document_hash_processed(digest):
        return False

    loader = enricher._get_loader_for_extension(local_path)
    if loader is None:
        return False

    documents = loader.load()

    # Metadata uses the *remote* path/name so search results point at the
    # original Yandex Disk location, not the throwaway temp file.
    shared_metadata = {
        "source": remote_path,
        "filename": remote.name,
        "file_path": remote_path,
        "file_size": os.path.getsize(local_path),
        "file_extension": remote.suffix.lower(),
    }
    for document in documents:
        document.metadata.update(shared_metadata)
        if "page" in document.metadata:
            document.metadata["page_number"] = document.metadata["page"]

    chunks = enricher.text_splitter.split_documents(documents)

    # Local import keeps module load light; helpers lives next to this file.
    from helpers import extract_russian_event_names, extract_years_from_text

    for chunk in chunks:
        chunk.metadata["years"] = extract_years_from_text(chunk.page_content)
        chunk.metadata["events"] = extract_russian_event_names(chunk.page_content)

    if not chunks:
        return False

    vector_store.add_documents(chunks)
    enricher._mark_document_processed(remote_path, digest)
    return True
|
||||
|
||||
|
||||
async def _process_remote_file(async_disk, remote_path: str, semaphore: asyncio.Semaphore, tracker: _ProgressTracker, enricher, vector_store):
|
||||
async with semaphore:
|
||||
temp_path = None
|
||||
had_error = False
|
||||
try:
|
||||
suffix = Path(remote_path).suffix
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
|
||||
temp_path = tmp_file.name
|
||||
|
||||
await _download_yadisk_file(async_disk, remote_path, temp_path)
|
||||
await asyncio.to_thread(
|
||||
_process_local_file_for_enrichment,
|
||||
enricher,
|
||||
vector_store,
|
||||
temp_path,
|
||||
remote_path,
|
||||
)
|
||||
except Exception:
|
||||
# Phase requirement: swallow per-file errors and continue processing.
|
||||
had_error = True
|
||||
finally:
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
try:
|
||||
os.unlink(temp_path)
|
||||
except OSError:
|
||||
had_error = True
|
||||
await tracker.mark_done(error=had_error)
|
||||
|
||||
|
||||
@task(name="prefilter_yadisk_file_paths")
def prefilter_yadisk_file_paths() -> List[str]:
    """Read the predefined path list and keep only enrichment-supported files.

    Raises FileNotFoundError when the Phase 13 task has not yet produced the
    JSON file list.
    """
    # Imported lazily so prefect can register the task without pulling in the
    # full enrichment stack.
    from enrichment import SUPPORTED_EXTENSIONS

    if not OUTPUT_FILE_LIST.exists():
        raise FileNotFoundError(f"File list not found: {OUTPUT_FILE_LIST}")

    raw_paths = json.loads(OUTPUT_FILE_LIST.read_text(encoding="utf-8"))
    return [
        candidate
        for candidate in raw_paths
        if Path(str(candidate)).suffix.lower() in SUPPORTED_EXTENSIONS
    ]
|
||||
|
||||
|
||||
@task(name="enrich_filtered_yadisk_files_async")
async def enrich_filtered_yadisk_files_async(filtered_paths: List[str]) -> dict:
    """Download/process Yandex Disk files concurrently and enrich LangChain vector store.

    Returns a summary dict with "total", "processed" and "errors" counts.
    Raises ValueError when YADISK_TOKEN is unset and RuntimeError when the
    yadisk package is missing or lacks the async client.
    """
    if not YADISK_TOKEN:
        raise ValueError("YADISK_TOKEN is required for Yandex Disk enrichment")

    if not filtered_paths:
        print("No supported files found for enrichment.")
        return {"total": 0, "processed": 0, "errors": 0}

    # Imported lazily so a missing optional dependency fails with a clear
    # message at run time instead of at module import.
    try:
        import yadisk
    except ImportError as error:
        raise RuntimeError("yadisk package is required for this flow") from error

    if not hasattr(yadisk, "AsyncYaDisk"):
        raise RuntimeError("Installed yadisk package does not expose AsyncYaDisk")

    from enrichment import DocumentEnricher
    from vector_storage import initialize_vector_store

    vector_store = initialize_vector_store()
    enricher = DocumentEnricher(vector_store)

    tracker = _ProgressTracker(total=len(filtered_paths))
    # Semaphore caps simultaneous downloads; floor of 1 guards a bad env value.
    semaphore = asyncio.Semaphore(max(1, ENRICH_CONCURRENCY))

    # One task per file; per-file errors are swallowed inside
    # _process_remote_file, so gather never sees an exception from them.
    async with yadisk.AsyncYaDisk(token=YADISK_TOKEN) as async_disk:
        tasks = [
            asyncio.create_task(
                _process_remote_file(
                    async_disk=async_disk,
                    remote_path=remote_path,
                    semaphore=semaphore,
                    tracker=tracker,
                    enricher=enricher,
                    vector_store=vector_store,
                )
            )
            for remote_path in filtered_paths
        ]
        await asyncio.gather(*tasks)

    return {
        "total": tracker.total,
        "processed": tracker.processed,
        "errors": tracker.errors,
    }
|
||||
|
||||
|
||||
@flow(name="yadisk_predefined_enrich")
async def yadisk_predefined_enrich() -> dict:
    """Filter the predefined Yandex Disk path list, then enrich the survivors."""
    supported_paths = prefilter_yadisk_file_paths()
    stats = await enrich_filtered_yadisk_files_async(supported_paths)
    return stats
|
||||
|
||||
|
||||
def serve_yadisk_predefined_enrich() -> None:
    """Register the flow with Prefect and serve it (blocks until stopped)."""
    yadisk_predefined_enrich.serve(name="yadisk-predefined-enrich")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # PREFECT_SERVE=1 keeps the process alive serving the flow;
    # any other value runs the enrichment once and exits.
    if os.getenv("PREFECT_SERVE", "0") == "1":
        serve_yadisk_predefined_enrich()
    else:
        asyncio.run(yadisk_predefined_enrich())
|
||||
@@ -56,3 +56,4 @@ unstructured-pytesseract>=0.3.12
|
||||
# System and utilities
|
||||
ollama>=0.3.0
|
||||
prefect>=2.19.0
|
||||
yadisk>=3.4.0
|
||||
|
||||
Reference in New Issue
Block a user