langchain uploading new way of predefined paths from yandex disk

This commit is contained in:
2026-02-26 00:01:47 +03:00
parent 2c7ab06b3f
commit 3e29ea70ed
7 changed files with 365 additions and 17 deletions

View File

@@ -15,3 +15,4 @@ ENRICHMENT_PROCESSING_MODE=async/sync
ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5 ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5
ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4 ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4
ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4 ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4
PREFECT_YADISK_ENRICH_CONCURRENCY=8

View File

@@ -85,7 +85,6 @@ During enrichment, we should use adaptive collection from the helpers, for loadi
- [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them. - [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
- [x] Adaptive files has filename in them, so it should be used when extracting metadata - [x] Adaptive files has filename in them, so it should be used when extracting metadata
# Phase 13 (async processing of files) # Phase 13 (async processing of files)
During this Phase we create asynchronous process of enrichment, utilizing async/await During this Phase we create asynchronous process of enrichment, utilizing async/await
@@ -111,3 +110,18 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
- [x] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to yandex disk with yadisk library, analyze everything inside folder `Общая` recursively and store file paths in the ./../../../yadisk_files.json, in array of strings. - [x] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to yandex disk with yadisk library, analyze everything inside folder `Общая` recursively and store file paths in the ./../../../yadisk_files.json, in array of strings.
- [x] In our pefect file add function for flow to serve, as per prefect documentation on serving flows - [x] In our pefect file add function for flow to serve, as per prefect documentation on serving flows
- [x] Tests will be done manually by hand, by executing this script and checking prefect dashboard. No automatical tests needed for this phase. - [x] Tests will be done manually by hand, by executing this script and checking prefect dashboard. No automatical tests needed for this phase.
# Phase 15 (prefect enrichment process for langchain, with predefined values, also removal of non-document formats)
- [x] Remove for now formats, extensions for images of any kind, archives of any kind, and add possible text documents, documents formats, like .txt, .xlsx, etc. in enrichment processes/functions.
- [x] Create prefect client file in `prefect/02_yadisk_predefined_enrich.py`. This file will first load the file from ./../../../yadisk_files.json into an array of paths. After that, the array of paths will be filtered, and only extensions supported in enrichment will be left. After that, the code will iterate through each path in this filtered array, use the yadisk library to download the file, process it for enrichment, and then remove it after processing. There should be statistics for this at runtime, with a progress bar that shows how many files are processed out of how many are left. Also, near the progress bar there should be a counter of errors. Yes, if there is an error, it should be swallowed, even if it is inside a thread or async function.
- [x] For yandex disk integration use library yadisk. In .env file there should be variable YADISK_TOKEN for accessing the needed connection
- [x] Code for loading should be reflected upon, and then reworked so it is done in an async way, with as many simultaneous tasks as possible. The yadisk async integration should be used (async features can be checked here: https://pypi.org/project/yadisk/)
- [x] No tests for code should be done at this phase, all tests will be done manually, because loading of documents can take a long time for automated test.
# Phase 16 (making demo ui scalable)
- [ ] Make the demo-ui window a self-contained and reusable part of HTML + JS. This part will be used for creating the multi-windowed demo UI.
- [ ] Make tabbed UI with top level tabs. First tab exists and is selected. Each tab should have copy of demo ui, meaning the chat window with ability to specify the api url
- [ ] At the end of the tabs there should be button with plus sign, which will add new tab. Tabs to be called by numbers.
- [ ] There should be 3 predefined tabs opened. The first one should have the predefined api url "https://rag.langchain.overwatch.su/api/test-query", the second "https://rag.llamaindex.overwatch.su/api/test-query", the third "https://rag.haystack.overwatch.su/api/test-query"

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from dotenv import load_dotenv from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_text_splitters import RecursiveCharacterTextSplitter
from loguru import logger from loguru import logger
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
) )
SUPPORTED_EXTENSIONS = { SUPPORTED_EXTENSIONS = {
".pdf", ".csv",
".docx",
".doc", ".doc",
".pptx", ".docx",
".xlsx", ".epub",
".xls", ".htm",
# ".jpg", ".html",
# ".jpeg", ".json",
# ".png", ".jsonl",
# ".gif", ".md",
# ".bmp",
# ".tiff",
# ".webp",
".odt", ".odt",
".txt", # this one is obvious but was unexpected to see in data lol ".pdf",
".ppt",
".pptx",
".rtf",
".rst",
".tsv",
".txt",
".xls",
".xlsx",
".xml",
} }
Base = declarative_base() Base = declarative_base()
@@ -261,6 +266,8 @@ class DocumentEnricher:
return UnstructuredODTLoader( return UnstructuredODTLoader(
file_path, **{"strategy": "hi_res", "languages": ["rus"]} file_path, **{"strategy": "hi_res", "languages": ["rus"]}
) )
if ext in [".txt", ".md"]:
return TextLoader(file_path, encoding="utf-8")
return None return None
def _load_one_adaptive_file( def _load_one_adaptive_file(

View File

@@ -0,0 +1,216 @@
"""Prefect flow to enrich Yandex Disk files from a predefined JSON file list."""
from __future__ import annotations
import asyncio
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import List
from dotenv import load_dotenv
from prefect import flow, task
load_dotenv()
# Make the project root importable so `enrichment` / `vector_storage` resolve.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
PREFECT_API_URL = os.getenv("PREFECT_API_URL")
# Token for the Yandex Disk API (required by the enrichment task below).
YADISK_TOKEN = os.getenv("YADISK_TOKEN")
# Upper bound on simultaneous download/enrich operations; defaults to 8.
ENRICH_CONCURRENCY = int(os.getenv("PREFECT_YADISK_ENRICH_CONCURRENCY", "8"))
# JSON array of remote file paths, presumably produced by the earlier
# folder-iteration flow — verify the producing flow writes to the same path.
OUTPUT_FILE_LIST = (PROJECT_ROOT / "../../../yadisk_files.json").resolve()
if PREFECT_API_URL:
    os.environ["PREFECT_API_URL"] = PREFECT_API_URL
class _ProgressTracker:
def __init__(self, total: int):
self.total = total
self.processed = 0
self.errors = 0
self._lock = asyncio.Lock()
async def mark_done(self, error: bool = False):
async with self._lock:
self.processed += 1
if error:
self.errors += 1
self._render()
def _render(self):
total = max(self.total, 1)
width = 30
filled = int(width * self.processed / total)
bar = "#" * filled + "-" * (width - filled)
left = max(self.total - self.processed, 0)
print(
f"\r[{bar}] {self.processed}/{self.total} processed | left: {left} | errors: {self.errors}",
end="",
flush=True,
)
if self.processed >= self.total:
print()
async def _download_yadisk_file(async_disk, remote_path: str, local_path: str) -> None:
    """Download one file from Yandex Disk to *local_path* via the async client."""
    await async_disk.download(remote_path, local_path)
def _process_local_file_for_enrichment(enricher, vector_store, local_path: str, remote_path: str) -> bool:
    """Enrich a single downloaded file into the vector store.

    The file is skipped when its content hash was already processed, when no
    loader supports its extension, or when splitting yields no chunks.
    Returns True when chunks were uploaded, False when the file was skipped.
    """
    remote = Path(remote_path)
    content_hash = enricher._get_file_hash(local_path)

    # Deduplicate by content hash: identical files are enriched only once.
    if enricher._is_document_hash_processed(content_hash):
        return False

    loader = enricher._get_loader_for_extension(local_path)
    if loader is None:
        # No loader configured for this extension.
        return False

    documents = loader.load()
    shared_metadata = {
        "source": remote_path,
        "filename": remote.name,
        "file_path": remote_path,
        "file_size": os.path.getsize(local_path),
        "file_extension": remote.suffix.lower(),
    }
    for document in documents:
        document.metadata.update(shared_metadata)
        # Mirror a loader-provided "page" under the key used downstream.
        if "page" in document.metadata:
            document.metadata["page_number"] = document.metadata["page"]

    chunks = enricher.text_splitter.split_documents(documents)
    if not chunks:
        return False

    # helpers is imported lazily, matching the original module's placement.
    from helpers import extract_russian_event_names, extract_years_from_text

    for chunk in chunks:
        chunk.metadata["years"] = extract_years_from_text(chunk.page_content)
        chunk.metadata["events"] = extract_russian_event_names(chunk.page_content)

    vector_store.add_documents(chunks)
    enricher._mark_document_processed(remote_path, content_hash)
    return True
async def _process_remote_file(async_disk, remote_path: str, semaphore: asyncio.Semaphore, tracker: _ProgressTracker, enricher, vector_store):
    """Download one remote file, enrich it off the event loop, then clean up.

    Concurrency is bounded by *semaphore*; every outcome (success or error)
    is reported to *tracker* so the progress bar always advances.
    """
    async with semaphore:
        temp_path = None
        had_error = False
        try:
            # Keep the original extension so loader selection by suffix works.
            suffix = Path(remote_path).suffix
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                temp_path = tmp_file.name
            await _download_yadisk_file(async_disk, remote_path, temp_path)
            # Enrichment is synchronous, so run it in a worker thread to keep
            # the event loop free for other downloads.
            await asyncio.to_thread(
                _process_local_file_for_enrichment,
                enricher,
                vector_store,
                temp_path,
                remote_path,
            )
        except Exception:
            # Phase requirement: swallow per-file errors and continue processing.
            had_error = True
        finally:
            if temp_path and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except OSError:
                    # A failed cleanup is surfaced through the error counter.
                    had_error = True
            await tracker.mark_done(error=had_error)
@task(name="prefilter_yadisk_file_paths")
def prefilter_yadisk_file_paths() -> List[str]:
    """Read the predefined path list and drop unsupported extensions."""
    # Imported lazily, matching the original module's placement.
    from enrichment import SUPPORTED_EXTENSIONS

    if not OUTPUT_FILE_LIST.exists():
        raise FileNotFoundError(f"File list not found: {OUTPUT_FILE_LIST}")

    with open(OUTPUT_FILE_LIST, "r", encoding="utf-8") as file_handle:
        all_paths = json.load(file_handle)

    def _supported(candidate) -> bool:
        # Suffix match is case-insensitive against the enrichment set.
        return Path(str(candidate)).suffix.lower() in SUPPORTED_EXTENSIONS

    return [candidate for candidate in all_paths if _supported(candidate)]
@task(name="enrich_filtered_yadisk_files_async")
async def enrich_filtered_yadisk_files_async(filtered_paths: List[str]) -> dict:
    """Concurrently download, enrich and clean up the given Yandex Disk paths.

    Returns a summary dict with "total", "processed" and "errors" counters.
    """
    if not YADISK_TOKEN:
        raise ValueError("YADISK_TOKEN is required for Yandex Disk enrichment")
    if not filtered_paths:
        print("No supported files found for enrichment.")
        return {"total": 0, "processed": 0, "errors": 0}

    try:
        import yadisk
    except ImportError as error:
        raise RuntimeError("yadisk package is required for this flow") from error
    # Guard against an installed yadisk version without the async client alias.
    if not hasattr(yadisk, "AsyncYaDisk"):
        raise RuntimeError("Installed yadisk package does not expose AsyncYaDisk")

    from enrichment import DocumentEnricher
    from vector_storage import initialize_vector_store

    store = initialize_vector_store()
    document_enricher = DocumentEnricher(store)
    progress = _ProgressTracker(total=len(filtered_paths))
    gate = asyncio.Semaphore(max(1, ENRICH_CONCURRENCY))

    async with yadisk.AsyncYaDisk(token=YADISK_TOKEN) as client:
        # gather() wraps each coroutine in a task itself, so explicit
        # create_task calls are not needed.
        await asyncio.gather(
            *(
                _process_remote_file(
                    async_disk=client,
                    remote_path=path,
                    semaphore=gate,
                    tracker=progress,
                    enricher=document_enricher,
                    vector_store=store,
                )
                for path in filtered_paths
            )
        )
    return {
        "total": progress.total,
        "processed": progress.processed,
        "errors": progress.errors,
    }
@flow(name="yadisk_predefined_enrich")
async def yadisk_predefined_enrich() -> dict:
    """Prefect flow: filter the predefined path list, then enrich the files."""
    filtered_paths = prefilter_yadisk_file_paths()
    return await enrich_filtered_yadisk_files_async(filtered_paths)
def serve_yadisk_predefined_enrich() -> None:
    """Serve the flow as a long-running Prefect deployment."""
    yadisk_predefined_enrich.serve(name="yadisk-predefined-enrich")
if __name__ == "__main__":
    # PREFECT_SERVE=1 serves the flow via Prefect; otherwise run it once locally.
    serve_mode = os.getenv("PREFECT_SERVE", "0") == "1"
    if serve_mode:
        serve_yadisk_predefined_enrich()
    else:
        asyncio.run(yadisk_predefined_enrich())

View File

@@ -56,3 +56,4 @@ unstructured-pytesseract>=0.3.12
# System and utilities # System and utilities
ollama>=0.3.0 ollama>=0.3.0
prefect>=2.19.0 prefect>=2.19.0
yadisk>=3.4.0

View File

@@ -67,5 +67,5 @@ Chosen data folder: relative ./../../../data - from the current folder
# Phase 11 (http endpoint to retrieve data from the vector storage by query) # Phase 11 (http endpoint to retrieve data from the vector storage by query)
- [ ] Create file `server.py`, with web framework fastapi, for example - [x] Create file `server.py`, with web framework fastapi, for example
- [ ] Add POST endpoint "/api/test-query" which will use agent, and retrieve response for query, sent in JSON format, field "query" - [x] Add POST endpoint "/api/test-query" which will use agent, and retrieve response for query, sent in JSON format, field "query"

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
HTTP API server for querying the vector storage via the existing retrieval pipeline.
"""
from pathlib import Path
import sys
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from loguru import logger
from retrieval import initialize_retriever
load_dotenv()
def setup_logging() -> None:
    """Configure loguru with a rotating file sink and a colorized stdout sink."""
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)
    # Remove loguru's default handler before installing our own sinks.
    logger.remove()
    logger.add(
        # Reuse logs_dir instead of re-hardcoding the "logs/dev.log" path.
        logs_dir / "dev.log",
        rotation="10 MB",
        retention="10 days",
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}",
    )
    logger.add(
        sys.stdout,
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
        colorize=True,
    )
# Module import side effects: configure logging, then create the FastAPI app.
setup_logging()
app = FastAPI(title="LlamaIndex RAG API", version="1.0.0")
class TestQueryRequest(BaseModel):
    """Request body for POST /api/test-query."""
    query: str = Field(..., min_length=1, description="User query text")
    top_k: int = Field(5, ge=1, le=20, description="Number of retrieved chunks")
class SourceItem(BaseModel):
    """One retrieved chunk returned alongside the generated answer."""
    content: str
    score: float | None = None
    metadata: dict = Field(default_factory=dict)
class TestQueryResponse(BaseModel):
    """Response body for POST /api/test-query."""
    query: str
    response: str
    sources: list[SourceItem]
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe for load balancers and monitoring."""
    return dict(status="ok")
@app.post("/api/test-query", response_model=TestQueryResponse)
def test_query(payload: TestQueryRequest) -> TestQueryResponse:
    """
    Query the vector store using the existing retrieval/query engine.
    """
    query = payload.query.strip()
    if not query:
        # min_length=1 does not reject whitespace-only input, so re-check here.
        raise HTTPException(status_code=400, detail="Field 'query' must not be empty")
    logger.info(f"Received /api/test-query request (top_k={payload.top_k})")
    try:
        # NOTE(review): the retriever is re-initialized on every request —
        # confirm this is cheap, or cache it per top_k.
        query_engine = initialize_retriever(similarity_top_k=payload.top_k)
        result = query_engine.query(query)
        sources: list[SourceItem] = []
        # Collect retrieved chunks when the result exposes source_nodes
        # (presumably a LlamaIndex-style response object).
        if hasattr(result, "source_nodes"):
            for node in result.source_nodes:
                sources.append(
                    SourceItem(
                        content=str(getattr(node, "text", "")),
                        score=getattr(node, "score", None),
                        metadata=getattr(node, "metadata", {}) or {},
                    )
                )
        response_text = str(result)
        logger.info(
            f"/api/test-query completed successfully (sources={len(sources)})"
        )
        return TestQueryResponse(query=query, response=response_text, sources=sources)
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 400 above) propagate unchanged.
        raise
    except Exception as e:
        logger.error(f"/api/test-query failed: {e}")
        raise HTTPException(status_code=500, detail="Failed to process query")
if __name__ == "__main__":
    import uvicorn
    # Run the API with uvicorn when this module is executed directly.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=False)