From 63c3e2c5c75bc37c2860d4b12a3302a0deca138d Mon Sep 17 00:00:00 2001 From: idchlife Date: Tue, 10 Feb 2026 20:12:43 +0300 Subject: [PATCH] Adaptive Collection, and Phase 11 WIP --- .DS_Store | Bin 6148 -> 6148 bytes services/rag/langchain/.env.dist | 1 + services/rag/langchain/PLANNING.md | 8 +++++ services/rag/langchain/helpers.py | 49 +++++++++++++++++++++++++++-- 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/.DS_Store b/.DS_Store index 9038f4c3f67bba6493cba3fe154055ab64ebdede..ef6d3ba672d5bba2ffb6502a9655db0b5b591f85 100644 GIT binary patch delta 81 zcmZoMXfc=|#>B`mF;Q%yo+2ab!~pBb0*nnnMo;y`)to$K!9{sF`FZIK3=E8$85y(L fHnVf^a{x5}Ip3Kl^NSdAFaqTofOxZq$R1_@Xqpp< delta 63 zcmZoMXfc=|#>CJ*F;Q%yo+2a5!~knXmdQMf)tlEaMzc-cz_@fXI|n}pP{rnjjNh3j T^NScVGEBDNk=`65vVs`^Kl>5L diff --git a/services/rag/langchain/.env.dist b/services/rag/langchain/.env.dist index d126d9f..b091d82 100644 --- a/services/rag/langchain/.env.dist +++ b/services/rag/langchain/.env.dist @@ -6,3 +6,4 @@ CHAT_MODEL_STRATEGY=ollama QDRANT_HOST=HOST QDRANT_REST_PORT=PORT QDRANT_GRPC_PORT=PORT +YADISK_TOKEN=TOKEN diff --git a/services/rag/langchain/PLANNING.md b/services/rag/langchain/PLANNING.md index e6d6965..b29f943 100644 --- a/services/rag/langchain/PLANNING.md +++ b/services/rag/langchain/PLANNING.md @@ -65,3 +65,11 @@ Chosen data folder: relatve ./../../../data - from the current folder - [x] Create heuristic, regex function in helpers module for extracting name of event, in Russian language. We need to use regex and possible words before, after the event, etc. - [x] Durint enriching vector storage, try to extract event name from the chunk and save in metadata in field "events", which will contain list of strings, possible evennts. Helper function usage is advised. - [x] In VectorStoreRetriever._get_relevant_documents add similarity search for the event name, if event name is present in the query. Helper function should be used here to try to extract the event name. 
import os
import re  # used by the metadata-extraction helpers elsewhere in helpers.py
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Iterator, List

# Adaptive collections (PLANNING.md, Phase 11): an abstraction over file
# sources so that filesystems other than the local one can be attached later.


class _AdaptiveFile(ABC):
    """A single file exposed by an adaptive collection.

    Subclasses decide how the file becomes available on the local disk.
    """

    extension: str  # File suffix including the leading dot, e.g. ".jpg"
    local_path: str  # Path handed to the callback of work_with_file_locally

    def __init__(self, extension: str, local_path: str):
        self.extension = extension
        self.local_path = local_path

    @abstractmethod
    def work_with_file_locally(self, func: Callable[[str], None]) -> None:
        """Run *func* with a local path to this file.

        This is a dedicated method (instead of callers reading
        ``local_path`` directly) so that an implementation can download the
        file first if needed and clean up after *func* has finished.

        Args:
            func: Callable whose first and only argument is the local path.
        """


class _AdaptiveCollection(ABC):
    """A source of files that can be iterated over."""

    @abstractmethod
    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield the files of this collection (generator method).

        Args:
            recursive: Whether files in nested directories are included.
        """


class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """Adaptive file that already lives on the local filesystem."""

    def work_with_file_locally(self, func: Callable[[str], None]) -> None:
        """Call *func* with ``self.local_path``; nothing to fetch or clean up."""
        func(self.local_path)


class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection backed by a directory on the local filesystem."""

    base_dir: str  # Directory whose files are iterated

    def __init__(self, base_dir: str):
        super().__init__()

        self.base_dir = base_dir

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield a ``LocalFilesystemAdaptiveFile`` for each file under ``base_dir``.

        Each yielded file carries ``Path(path).suffix`` as its extension and
        the joined ``root``/``name`` path as its local path.

        Args:
            recursive: When true, descend into every subdirectory; when
                false, yield only the files located directly in ``base_dir``.

        Note:
            ``os.walk`` is called without ``onerror``, so a missing or
            unreadable ``base_dir`` produces no files rather than an error.
        """
        for root, dirs, files in os.walk(self.base_dir):
            if recursive:
                # Sorting in place makes the traversal order deterministic.
                dirs.sort()
            else:
                # Emptying ``dirs`` in place (top-down walk) stops os.walk
                # from descending below ``base_dir``.
                dirs.clear()

            for name in sorted(files):
                full_path = os.path.join(root, name)
                # Yield the concrete local implementation so that
                # work_with_file_locally() actually invokes the callback.
                yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)