Phase 12 done... loading via adaptive collection, yadisk or local

2026-02-10 22:19:27 +03:00
parent e9dd28ad55
commit 1e6ab247b9
5 changed files with 154 additions and 113 deletions

View File

@@ -78,9 +78,9 @@ Chosen data folder: relatve ./../../../data - from the current folder
 During enrichment, we should use adaptive collection from the helpers, for loading documents. We should not use directly local filesystem, but use adaptive collection as a wrapper.
-- [ ] Adaptive file in helper now has filename in it, so tests should be adjusted for this
-- [ ] Add conditional usage of adaptive collection in the enrichment stage. .env has now variable ENRICHMENT_SOURCE with 2 possible values: yadisk, local
-- [ ] With local source, use env variable for local filesystem adaptive collection: ENRICHMENT_LOCAL_PATH
-- [ ] With yadisk source, use env variable for YADISK_TOKEN for token for auth within Yandex Disk, ENRICHMENT_YADISK_PATH for path on the Yandex Disk system
-- [ ] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
-- [ ] Adaptive files has filename in them, so it should be used when extracting metadata
+- [x] Adaptive file in helper now has filename in it, so tests should be adjusted for this
+- [x] Add conditional usage of adaptive collection in the enrichment stage. .env has now variable ENRICHMENT_SOURCE with 2 possible values: yadisk, local
+- [x] With local source, use env variable for local filesystem adaptive collection: ENRICHMENT_LOCAL_PATH
+- [x] With yadisk source, use env variable for YADISK_TOKEN for token for auth within Yandex Disk, ENRICHMENT_YADISK_PATH for path on the Yandex Disk system
+- [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
+- [x] Adaptive files has filename in them, so it should be used when extracting metadata
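For reference, the four variables the checklist settles on fit in a short .env sketch; only the variable names come from this commit, every value below is a placeholder:

# Selects the document source for enrichment: "local" or "yadisk"
ENRICHMENT_SOURCE=local
# Read only when ENRICHMENT_SOURCE=local
ENRICHMENT_LOCAL_PATH=../../../data
# Read only when ENRICHMENT_SOURCE=yadisk
YADISK_TOKEN=your-yandex-disk-oauth-token
ENRICHMENT_YADISK_PATH=/enrichment-documents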

View File

@@ -3,7 +3,7 @@
 import os
 import hashlib
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Tuple
 from dotenv import load_dotenv
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -37,9 +37,15 @@ from sqlalchemy import create_engine, Column, Integer, String
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from loguru import logger
-import sqlite3

-from helpers import extract_russian_event_names, extract_years_from_text
+from helpers import (
+    LocalFilesystemAdaptiveCollection,
+    YandexDiskAdaptiveCollection,
+    _AdaptiveCollection,
+    _AdaptiveFile,
+    extract_russian_event_names,
+    extract_years_from_text,
+)

 # Load environment variables
 load_dotenv()
@@ -48,6 +54,27 @@ load_dotenv()
 # Define the path to the data directory
 DATA_DIR = Path("../../../data").resolve()
 DB_PATH = Path("document_tracking.db").resolve()
+ENRICHMENT_SOURCE = os.getenv("ENRICHMENT_SOURCE", "local").lower()
+ENRICHMENT_LOCAL_PATH = os.getenv("ENRICHMENT_LOCAL_PATH")
+ENRICHMENT_YADISK_PATH = os.getenv("ENRICHMENT_YADISK_PATH")
+YADISK_TOKEN = os.getenv("YADISK_TOKEN")
+
+SUPPORTED_EXTENSIONS = {
+    ".pdf",
+    ".docx",
+    ".doc",
+    ".pptx",
+    ".xlsx",
+    ".xls",
+    ".jpg",
+    ".jpeg",
+    ".png",
+    ".gif",
+    ".bmp",
+    ".tiff",
+    ".webp",
+    ".odt",
+}

 Base = declarative_base()
@@ -91,19 +118,17 @@ class DocumentEnricher:
             hash_sha256.update(chunk)
         return hash_sha256.hexdigest()

-    def _is_document_processed(self, file_path: str) -> bool:
-        """Check if a document has already been processed."""
-        file_hash = self._get_file_hash(file_path)
+    def _is_document_hash_processed(self, file_hash: str) -> bool:
+        """Check if a document hash has already been processed."""
         existing = self.session.query(ProcessedDocument).filter_by(
             file_hash=file_hash
         ).first()
         return existing is not None

-    def _mark_document_processed(self, file_path: str):
+    def _mark_document_processed(self, file_identifier: str, file_hash: str):
         """Mark a document as processed in the database."""
-        file_hash = self._get_file_hash(file_path)
         doc_record = ProcessedDocument(
-            file_path=file_path,
+            file_path=file_identifier,
             file_hash=file_hash
         )
         self.session.add(doc_record)
@@ -142,77 +167,88 @@ class DocumentEnricher:
                 return None
             return UnstructuredODTLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         else:
-            # For text files and unsupported formats, try to load as text
-            try:
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    content = f.read()
-                return None, content  # Return content directly for text processing
-            except UnicodeDecodeError:
-                logger.warning(f"Could not decode file as text: {file_path}")
-                return None, None
-
-    def load_and_split_documents(self, file_paths: List[str]) -> List[Document]:
-        """Load documents from file paths and split them appropriately."""
-        all_docs = []
-
-        for file_path in file_paths:
-            if self._is_document_processed(file_path):
-                logger.info(f"Skipping already processed document: {file_path}")
-                continue
-
-            logger.info(f"Processing document: {file_path}")
-
-            # Get the appropriate loader for the file extension
-            loader = self._get_loader_for_extension(file_path)
-
-            if loader is None:
-                # For unsupported formats that we tried to load as text
-                continue
-
-            try:
-                # Load the document(s)
-                docs = loader.load()
-
-                # Add metadata to each document
-                for doc in docs:
-                    # Extract metadata from the original file
-                    doc.metadata["source"] = file_path
-                    doc.metadata["filename"] = Path(file_path).name
-                    doc.metadata["file_path"] = file_path
-                    doc.metadata["file_size"] = os.path.getsize(file_path)
-
-                    # Add page number if available in original metadata
-                    if "page" in doc.metadata:
-                        doc.metadata["page_number"] = doc.metadata["page"]
-
-                    # Add file extension as metadata
-                    doc.metadata["file_extension"] = Path(file_path).suffix
-
-                # Split documents if they are too large
-                split_docs = self.text_splitter.split_documents(docs)
-
-                # Extract additional metadata from each chunk.
-                for chunk in split_docs:
-                    years = extract_years_from_text(chunk.page_content)
-                    events = extract_russian_event_names(chunk.page_content)
-                    chunk.metadata["years"] = years
-                    chunk.metadata["events"] = events
-
-                # Add to the collection
-                all_docs.extend(split_docs)
-
-            except Exception as e:
-                logger.error(f"Error processing {file_path}: {str(e)}")
-                continue
-
-        return all_docs
+            return None
+
+    def _load_one_adaptive_file(
+        self, adaptive_file: _AdaptiveFile
+    ) -> Tuple[List[Document], str | None]:
+        """Load and split one adaptive file by using its local working callback."""
+        loaded_docs: List[Document] = []
+        file_hash: str | None = None
+        source_identifier = adaptive_file.local_path
+        extension = adaptive_file.extension.lower()
+
+        def process_local_file(local_file_path: str):
+            nonlocal loaded_docs, file_hash
+
+            file_hash = self._get_file_hash(local_file_path)
+            if self._is_document_hash_processed(file_hash):
+                logger.info(f"Skipping already processed document hash for: {source_identifier}")
+                return
+
+            loader = self._get_loader_for_extension(local_file_path)
+            if loader is None:
+                logger.warning(f"No loader available for file: {source_identifier}")
+                return
+
+            docs = loader.load()
+            for doc in docs:
+                doc.metadata["source"] = source_identifier
+                doc.metadata["filename"] = adaptive_file.filename
+                doc.metadata["file_path"] = source_identifier
+                doc.metadata["file_size"] = os.path.getsize(local_file_path)
+                doc.metadata["file_extension"] = extension
+
+                if "page" in doc.metadata:
+                    doc.metadata["page_number"] = doc.metadata["page"]
+
+            split_docs = self.text_splitter.split_documents(docs)
+
+            for chunk in split_docs:
+                years = extract_years_from_text(chunk.page_content)
+                events = extract_russian_event_names(chunk.page_content)
+                chunk.metadata["years"] = years
+                chunk.metadata["events"] = events
+
+            loaded_docs = split_docs
+
+        adaptive_file.work_with_file_locally(process_local_file)
+        return loaded_docs, file_hash
+
+    def load_and_split_documents(
+        self, adaptive_collection: _AdaptiveCollection, recursive: bool = True
+    ) -> Tuple[List[Document], List[Tuple[str, str]]]:
+        """Load documents from adaptive collection and split them appropriately."""
+        all_docs: List[Document] = []
+        processed_file_records: dict[str, str] = {}
+
+        for adaptive_file in adaptive_collection.iterate(recursive=recursive):
+            if adaptive_file.extension.lower() not in SUPPORTED_EXTENSIONS:
+                logger.debug(
+                    f"Skipping unsupported file extension for {adaptive_file.filename}: {adaptive_file.extension}"
+                )
+                continue
+
+            logger.info(f"Processing document: {adaptive_file.local_path}")
+
+            try:
+                split_docs, file_hash = self._load_one_adaptive_file(adaptive_file)
+                if split_docs:
+                    all_docs.extend(split_docs)
+                if file_hash:
+                    processed_file_records[adaptive_file.local_path] = file_hash
+            except Exception as e:
+                logger.error(f"Error processing {adaptive_file.local_path}: {str(e)}")
+                continue
+
+        return all_docs, list(processed_file_records.items())

-    def enrich_and_store(self, file_paths: List[str]):
+    def enrich_and_store(self, adaptive_collection: _AdaptiveCollection):
         """Load, enrich, and store documents in the vector store."""
-        logger.info(f"Starting enrichment process for {len(file_paths)} files...")
+        logger.info("Starting enrichment process...")

         # Load and split documents
-        documents = self.load_and_split_documents(file_paths)
+        documents, processed_file_records = self.load_and_split_documents(
+            adaptive_collection
+        )

         if not documents:
             logger.info("No new documents to process.")
@@ -225,55 +261,58 @@
             self.vector_store.add_documents(documents)

             # Only mark documents as processed after successful insertion to vector store
-            processed_file_paths = set()
-            for doc in documents:
-                if 'source' in doc.metadata:
-                    processed_file_paths.add(doc.metadata['source'])
-
-            for file_path in processed_file_paths:
-                self._mark_document_processed(file_path)
-
-            logger.info(f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_paths)} files as processed.")
+            for file_identifier, file_hash in processed_file_records:
+                self._mark_document_processed(file_identifier, file_hash)
+
+            logger.info(
+                f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_records)} files as processed."
+            )

         except Exception as e:
             logger.error(f"Error adding documents to vector store: {str(e)}")
             raise

-def get_all_documents_from_data_dir(data_dir: str = str(DATA_DIR)) -> List[str]:
-    """Get all supported document file paths from the data directory."""
-    supported_extensions = {
-        '.pdf', '.docx', '.doc', '.pptx', '.xlsx', '.xls',
-        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
-        '.webp', '.odt'
-    }
-
-    file_paths = []
-    for root, dirs, files in os.walk(data_dir):
-        for file in files:
-            if Path(file).suffix.lower() in supported_extensions:
-                file_paths.append(os.path.join(root, file))
-
-    return file_paths
+def get_enrichment_adaptive_collection(
+    data_dir: str = str(DATA_DIR),
+) -> _AdaptiveCollection:
+    """Create adaptive collection based on environment source configuration."""
+    source = ENRICHMENT_SOURCE
+
+    if source == "local":
+        local_path = ENRICHMENT_LOCAL_PATH or data_dir
+        logger.info(f"Using local adaptive collection from path: {local_path}")
+        return LocalFilesystemAdaptiveCollection(local_path)
+
+    if source == "yadisk":
+        if not YADISK_TOKEN:
+            raise ValueError("YADISK_TOKEN must be set when ENRICHMENT_SOURCE=yadisk")
+        if not ENRICHMENT_YADISK_PATH:
+            raise ValueError(
+                "ENRICHMENT_YADISK_PATH must be set when ENRICHMENT_SOURCE=yadisk"
+            )
+        logger.info(
+            f"Using Yandex Disk adaptive collection from path: {ENRICHMENT_YADISK_PATH}"
+        )
+        return YandexDiskAdaptiveCollection(
+            token=YADISK_TOKEN,
+            base_dir=ENRICHMENT_YADISK_PATH,
+        )
+
+    raise ValueError(
+        f"Unsupported ENRICHMENT_SOURCE='{source}'. Allowed values: local, yadisk"
+    )

 def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
     """Run the full enrichment process."""
-    logger.info(f"Starting document enrichment from directory: {data_dir}")
-
-    # Get all supported documents from the data directory
-    file_paths = get_all_documents_from_data_dir(data_dir)
-
-    if not file_paths:
-        logger.warning(f"No supported documents found in {data_dir}")
-        return
-
-    logger.info(f"Found {len(file_paths)} documents to process")
+    logger.info("Starting document enrichment process")
+
+    adaptive_collection = get_enrichment_adaptive_collection(data_dir=data_dir)

     # Initialize the document enricher
     enricher = DocumentEnricher(vector_store)

     # Run the enrichment process
-    enricher.enrich_and_store(file_paths)
+    enricher.enrich_and_store(adaptive_collection)

     logger.info("Document enrichment process completed!")

View File

@@ -118,9 +118,10 @@ class _AdaptiveFile(ABC):
     local_path: str
     filename: str

-    def __init__(self, filename: str, extension: str):
+    def __init__(self, filename: str, extension: str, local_path: str):
         self.filename = filename
         self.extension = extension
+        self.local_path = local_path

     # This method allows to work with file locally, and lambda should be provided for this.
     # Why separate method? For possible cleanup after work is done. And to download file, if needed
@@ -138,11 +139,8 @@ class _AdaptiveCollection(ABC):
 class LocalFilesystemAdaptiveFile(_AdaptiveFile):
-    local_path: str
-
     def __init__(self, filename: str, extension: str, local_path: str):
-        super().__init__(filename, extension)
-        self.local_path = local_path
+        super().__init__(filename, extension, local_path)

     def work_with_file_locally(self, func: Callable[[str], None]):
         func(self.local_path)
@@ -173,7 +171,7 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
     remote_path: str

     def __init__(self, filename: str, extension: str, remote_path: str, token: str):
-        super().__init__(filename, extension)
+        super().__init__(filename, extension, remote_path)
         self.token = token
         self.remote_path = remote_path
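
A minimal sketch of the callback contract that _AdaptiveFile standardizes, with illustrative values (the file name and path below are placeholders, not files from the repository):

from helpers import LocalFilesystemAdaptiveFile

adaptive_file = LocalFilesystemAdaptiveFile("report.pdf", ".pdf", "/tmp/report.pdf")


def consume(local_path: str) -> None:
    # The callback receives a path that is valid for the duration of the call; remote
    # implementations such as YandexDiskAdaptiveFile can download the file first and
    # clean up afterwards, which is why the callback indirection exists.
    print(f"processing {local_path} (original name: {adaptive_file.filename})")


adaptive_file.work_with_file_locally(consume)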

View File

@@ -13,7 +13,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
         collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
         files = list(collection.iterate(recursive=False))

-        file_names = sorted(Path(file.local_path).name for file in files)
+        file_names = sorted(file.filename for file in files)
         self.assertEqual(file_names, ["root.txt"])
         self.assertTrue(all(isinstance(file, LocalFilesystemAdaptiveFile) for file in files))
@@ -33,7 +33,9 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
     def test_work_with_file_locally_provides_existing_path(self):
         target_path = self.samples_dir / "root.txt"
-        adaptive_file = LocalFilesystemAdaptiveFile(target_path.suffix, str(target_path))
+        adaptive_file = LocalFilesystemAdaptiveFile(
+            target_path.name, target_path.suffix, str(target_path)
+        )

         observed = {}
@@ -44,6 +46,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
         adaptive_file.work_with_file_locally(callback)

+        self.assertEqual(adaptive_file.filename, "root.txt")
         self.assertEqual(observed["path"], str(target_path))
         self.assertEqual(observed["content"], "root file")

View File

@@ -31,6 +31,7 @@ class TestYandexDiskAdaptiveCollection(unittest.TestCase):
             self.skipTest(f"Yandex Disk request failed and needs manual verification: {exc}")

         for item in files:
+            self.assertTrue(item.filename)
             logger.info(f"Yandex file found during test iteration: {item.local_path}")

         self.assertIsInstance(files, list)