Enrichment is now processed in chunks: documents go into the vector storage two files at a time. Also guessing the source type from the file extension.

2026-02-11 11:23:50 +03:00
parent 1e6ab247b9
commit 7b52887558
4 changed files with 127 additions and 81 deletions
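
The core of this change: document loading now yields fixed-size batches instead of returning one big list. A minimal sketch of the pattern, with a chunk size of 2 as in this commit (the names here are illustrative, not the repo's):

```python
from typing import Iterator, List

def in_chunks(items: List[str], chunk_size: int = 2) -> Iterator[List[str]]:
    """Yield items in fixed-size chunks so downstream work happens incrementally."""
    chunk: List[str] = []
    for item in items:
        chunk.append(item)
        if len(chunk) >= chunk_size:
            yield chunk
            chunk = []
    if chunk:  # flush the remainder so trailing items are not lost
        yield chunk

assert list(in_chunks(["a.pdf", "b.docx", "c.xlsx"])) == [["a.pdf", "b.docx"], ["c.xlsx"]]
```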

Binary file not shown.

View File

@@ -37,15 +37,16 @@ def ping():
name="enrich", name="enrich",
help="Load documents from data directory and store in vector database", help="Load documents from data directory and store in vector database",
) )
@click.option("--data-dir", default="../../../data", help="Path to the data directory")
@click.option( @click.option(
"--collection-name", "--collection-name",
default="documents_langchain", default="documents_langchain",
help="Name of the vector store collection", help="Name of the vector store collection",
) )
def enrich(data_dir, collection_name): def enrich(collection_name):
"""Load documents from data directory and store in vector database""" """Load documents from data directory and store in vector database"""
logger.info(f"Starting enrichment process for directory: {data_dir}") logger.info(
f"Starting enrichment process. Enrichment source: {os.getenv('ENRICHMENT_SOURCE')}"
)
try: try:
# Import here to avoid circular dependencies # Import here to avoid circular dependencies
@@ -56,7 +57,7 @@ def enrich(data_dir, collection_name):
         vector_store = initialize_vector_store(collection_name=collection_name)

         # Run enrichment process
-        run_enrichment_process(vector_store, data_dir=data_dir)
+        run_enrichment_process(vector_store)

         logger.info("Enrichment process completed successfully!")
         click.echo("Documents have been successfully loaded into the vector store.")
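
With `--data-dir` gone, the command is driven entirely by environment variables. A sketch of exercising the reworked command through Click's test runner; the `cli` import path and the data path are assumptions, not from this repo:

```python
import os
from click.testing import CliRunner

from cli import enrich  # hypothetical import path for the command above

os.environ["ENRICHMENT_SOURCE"] = "local"          # replaces the old --data-dir flag
os.environ["ENRICHMENT_LOCAL_PATH"] = "/srv/data"  # hypothetical local path

runner = CliRunner()
result = runner.invoke(enrich, ["--collection-name", "documents_langchain"])
print(result.output)
```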

View File

@@ -1,13 +1,15 @@
"""Document enrichment module for loading documents into vector storage.""" """Document enrichment module for loading documents into vector storage."""
import os
import hashlib import hashlib
import os
from pathlib import Path from pathlib import Path
from typing import List, Tuple from typing import Iterator, List, Tuple
from dotenv import load_dotenv from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
# Dynamically import other loaders to handle optional dependencies # Dynamically import other loaders to handle optional dependencies
try: try:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader from langchain_community.document_loaders import UnstructuredWordDocumentLoader
@@ -33,10 +35,10 @@ try:
     from langchain_community.document_loaders import UnstructuredODTLoader
 except ImportError:
     UnstructuredODTLoader = None

-from sqlalchemy import create_engine, Column, Integer, String
+from loguru import logger
+from sqlalchemy import Column, Integer, String, create_engine
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
-from loguru import logger

 from helpers import (
     LocalFilesystemAdaptiveCollection,
@@ -76,11 +78,24 @@ SUPPORTED_EXTENSIONS = {
".odt", ".odt",
} }
def try_guess_source(extension: str) -> str:
if extension in [".xlsx", "xls"]:
return "таблица"
elif extension in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
return "изображение"
elif extension in [".pptx"]:
return "презентация"
else:
return "документ"
Base = declarative_base() Base = declarative_base()
class ProcessedDocument(Base): class ProcessedDocument(Base):
"""Database model for tracking processed documents.""" """Database model for tracking processed documents."""
__tablename__ = "processed_documents" __tablename__ = "processed_documents"
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
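
A few sanity checks for the new helper; the labels are Russian category names (spreadsheet, image, presentation, document), and extensions are assumed to arrive dot-prefixed and lowercased, matching the `.jpg`-style format noted elsewhere in the codebase:

```python
assert try_guess_source(".xls") == "таблица"       # spreadsheet
assert try_guess_source(".png") == "изображение"   # image
assert try_guess_source(".pptx") == "презентация"  # presentation
assert try_guess_source(".pdf") == "документ"      # fallback: generic document
```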
@@ -120,17 +135,14 @@ class DocumentEnricher:
     def _is_document_hash_processed(self, file_hash: str) -> bool:
         """Check if a document hash has already been processed."""
-        existing = self.session.query(ProcessedDocument).filter_by(
-            file_hash=file_hash
-        ).first()
+        existing = (
+            self.session.query(ProcessedDocument).filter_by(file_hash=file_hash).first()
+        )
         return existing is not None

     def _mark_document_processed(self, file_identifier: str, file_hash: str):
         """Mark a document as processed in the database."""
-        doc_record = ProcessedDocument(
-            file_path=file_identifier,
-            file_hash=file_hash
-        )
+        doc_record = ProcessedDocument(file_path=file_identifier, file_hash=file_hash)
         self.session.add(doc_record)
         self.session.commit()
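
These two helpers implement hash-based deduplication against the `processed_documents` table. A self-contained sketch of the same check-then-mark flow against in-memory SQLite; the model mirrors the diff, and the `String` column types are an assumption:

```python
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class ProcessedDocument(Base):  # mirrors the model in this file
    __tablename__ = "processed_documents"
    id = Column(Integer, primary_key=True)
    file_path = Column(String)  # column types assumed
    file_hash = Column(String)

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# _mark_document_processed: record the file once it is safely stored
session.add(ProcessedDocument(file_path="a.pdf", file_hash="deadbeef"))
session.commit()

# _is_document_hash_processed: skip files whose content hash is already known
already_done = (
    session.query(ProcessedDocument).filter_by(file_hash="deadbeef").first() is not None
)
assert already_done
```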
@@ -142,30 +154,50 @@ class DocumentEnricher:
             return PyPDFLoader(file_path)
         elif ext in [".docx", ".doc"]:
             if UnstructuredWordDocumentLoader is None:
-                logger.warning(f"UnstructuredWordDocumentLoader not available for {file_path}. Skipping.")
+                logger.warning(
+                    f"UnstructuredWordDocumentLoader not available for {file_path}. Skipping."
+                )
                 return None
-            return UnstructuredWordDocumentLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
+            return UnstructuredWordDocumentLoader(
+                file_path, **{"strategy": "hi_res", "languages": ["rus"]}
+            )
         elif ext == ".pptx":
             if UnstructuredPowerPointLoader is None:
-                logger.warning(f"UnstructuredPowerPointLoader not available for {file_path}. Skipping.")
+                logger.warning(
+                    f"UnstructuredPowerPointLoader not available for {file_path}. Skipping."
+                )
                 return None
-            return UnstructuredPowerPointLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
+            return UnstructuredPowerPointLoader(
+                file_path, **{"strategy": "hi_res", "languages": ["rus"]}
+            )
         elif ext in [".xlsx", ".xls"]:
             if UnstructuredExcelLoader is None:
-                logger.warning(f"UnstructuredExcelLoader not available for {file_path}. Skipping.")
+                logger.warning(
+                    f"UnstructuredExcelLoader not available for {file_path}. Skipping."
+                )
                 return None
-            return UnstructuredExcelLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
+            return UnstructuredExcelLoader(
+                file_path, **{"strategy": "hi_res", "languages": ["rus"]}
+            )
         elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
             if UnstructuredImageLoader is None:
-                logger.warning(f"UnstructuredImageLoader not available for {file_path}. Skipping.")
+                logger.warning(
+                    f"UnstructuredImageLoader not available for {file_path}. Skipping."
+                )
                 return None
             # Use OCR strategy for images to extract text
-            return UnstructuredImageLoader(file_path, **{"strategy": "ocr_only", "languages": ["rus"]})
+            return UnstructuredImageLoader(
+                file_path, **{"strategy": "ocr_only", "languages": ["rus"]}
+            )
         elif ext == ".odt":
             if UnstructuredODTLoader is None:
-                logger.warning(f"UnstructuredODTLoader not available for {file_path}. Skipping.")
+                logger.warning(
+                    f"UnstructuredODTLoader not available for {file_path}. Skipping."
+                )
                 return None
-            return UnstructuredODTLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
+            return UnstructuredODTLoader(
+                file_path, **{"strategy": "hi_res", "languages": ["rus"]}
+            )
         else:
             return None
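
The if/elif chain pairs each extension group with a loader class and Unstructured kwargs. A table-driven variant, sketched here as an alternative formulation rather than the committed code (only two entries shown; the remaining loaders follow the same shape):

```python
from langchain_community.document_loaders import PyPDFLoader

try:
    from langchain_community.document_loaders import UnstructuredImageLoader
except ImportError:
    UnstructuredImageLoader = None

# Extension groups mapped to (loader class, loader kwargs).
LOADER_TABLE = {
    (".pdf",): (PyPDFLoader, {}),
    (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"): (
        UnstructuredImageLoader,
        {"strategy": "ocr_only", "languages": ["rus"]},
    ),
}

def get_loader(file_path: str, ext: str):
    for extensions, (loader_cls, kwargs) in LOADER_TABLE.items():
        if ext in extensions and loader_cls is not None:
            return loader_cls(file_path, **kwargs)
    return None  # unsupported extension or missing optional dependency
```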
@@ -175,7 +207,7 @@ class DocumentEnricher:
"""Load and split one adaptive file by using its local working callback.""" """Load and split one adaptive file by using its local working callback."""
loaded_docs: List[Document] = [] loaded_docs: List[Document] = []
file_hash: str | None = None file_hash: str | None = None
source_identifier = adaptive_file.local_path source_identifier = try_guess_source(adaptive_file.extension)
extension = adaptive_file.extension.lower() extension = adaptive_file.extension.lower()
def process_local_file(local_file_path: str): def process_local_file(local_file_path: str):
@@ -183,7 +215,9 @@ class DocumentEnricher:
             file_hash = self._get_file_hash(local_file_path)
             if self._is_document_hash_processed(file_hash):
-                logger.info(f"Skipping already processed document hash for: {source_identifier}")
+                logger.info(
+                    f"Skipping already processed document hash for: {source_identifier}"
+                )
                 return

             loader = self._get_loader_for_extension(local_file_path)
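
`_get_file_hash` itself is outside this diff; a typical chunked hashlib implementation consistent with how it is called might look like this (an assumption, not the repo's actual body):

```python
import hashlib

def get_file_hash(local_file_path: str) -> str:
    """Hash file contents in blocks so large documents are never fully in memory."""
    digest = hashlib.sha256()
    with open(local_file_path, "rb") as f:
        for block in iter(lambda: f.read(65536), b""):
            digest.update(block)
    return digest.hexdigest()
```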
@@ -216,69 +250,79 @@ class DocumentEnricher:
     def load_and_split_documents(
         self, adaptive_collection: _AdaptiveCollection, recursive: bool = True
-    ) -> Tuple[List[Document], List[Tuple[str, str]]]:
+    ) -> Iterator[Tuple[List[Document], List[Tuple[str, str]]]]:
         """Load documents from adaptive collection and split them appropriately."""
-        all_docs: List[Document] = []
+        docs_chunk: List[Document] = []
         processed_file_records: dict[str, str] = {}

         for adaptive_file in adaptive_collection.iterate(recursive=recursive):
+            if len(processed_file_records) >= 2:
+                yield docs_chunk, list(processed_file_records.items())
+                docs_chunk = []
+                processed_file_records = {}
+
             if adaptive_file.extension.lower() not in SUPPORTED_EXTENSIONS:
                 logger.debug(
                     f"Skipping unsupported file extension for {adaptive_file.filename}: {adaptive_file.extension}"
                 )
                 continue

-            logger.info(f"Processing document: {adaptive_file.local_path}")
+            logger.info(f"Processing document: {adaptive_file.filename}")

             try:
                 split_docs, file_hash = self._load_one_adaptive_file(adaptive_file)
                 if split_docs:
-                    all_docs.extend(split_docs)
+                    docs_chunk.extend(split_docs)
                 if file_hash:
-                    processed_file_records[adaptive_file.local_path] = file_hash
+                    processed_file_records[adaptive_file.filename] = file_hash
             except Exception as e:
-                logger.error(f"Error processing {adaptive_file.local_path}: {str(e)}")
+                logger.error(f"Error processing {adaptive_file.filename}: {str(e)}")
                 continue

-        return all_docs, list(processed_file_records.items())
+        # Flush the final partial chunk so trailing files are not silently dropped
+        if docs_chunk or processed_file_records:
+            yield docs_chunk, list(processed_file_records.items())

     def enrich_and_store(self, adaptive_collection: _AdaptiveCollection):
         """Load, enrich, and store documents in the vector store."""
         logger.info("Starting enrichment process...")

         # Load and split documents
-        documents, processed_file_records = self.load_and_split_documents(
+        for documents, processed_file_records in self.load_and_split_documents(
             adaptive_collection
-        )
-
-        if not documents:
-            logger.info("No new documents to process.")
-            return
-
-        logger.info(f"Loaded and split {len(documents)} document chunks, adding to vector store...")
-
-        # Add documents to vector store
-        try:
-            self.vector_store.add_documents(documents)
-            # Only mark documents as processed after successful insertion to vector store
-            for file_identifier, file_hash in processed_file_records:
-                self._mark_document_processed(file_identifier, file_hash)
-            logger.info(
-                f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_records)} files as processed."
-            )
-        except Exception as e:
-            logger.error(f"Error adding documents to vector store: {str(e)}")
-            raise
+        ):
+            if not documents:
+                logger.info("No new documents in this chunk.")
+                continue
+
+            logger.info(
+                f"Loaded and split {len(documents)} document chunks, adding to vector store..."
+            )
+            logger.debug(
+                f"Documents len: {len(documents)}, processed_file_records len: {len(processed_file_records)}"
+            )
+
+            # Add documents to vector store
+            try:
+                self.vector_store.add_documents(documents)
+                # Only mark documents as processed after successful insertion to vector store
+                for file_identifier, file_hash in processed_file_records:
+                    self._mark_document_processed(file_identifier, file_hash)
+                logger.info(
+                    f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_records)} files as processed."
+                )
+            except Exception as e:
+                logger.error(f"Error adding documents to vector store: {str(e)}")
+                raise
-def get_enrichment_adaptive_collection(
-    data_dir: str = str(DATA_DIR),
-) -> _AdaptiveCollection:
+def get_enrichment_adaptive_collection() -> _AdaptiveCollection:
     """Create adaptive collection based on environment source configuration."""
     source = ENRICHMENT_SOURCE

     if source == "local":
-        local_path = ENRICHMENT_LOCAL_PATH or data_dir
+        local_path = ENRICHMENT_LOCAL_PATH
+        if local_path is None:
+            raise RuntimeError(
+                "Enrichment strategy is local, but no ENRICHMENT_LOCAL_PATH is defined!"
+            )
         logger.info(f"Using local adaptive collection from path: {local_path}")
         return LocalFilesystemAdaptiveCollection(local_path)
@@ -302,11 +346,11 @@ def get_enrichment_adaptive_collection(
     )


-def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
+def run_enrichment_process(vector_store):
     """Run the full enrichment process."""
     logger.info("Starting document enrichment process")

-    adaptive_collection = get_enrichment_adaptive_collection(data_dir=data_dir)
+    adaptive_collection = get_enrichment_adaptive_collection()

     # Initialize the document enricher
     enricher = DocumentEnricher(vector_store)
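
Putting the pieces together, the environment-driven entry point can be exercised roughly as below. `initialize_vector_store` appears in the CLI diff above; the module names here are assumptions:

```python
import os

# Set before importing, since the module reads these at import time via load_dotenv
os.environ["ENRICHMENT_SOURCE"] = "local"
os.environ["ENRICHMENT_LOCAL_PATH"] = "/srv/data"  # must be set, or RuntimeError is raised

from enrichment import run_enrichment_process      # hypothetical module names
from vector_store import initialize_vector_store

store = initialize_vector_store(collection_name="documents_langchain")
run_enrichment_process(store)  # loads and stores documents two files at a time
```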

View File

@@ -115,13 +115,11 @@ def extract_russian_event_names(text: str) -> List[str]:
 class _AdaptiveFile(ABC):
     extension: str  # Format: .jpg
-    local_path: str
     filename: str

-    def __init__(self, filename: str, extension: str, local_path: str):
+    def __init__(self, filename: str, extension: str):
         self.filename = filename
         self.extension = extension
-        self.local_path = local_path

     # This method allows working with the file locally; a callback must be provided.
     # Why a separate method? To allow cleanup after the work is done, and to download the file first if needed.
@@ -139,8 +137,11 @@ class _AdaptiveCollection(ABC):
 class LocalFilesystemAdaptiveFile(_AdaptiveFile):
+    local_path: str
+
     def __init__(self, filename: str, extension: str, local_path: str):
-        super().__init__(filename, extension, local_path)
+        super().__init__(filename, extension)
+        self.local_path = local_path

     def work_with_file_locally(self, func: Callable[[str], None]):
         func(self.local_path)
@@ -171,7 +172,7 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
     remote_path: str

     def __init__(self, filename: str, extension: str, remote_path: str, token: str):
-        super().__init__(filename, extension, remote_path)
+        super().__init__(filename, extension)
         self.token = token
         self.remote_path = remote_path
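
With `local_path` moved off the base class, the abstract contract is just `filename`, `extension`, and `work_with_file_locally`. A minimal sketch of a hypothetical new backend under that contract (for illustration only):

```python
import os
import tempfile
from typing import Callable

class InMemoryAdaptiveFile(_AdaptiveFile):  # hypothetical backend
    def __init__(self, filename: str, extension: str, content: bytes):
        super().__init__(filename, extension)
        self.content = content

    def work_with_file_locally(self, func: Callable[[str], None]):
        # Materialize the bytes to a temp file, run the callback, then clean up.
        with tempfile.NamedTemporaryFile(suffix=self.extension, delete=False) as tmp:
            tmp.write(self.content)
            path = tmp.name
        try:
            func(path)
        finally:
            os.remove(path)
```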